aboutsummaryrefslogtreecommitdiff
path: root/drivers/usb/serial/bus.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/usb/serial/bus.c')
-rw-r--r--drivers/usb/serial/bus.c14
1 files changed, 11 insertions, 3 deletions
diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c
index 35a2373cde6..9374bd2aba2 100644
--- a/drivers/usb/serial/bus.c
+++ b/drivers/usb/serial/bus.c
@@ -97,13 +97,19 @@ static int usb_serial_device_remove(struct device *dev)
struct usb_serial_port *port;
int retval = 0;
int minor;
+ int autopm_err;
port = to_usb_serial_port(dev);
if (!port)
return -ENODEV;
- /* make sure suspend/resume doesn't race against port_remove */
- usb_autopm_get_interface(port->serial->interface);
+ /*
+ * Make sure suspend/resume doesn't race against port_remove.
+ *
+ * Note that no further runtime PM callbacks will be made if
+ * autopm_get fails.
+ */
+ autopm_err = usb_autopm_get_interface(port->serial->interface);
minor = port->minor;
tty_unregister_device(usb_serial_tty_driver, minor);
@@ -117,7 +123,9 @@ static int usb_serial_device_remove(struct device *dev)
dev_info(dev, "%s converter now disconnected from ttyUSB%d\n",
driver->description, minor);
- usb_autopm_put_interface(port->serial->interface);
+ if (!autopm_err)
+ usb_autopm_put_interface(port->serial->interface);
+
return retval;
}
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c311
-rw-r--r--net/netfilter/ipset/ip_set_core.c2011
-rw-r--r--net/netfilter/ipset/ip_set_getport.c174
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h1160
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c315
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c321
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c390
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c402
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c561
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c397
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c610
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c481
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c509
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c587
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c685
-rw-r--r--net/netfilter/ipset/pfxlen.c313
-rw-r--r--net/netfilter/ipvs/Kconfig281
-rw-r--r--net/netfilter/ipvs/Makefile40
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c627
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c1376
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2105
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c3909
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c276
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c211
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c501
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c633
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c818
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c93
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c299
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c142
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c111
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c171
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c409
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c165
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c591
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c712
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c499
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c130
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c255
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c143
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c385
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c1948
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c115
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c270
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1266
-rw-r--r--net/netfilter/nf_conntrack_acct.c133
-rw-r--r--net/netfilter/nf_conntrack_amanda.c241
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c82
-rw-r--r--net/netfilter/nf_conntrack_core.c2539
-rw-r--r--net/netfilter/nf_conntrack_ecache.c261
-rw-r--r--net/netfilter/nf_conntrack_expect.c656
-rw-r--r--net/netfilter/nf_conntrack_extend.c191
-rw-r--r--net/netfilter/nf_conntrack_ftp.c540
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c888
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c1904
-rw-r--r--net/netfilter/nf_conntrack_h323_types.c1922
-rw-r--r--net/netfilter/nf_conntrack_helper.c502
-rw-r--r--net/netfilter/nf_conntrack_irc.c292
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c48
-rw-r--r--net/netfilter/nf_conntrack_labels.c108
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c71
-rw-r--r--net/netfilter/nf_conntrack_netlink.c3296
-rw-r--r--net/netfilter/nf_conntrack_pptp.c619
-rw-r--r--net/netfilter/nf_conntrack_proto.c523
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c1006
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c199
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c447
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c966
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1611
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c336
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c405
-rw-r--r--net/netfilter/nf_conntrack_sane.c237
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c243
-rw-r--r--net/netfilter/nf_conntrack_sip.c1680
-rw-r--r--net/netfilter/nf_conntrack_snmp.c78
-rw-r--r--net/netfilter/nf_conntrack_standalone.c949
-rw-r--r--net/netfilter/nf_conntrack_tftp.c153
-rw-r--r--net/netfilter/nf_conntrack_timeout.c51
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c114
-rw-r--r--net/netfilter/nf_internals.h31
-rw-r--r--net/netfilter/nf_log.c358
-rw-r--r--net/netfilter/nf_nat_amanda.c90
-rw-r--r--net/netfilter/nf_nat_core.c898
-rw-r--r--net/netfilter/nf_nat_ftp.c146
-rw-r--r--net/netfilter/nf_nat_helper.c212
-rw-r--r--net/netfilter/nf_nat_irc.c119
-rw-r--r--net/netfilter/nf_nat_proto_common.c114
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c116
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c96
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c85
-rw-r--r--net/netfilter/nf_nat_proto_udp.c76
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c106
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c54
-rw-r--r--net/netfilter/nf_nat_sip.c653
-rw-r--r--net/netfilter/nf_nat_tftp.c52
-rw-r--r--net/netfilter/nf_queue.c425
-rw-r--r--net/netfilter/nf_sockopt.c151
-rw-r--r--net/netfilter/nf_synproxy_core.c434
-rw-r--r--net/netfilter/nf_tables_api.c4041
-rw-r--r--net/netfilter/nf_tables_core.c271
-rw-r--r--net/netfilter/nf_tables_inet.c104
-rw-r--r--net/netfilter/nfnetlink.c580
-rw-r--r--net/netfilter/nfnetlink_acct.c454
-rw-r--r--net/netfilter/nfnetlink_cthelper.c680
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c585
-rw-r--r--net/netfilter/nfnetlink_log.c965
-rw-r--r--net/netfilter/nfnetlink_queue.c1132
-rw-r--r--net/netfilter/nfnetlink_queue_core.c1352
-rw-r--r--net/netfilter/nfnetlink_queue_ct.c113
-rw-r--r--net/netfilter/nft_bitwise.c146
-rw-r--r--net/netfilter/nft_byteorder.c173
-rw-r--r--net/netfilter/nft_cmp.c223
-rw-r--r--net/netfilter/nft_compat.c793
-rw-r--r--net/netfilter/nft_counter.c113
-rw-r--r--net/netfilter/nft_ct.c421
-rw-r--r--net/netfilter/nft_expr_template.c94
-rw-r--r--net/netfilter/nft_exthdr.c133
-rw-r--r--net/netfilter/nft_hash.c433
-rw-r--r--net/netfilter/nft_immediate.c133
-rw-r--r--net/netfilter/nft_limit.c119
-rw-r--r--net/netfilter/nft_log.c144
-rw-r--r--net/netfilter/nft_lookup.c149
-rw-r--r--net/netfilter/nft_meta.c334
-rw-r--r--net/netfilter/nft_nat.c224
-rw-r--r--net/netfilter/nft_payload.c161
-rw-r--r--net/netfilter/nft_queue.c132
-rw-r--r--net/netfilter/nft_rbtree.c294
-rw-r--r--net/netfilter/nft_reject.c74
-rw-r--r--net/netfilter/nft_reject_inet.c63
-rw-r--r--net/netfilter/x_tables.c1191
-rw-r--r--net/netfilter/xt_AUDIT.c231
-rw-r--r--net/netfilter/xt_CHECKSUM.c70
-rw-r--r--net/netfilter/xt_CLASSIFY.c106
-rw-r--r--net/netfilter/xt_CONNMARK.c141
-rw-r--r--net/netfilter/xt_CONNSECMARK.c143
-rw-r--r--net/netfilter/xt_CT.c444
-rw-r--r--net/netfilter/xt_DSCP.c164
-rw-r--r--net/netfilter/xt_HL.c169
-rw-r--r--net/netfilter/xt_HMARK.c372
-rw-r--r--net/netfilter/xt_IDLETIMER.c315
-rw-r--r--net/netfilter/xt_LED.c215
-rw-r--r--net/netfilter/xt_LOG.c975
-rw-r--r--net/netfilter/xt_MARK.c191
-rw-r--r--net/netfilter/xt_NETMAP.c165
-rw-r--r--net/netfilter/xt_NFLOG.c73
-rw-r--r--net/netfilter/xt_NFQUEUE.c175
-rw-r--r--net/netfilter/xt_NOTRACK.c92
-rw-r--r--net/netfilter/xt_RATEEST.c194
-rw-r--r--net/netfilter/xt_REDIRECT.c190
-rw-r--r--net/netfilter/xt_SECMARK.c147
-rw-r--r--net/netfilter/xt_TCPMSS.c344
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c158
-rw-r--r--net/netfilter/xt_TEE.c309
-rw-r--r--net/netfilter/xt_TPROXY.c599
-rw-r--r--net/netfilter/xt_TRACE.c40
-rw-r--r--net/netfilter/xt_addrtype.c248
-rw-r--r--net/netfilter/xt_bpf.c74
-rw-r--r--net/netfilter/xt_cgroup.c72
-rw-r--r--net/netfilter/xt_cluster.c178
-rw-r--r--net/netfilter/xt_comment.c69
-rw-r--r--net/netfilter/xt_connbytes.c195
-rw-r--r--net/netfilter/xt_connlabel.c99
-rw-r--r--net/netfilter/xt_connlimit.c484
-rw-r--r--net/netfilter/xt_connmark.c193
-rw-r--r--net/netfilter/xt_conntrack.c459
-rw-r--r--net/netfilter/xt_cpu.c65
-rw-r--r--net/netfilter/xt_dccp.c195
-rw-r--r--net/netfilter/xt_devgroup.c82
-rw-r--r--net/netfilter/xt_dscp.c115
-rw-r--r--net/netfilter/xt_ecn.c179
-rw-r--r--net/netfilter/xt_esp.c107
-rw-r--r--net/netfilter/xt_hashlimit.c968
-rw-r--r--net/netfilter/xt_helper.c183
-rw-r--r--net/netfilter/xt_hl.c96
-rw-r--r--net/netfilter/xt_ipcomp.c111
-rw-r--r--net/netfilter/xt_iprange.c140
-rw-r--r--net/netfilter/xt_ipvs.c188
-rw-r--r--net/netfilter/xt_l2tp.c354
-rw-r--r--net/netfilter/xt_length.c93
-rw-r--r--net/netfilter/xt_limit.c187
-rw-r--r--net/netfilter/xt_mac.c94
-rw-r--r--net/netfilter/xt_mark.c107
-rw-r--r--net/netfilter/xt_multiport.c165
-rw-r--r--net/netfilter/xt_nat.c170
-rw-r--r--net/netfilter/xt_nfacct.c79
-rw-r--r--net/netfilter/xt_osf.c427
-rw-r--r--net/netfilter/xt_owner.c100
-rw-r--r--net/netfilter/xt_physdev.c133
-rw-r--r--net/netfilter/xt_pkttype.c81
-rw-r--r--net/netfilter/xt_policy.c188
-rw-r--r--net/netfilter/xt_quota.c90
-rw-r--r--net/netfilter/xt_rateest.c157
-rw-r--r--net/netfilter/xt_realm.c61
-rw-r--r--net/netfilter/xt_recent.c740
-rw-r--r--net/netfilter/xt_repldata.h47
-rw-r--r--net/netfilter/xt_sctp.c268
-rw-r--r--net/netfilter/xt_set.c523
-rw-r--r--net/netfilter/xt_socket.c495
-rw-r--r--net/netfilter/xt_state.c91
-rw-r--r--net/netfilter/xt_statistic.c101
-rw-r--r--net/netfilter/xt_string.c105
-rw-r--r--net/netfilter/xt_tcpmss.c152
-rw-r--r--net/netfilter/xt_tcpudp.c330
-rw-r--r--net/netfilter/xt_time.c291
-rw-r--r--net/netfilter/xt_u32.c123
213 files changed, 81329 insertions, 8945 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a8e5544da93..e9410d17619 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,22 +1,29 @@
menu "Core Netfilter Configuration"
- depends on NET && NETFILTER
+ depends on NET && INET && NETFILTER
config NETFILTER_NETLINK
- tristate "Netfilter netlink interface"
- help
- If this option is enabled, the kernel will include support
- for the new netfilter netlink interface.
+ tristate
+
+config NETFILTER_NETLINK_ACCT
+tristate "Netfilter NFACCT over NFNETLINK interface"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for extended accounting via NFNETLINK.
config NETFILTER_NETLINK_QUEUE
tristate "Netfilter NFQUEUE over NFNETLINK interface"
- depends on NETFILTER_NETLINK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
help
- If this option isenabled, the kernel will include support
+ If this option is enabled, the kernel will include support
for queueing packets via NFNETLINK.
config NETFILTER_NETLINK_LOG
tristate "Netfilter LOG over NFNETLINK interface"
- depends on NETFILTER_NETLINK
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_NETLINK
help
If this option is enabled, the kernel will include support
for logging packets via NFNETLINK.
@@ -26,64 +33,155 @@ config NETFILTER_NETLINK_LOG
and ip6t_LOG modules.
config NF_CONNTRACK
- tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)"
- depends on EXPERIMENTAL && IP_NF_CONNTRACK=n
- default n
- ---help---
+ tristate "Netfilter connection tracking support"
+ default m if NETFILTER_ADVANCED=n
+ help
Connection tracking keeps a record of what packets have passed
through your machine, in order to figure out how they are related
into connections.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
+ This is required to do Masquerading or other kinds of Network
+ Address Translation. It can also be used to enhance packet
+ filtering (see `Connection state match support' below).
To compile it as a module, choose M here. If unsure, say N.
-config NF_CT_ACCT
- bool "Connection tracking flow accounting"
- depends on NF_CONNTRACK
- help
- If this option is enabled, the connection tracking code will
- keep per-flow packet and byte counters.
-
- Those counters can be used for flow-based accounting or the
- `connbytes' match.
-
- If unsure, say `N'.
+if NF_CONNTRACK
config NF_CONNTRACK_MARK
bool 'Connection mark tracking support'
- depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
help
This option enables support for connection marks, used by the
`CONNMARK' target and `connmark' match. Similar to the mark value
of packets, but this mark value is kept in the conntrack session
instead of the individual packets.
+config NF_CONNTRACK_SECMARK
+ bool 'Connection tracking security mark support'
+ depends on NETWORK_SECMARK
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option enables security markings to be applied to
+ connections. Typically they are copied to connections from
+ packets using the CONNSECMARK target and copied back from
+ connections to packets with the same target, with the packets
+ being originally labeled via SECMARK.
+
+ If unsure, say 'N'.
+
+config NF_CONNTRACK_ZONES
+ bool 'Connection tracking zones'
+ depends on NETFILTER_ADVANCED
+ depends on NETFILTER_XT_TARGET_CT
+ help
+ This option enables support for connection tracking zones.
+ Normally, each connection needs to have a unique system wide
+ identity. Connection tracking zones allow to have multiple
+ connections using the same identity, as long as they are
+ contained in different zones.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_PROCFS
+ bool "Supply CT list in procfs (OBSOLETE)"
+ default y
+ depends on PROC_FS
+ ---help---
+ This option enables the list of known conntrack entries
+ to be shown in procfs under net/netfilter/nf_conntrack. This
+ is considered obsolete in favor of using the conntrack(8)
+ tool which uses Netlink.
+
config NF_CONNTRACK_EVENTS
- bool "Connection tracking events (EXPERIMENTAL)"
- depends on EXPERIMENTAL && NF_CONNTRACK
+ bool "Connection tracking events"
+ depends on NETFILTER_ADVANCED
help
If this option is enabled, the connection tracking code will
provide a notifier chain that can be used by other kernel code
- to get notified aboutchanges in the connection tracking state.
+ to get notified about changes in the connection tracking state.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_TIMEOUT
+ bool 'Connection tracking timeout'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timeout
+ extension. This allows you to attach timeout policies to flow
+ via the CT target.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_TIMESTAMP
+ bool 'Connection tracking timestamping'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timestamping.
+ This allows you to store the flow start-time and to obtain
+ the flow-stop time (once it has been destroyed) via Connection
+ tracking events.
If unsure, say `N'.
+config NF_CONNTRACK_LABELS
+ bool
+ help
+ This option enables support for assigning user-defined flag bits
+ to connection tracking entries. It is selected by the connlabel match.
+
+config NF_CT_PROTO_DCCP
+ tristate 'DCCP protocol connection tracking support'
+ depends on NETFILTER_ADVANCED
+ default IP_DCCP
+ help
+ With this option enabled, the layer 3 independent connection
+ tracking code will be able to do state tracking on DCCP connections.
+
+ If unsure, say 'N'.
+
+config NF_CT_PROTO_GRE
+ tristate
+
config NF_CT_PROTO_SCTP
- tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)'
- depends on EXPERIMENTAL && NF_CONNTRACK
- default n
+ tristate 'SCTP protocol connection tracking support'
+ depends on NETFILTER_ADVANCED
+ default IP_SCTP
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
If you want to compile it as a module, say M here and read
- Documentation/modules.txt. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NF_CT_PROTO_UDPLITE
+ tristate 'UDP-Lite protocol connection tracking support'
+ depends on NETFILTER_ADVANCED
+ help
+ With this option enabled, the layer 3 independent connection
+ tracking code will be able to do state tracking on UDP-Lite
+ connections.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_AMANDA
+ tristate "Amanda backup protocol support"
+ depends on NETFILTER_ADVANCED
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ help
+ If you are running the Amanda backup package <http://www.amanda.org/>
+ on this machine or machines that will be MASQUERADED through this
+ machine, then you may want to enable this feature. This allows the
+ connection tracking and natting code to allow the sub-channels that
+ Amanda requires for communication of the backup data, messages and
+ index.
+
+ To compile it as a module, choose M here. If unsure, say N.
config NF_CONNTRACK_FTP
- tristate "FTP support on new connection tracking (EXPERIMENTAL)"
- depends on EXPERIMENTAL && NF_CONNTRACK
+ tristate "FTP protocol support"
+ default m if NETFILTER_ADVANCED=n
help
Tracking FTP connections is problematic: special helpers are
required for tracking them, and doing masquerading and other forms
@@ -95,24 +193,430 @@ config NF_CONNTRACK_FTP
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_H323
+ tristate "H.323 protocol support"
+ depends on (IPV6 || IPV6=n)
+ depends on NETFILTER_ADVANCED
+ help
+ H.323 is a VoIP signalling protocol from ITU-T. As one of the most
+ important VoIP protocols, it is widely used by voice hardware and
+ software including voice gateways, IP phones, Netmeeting, OpenPhone,
+ Gnomemeeting, etc.
+
+ With this module you can support H.323 on a connection tracking/NAT
+ firewall.
+
+ This module supports RAS, Fast Start, H.245 Tunnelling, Call
+ Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+ whiteboard, file transfer, etc. For more information, please
+ visit http://nath323.sourceforge.net/.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_IRC
+ tristate "IRC protocol support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ There is a commonly-used extension to IRC called
+ Direct Client-to-Client Protocol (DCC). This enables users to send
+ files to each other, and also chat to each other without the need
+ of a server. DCC Sending is used anywhere you send files over IRC,
+ and DCC Chat is most commonly used by Eggdrop bots. If you are
+ using NAT, this extension will enable you to send files and initiate
+ chats. Note that you do NOT need this extension to get files or
+ have others initiate chats, or everything else in IRC.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_BROADCAST
+ tristate
+
+config NF_CONNTRACK_NETBIOS_NS
+ tristate "NetBIOS name service protocol support"
+ select NF_CONNTRACK_BROADCAST
+ help
+ NetBIOS name service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This makes them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating NetBIOS name service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address. When properly configured, the output
+ of "ip address show" should look similar to this:
+
+ $ ip -4 address show eth0
+ 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+ inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SNMP
+ tristate "SNMP service protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
+ help
+ SNMP service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This makes them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating SNMP service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_PPTP
+ tristate "PPtP protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CT_PROTO_GRE
+ help
+ This module adds support for PPTP (Point to Point Tunnelling
+ Protocol, RFC2637) connection tracking and NAT.
+
+ If you are running PPTP sessions over a stateful firewall or NAT
+ box, you may want to enable this feature.
+
+ Please note that not all PPTP modes of operation are supported yet.
+ Specifically these limitations exist:
+ - Blindly assumes that control connections are always established
+ in PNS->PAC direction. This is a violation of RFC2637.
+ - Only supports a single call within each session
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SANE
+ tristate "SANE protocol support"
+ depends on NETFILTER_ADVANCED
+ help
+ SANE is a protocol for remote access to scanners as implemented
+ by the 'saned' daemon. Like FTP, it uses separate control and
+ data connections.
+
+ With this module you can support SANE on a connection tracking
+ firewall.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SIP
+ tristate "SIP protocol support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ SIP is an application-layer control protocol that can establish,
+ modify, and terminate multimedia sessions (conferences) such as
+ Internet telephony calls. With the ip_conntrack_sip and
+ the nf_nat_sip modules you can support the protocol on a connection
+ tracking/NATing firewall.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_TFTP
+ tristate "TFTP protocol support"
+ depends on NETFILTER_ADVANCED
+ help
+ TFTP connection tracking helper, this is required depending
+ on how restrictive your ruleset is.
+ If you are using a tftp client behind -j SNAT or -j MASQUERADE
+ you will need this.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NF_CT_NETLINK
- tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
- depends on EXPERIMENTAL && NF_CONNTRACK && NETFILTER_NETLINK
- depends on NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
+ tristate 'Connection tracking netlink interface'
+ select NETFILTER_NETLINK
+ default m if NETFILTER_ADVANCED=n
help
This option enables support for a netlink-based userspace interface
+config NF_CT_NETLINK_TIMEOUT
+ tristate 'Connection tracking timeout tuning via Netlink'
+ select NETFILTER_NETLINK
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timeout
+ fine-grain tuning. This allows you to attach specific timeout
+ policies to flows, instead of using the global timeout policy.
+
+ If unsure, say `N'.
+
+config NF_CT_NETLINK_HELPER
+ tristate 'Connection tracking helpers in user-space via Netlink'
+ select NETFILTER_NETLINK
+ depends on NF_CT_NETLINK
+ depends on NETFILTER_NETLINK_QUEUE
+ depends on NETFILTER_NETLINK_QUEUE_CT
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables the user-space connection tracking helpers
+ infrastructure.
+
+ If unsure, say `N'.
+
+config NETFILTER_NETLINK_QUEUE_CT
+ bool "NFQUEUE integration with Connection Tracking"
+ default n
+ depends on NETFILTER_NETLINK_QUEUE
+ help
+ If this option is enabled, NFQUEUE can include Connection Tracking
+ information together with the packet that is enqueued via NFNETLINK.
+
+config NF_NAT
+ tristate
+
+config NF_NAT_NEEDED
+ bool
+ depends on NF_NAT
+ default y
+
+config NF_NAT_PROTO_DCCP
+ tristate
+ depends on NF_NAT && NF_CT_PROTO_DCCP
+ default NF_NAT && NF_CT_PROTO_DCCP
+
+config NF_NAT_PROTO_UDPLITE
+ tristate
+ depends on NF_NAT && NF_CT_PROTO_UDPLITE
+ default NF_NAT && NF_CT_PROTO_UDPLITE
+
+config NF_NAT_PROTO_SCTP
+ tristate
+ default NF_NAT && NF_CT_PROTO_SCTP
+ depends on NF_NAT && NF_CT_PROTO_SCTP
+ select LIBCRC32C
+
+config NF_NAT_AMANDA
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_AMANDA
+
+config NF_NAT_FTP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_FTP
+
+config NF_NAT_IRC
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_IRC
+
+config NF_NAT_SIP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_SIP
+
+config NF_NAT_TFTP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_TFTP
+
+config NETFILTER_SYNPROXY
+ tristate
+
+endif # NF_CONNTRACK
+
+config NF_TABLES
+ select NETFILTER_NETLINK
+ tristate "Netfilter nf_tables support"
+ help
+ nftables is the new packet classification framework that intends to
+ replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
+ provides a pseudo-state machine with an extensible instruction-set
+ (also known as expressions) that the userspace 'nft' utility
+ (http://www.netfilter.org/projects/nftables) uses to build the
+ rule-set. It also comes with the generic set infrastructure that
+ allows you to construct mappings between matchings and actions
+ for performance lookups.
+
+ To compile it as a module, choose M here.
+
+config NF_TABLES_INET
+ depends on NF_TABLES && IPV6
+ select NF_TABLES_IPV4
+ select NF_TABLES_IPV6
+ tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support"
+ help
+ This option enables support for a mixed IPv4/IPv6 "inet" table.
+
+config NFT_EXTHDR
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables IPv6 exthdr module"
+ help
+ This option adds the "exthdr" expression that you can use to match
+ IPv6 extension headers.
+
+config NFT_META
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables meta module"
+ help
+ This option adds the "meta" expression that you can use to match and
+ to set packet metainformation such as the packet mark.
+
+config NFT_CT
+ depends on NF_TABLES
+ depends on NF_CONNTRACK
+ tristate "Netfilter nf_tables conntrack module"
+ help
+ This option adds the "ct" expression that you can use to match
+ connection tracking information such as the flow state.
+
+config NFT_RBTREE
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables rbtree set module"
+ help
+ This option adds the "rbtree" set type (Red Black tree) that is used
+ to build interval-based sets.
+
+config NFT_HASH
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables hash set module"
+ help
+ This option adds the "hash" set type that is used to build one-way
+ mappings between matchings and actions.
+
+config NFT_COUNTER
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables counter module"
+ help
+ This option adds the "counter" expression that you can use to
+ include packet and byte counters in a rule.
+
+config NFT_LOG
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables log module"
+ help
+ This option adds the "log" expression that you can use to log
+ packets matching some criteria.
+
+config NFT_LIMIT
+ depends on NF_TABLES
+ tristate "Netfilter nf_tables limit module"
+ help
+ This option adds the "limit" expression that you can use to
+ ratelimit rule matchings.
+
+config NFT_NAT
+ depends on NF_TABLES
+ depends on NF_CONNTRACK
+ depends on NF_NAT
+ tristate "Netfilter nf_tables nat module"
+ help
+ This option adds the "nat" expression that you can use to perform
+ typical Network Address Translation (NAT) packet transformations.
+
+config NFT_QUEUE
+ depends on NF_TABLES
+ depends on NETFILTER_XTABLES
+ depends on NETFILTER_NETLINK_QUEUE
+ tristate "Netfilter nf_tables queue module"
+ help
+ This is required if you intend to use the userspace queueing
+ infrastructure (also known as NFQUEUE) from nftables.
+
+config NFT_REJECT
+ depends on NF_TABLES
+ default m if NETFILTER_ADVANCED=n
+ tristate "Netfilter nf_tables reject support"
+ help
+ This option adds the "reject" expression that you can use to
+ explicitly deny and notify via TCP reset/ICMP informational errors
+ unallowed traffic.
+
+config NFT_REJECT_INET
+ depends on NF_TABLES_INET
+ default NFT_REJECT
+ tristate
+
+config NFT_COMPAT
+ depends on NF_TABLES
+ depends on NETFILTER_XTABLES
+ tristate "Netfilter x_tables over nf_tables module"
+ help
+ This is required if you intend to use any of existing
+ x_tables match/target extensions over the nf_tables
+ framework.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
+ default m if NETFILTER_ADVANCED=n
help
This is required if you intend to use any of ip_tables,
ip6_tables or arp_tables.
+if NETFILTER_XTABLES
+
+comment "Xtables combined modules"
+
+config NETFILTER_XT_MARK
+ tristate 'nfmark target and match support'
+ default m if NETFILTER_ADVANCED=n
+ ---help---
+ This option adds the "MARK" target and "mark" match.
+
+ Netfilter mark matching allows you to match packets based on the
+ "nfmark" value in the packet.
+ The target allows you to create rules in the "mangle" table which alter
+ the netfilter mark (nfmark) field associated with the packet.
+
+ Prior to routing, the nfmark can influence the routing method (see
+ "Use netfilter MARK value as routing key") and can also be used by
+ other subsystems to change their behavior.
+
+config NETFILTER_XT_CONNMARK
+ tristate 'ctmark target and match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_MARK
+ ---help---
+ This option adds the "CONNMARK" target and "connmark" match.
+
+ Netfilter allows you to store a mark value per connection (a.k.a.
+ ctmark), similarly to the packet mark (nfmark). Using this
+ target and match, you can set and match on this mark.
+
+config NETFILTER_XT_SET
+ tristate 'set target and match support'
+ depends on IP_SET
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds the "SET" target and "set" match.
+
+ Using this target and match, you can add/delete and match
+ elements in the sets created by ipset(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# alphabetically ordered list of targets
+comment "Xtables targets"
+
+config NETFILTER_XT_TARGET_AUDIT
+ tristate "AUDIT target support"
+ depends on AUDIT
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a 'AUDIT' target, which can be used to create
+ audit records for packets dropped/accepted.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CHECKSUM
+ tristate "CHECKSUM target support"
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+ table.
+
+ You can use this target to compute and fill in the checksum in
+ a packet that lacks a checksum. This is particularly useful,
+ if you need to work around old applications such as dhcp clients,
+ that do not work well with checksum offloads, but don't want to disable
+ checksum offload in your device.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_CLASSIFY
tristate '"CLASSIFY" target support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
This option adds a `CLASSIFY' target, which enables the user to set
the priority of a packet. Some qdiscs can use this value for
@@ -124,36 +628,164 @@ config NETFILTER_XT_TARGET_CLASSIFY
config NETFILTER_XT_TARGET_CONNMARK
tristate '"CONNMARK" target support'
- depends on NETFILTER_XTABLES
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_CONNMARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_TARGET_CONNSECMARK
+ tristate '"CONNSECMARK" target support'
+ depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK
+ default m if NETFILTER_ADVANCED=n
+ help
+ The CONNSECMARK target copies security markings from packets
+ to connections, and restores security markings from connections
+ to packets (if the packets are not already marked). This would
+ normally be used in conjunction with the SECMARK target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CT
+ tristate '"CT" target support'
+ depends on NF_CONNTRACK
+ depends on IP_NF_RAW || IP6_NF_RAW
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `CT' target, which allows you to specify initial
+ connection tracking parameters like events to be delivered and
+ the helper to be used.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_DSCP
+ tristate '"DSCP" and "TOS" target support'
depends on IP_NF_MANGLE || IP6_NF_MANGLE
- depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK)
+ depends on NETFILTER_ADVANCED
help
- This option adds a `CONNMARK' target, which allows one to manipulate
- the connection mark value. Similar to the MARK target, but
- affects the connection mark value rather than the packet mark value.
-
- If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. The module will be called
- ipt_CONNMARK.o. If unsure, say `N'.
+ This option adds a `DSCP' target, which allows you to manipulate
+ the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+ The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+ It also adds the "TOS" target, which allows you to create rules in
+ the "mangle" table which alter the Type Of Service field of an IPv4
+ or the Priority field of an IPv6 packet, prior to routing.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_HL
+ tristate '"HL" hoplimit target support'
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
+ targets, which enable the user to change the
+ hoplimit/time-to-live value of the IP header.
+
+ While it is safe to decrement the hoplimit/TTL value, the
+ modules also allow to increment and set the hoplimit value of
+ the header to arbitrary values. This is EXTREMELY DANGEROUS
+ since you can easily create immortal packets that loop
+ forever on the network.
+
+config NETFILTER_XT_TARGET_HMARK
+ tristate '"HMARK" target support'
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds the "HMARK" target.
+
+ The target allows you to create rules in the "raw" and "mangle" tables
+ which set the skbuff mark by means of hash calculation within a given
+ range. The nfmark can influence the routing method (see "Use netfilter
+ MARK value as routing key") and can also be used by other subsystems to
+ change their behaviour.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_IDLETIMER
+ tristate "IDLETIMER target support"
+ depends on NETFILTER_ADVANCED
+ help
+
+ This option adds the `IDLETIMER' target. Each matching packet
+ resets the timer associated with label specified when the rule is
+ added. When the timer expires, it triggers a sysfs notification.
+ The remaining time for expiration can be read via sysfs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_LED
+ tristate '"LED" target support'
+ depends on LEDS_CLASS && LEDS_TRIGGERS
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `LED' target, which allows you to blink LEDs in
+ response to particular packets passing through your machine.
+
+ This can be used to turn a spare LED into a network activity LED,
+ which only flashes in response to FTP transfers, for example. Or
+ you could have an LED which lights up for a minute or two every time
+ somebody connects to your machine via SSH.
+
+ You will need support for the "led" class to make this work.
+
+ To create an LED trigger for incoming SSH traffic:
+ iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000
+
+ Then attach the new trigger to an LED on your system:
+ echo netfilter-ssh > /sys/class/leds/<ledname>/trigger
+
+ For more information on the LEDs available on your system, see
+ Documentation/leds/leds-class.txt
+
+config NETFILTER_XT_TARGET_LOG
+ tristate "LOG target support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option adds a `LOG' target, which allows you to create rules in
+ any iptables table which records the packet header to the syslog.
+
+ To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_MARK
tristate '"MARK" target support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+
+config NETFILTER_XT_TARGET_NETMAP
+ tristate '"NETMAP" target support'
+ depends on NF_NAT
+ ---help---
+ NETMAP is an implementation of static 1:1 NAT mapping of network
+ addresses. It maps the network address part, while keeping the host
+ address part intact.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_NFLOG
+ tristate '"NFLOG" target support'
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_NETLINK_LOG
help
- This option adds a `MARK' target, which allows you to create rules
- in the `mangle' table which alter the netfilter mark (nfmark) field
- associated with the packet prior to routing. This can change
- the routing method (see `Use netfilter MARK value as routing
- key') and can also be used by other subsystems to change their
- behavior.
+ This option enables the NFLOG target, which allows to LOG
+ messages through nfnetlink_log.
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_NFQUEUE
tristate '"NFQUEUE" target Support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_QUEUE
help
- This Target replaced the old obsolete QUEUE target.
+ This target replaced the old obsolete QUEUE target.
As opposed to QUEUE, it supports 65535 different queues,
not just one.
@@ -161,56 +793,222 @@ config NETFILTER_XT_TARGET_NFQUEUE
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_TARGET_NOTRACK
- tristate '"NOTRACK" target support'
+ tristate '"NOTRACK" target support (DEPRECATED)'
+ depends on NF_CONNTRACK
+ depends on IP_NF_RAW || IP6_NF_RAW
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_CT
+
+config NETFILTER_XT_TARGET_RATEEST
+ tristate '"RATEEST" target support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `RATEEST' target, which allows to measure
+ rates similar to TC estimators. The `rateest' match can be
+ used to match on the measured rates.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on NF_NAT
+ ---help---
+ REDIRECT is a special case of NAT: all incoming connections are
+ mapped onto the incoming interface's address, causing the packets to
+ come to the local machine instead of passing through. This is
+ useful for transparent proxies.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TEE
+ tristate '"TEE" - packet cloning to alternate destination'
+ depends on NETFILTER_ADVANCED
+ depends on (IPV6 || IPV6=n)
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ ---help---
+ This option adds a "TEE" target with which a packet can be cloned and
+ this clone be rerouted to another nexthop.
+
+config NETFILTER_XT_TARGET_TPROXY
+ tristate '"TPROXY" target transparent proxying support'
depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ help
+ This option adds a `TPROXY' target, which is somewhat similar to
+ REDIRECT. It can only be used in the mangle table and is useful
+ to redirect traffic to a transparent proxy. It does _not_ depend
+ on Netfilter connection tracking and NAT, unlike REDIRECT.
+ For it to work you will have to configure certain iptables rules
+ and use policy routing. For more information on how to set it up
+ see Documentation/networking/tproxy.txt.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TRACE
+ tristate '"TRACE" target support'
depends on IP_NF_RAW || IP6_NF_RAW
- depends on IP_NF_CONNTRACK || NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ help
+ The TRACE target allows you to mark packets so that the kernel
+ will log every rule which matches the packets as those traverse
+ the tables, chains, rules.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_TARGET_SECMARK
+ tristate '"SECMARK" target support'
+ depends on NETWORK_SECMARK
+ default m if NETFILTER_ADVANCED=n
+ help
+ The SECMARK target allows security marking of network
+ packets, for use with security subsystems.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TCPMSS
+ tristate '"TCPMSS" target support'
+ depends on (IPV6 || IPV6=n)
+ default m if NETFILTER_ADVANCED=n
+ ---help---
+ This option adds a `TCPMSS' target, which allows you to alter the
+ MSS value of TCP SYN packets, to control the maximum size for that
+ connection (usually limiting it to your outgoing interface's MTU
+ minus 40).
+
+ This is used to overcome criminally braindead ISPs or servers which
+ block ICMP Fragmentation Needed packets. The symptoms of this
+ problem are that everything works fine from your Linux
+ firewall/router, but machines behind it can never exchange large
+ packets:
+ 1) Web browsers connect, then hang with no data received.
+ 2) Small mail works fine, but large emails hang.
+ 3) ssh works fine, but scp hangs after initial handshaking.
+
+ Workaround: activate this option and add a rule to your firewall
+ configuration like:
+
+ iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
+ -j TCPMSS --clamp-mss-to-pmtu
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TCPOPTSTRIP
+ tristate '"TCPOPTSTRIP" target support'
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
help
- The NOTRACK target allows a select rule to specify
- which packets *not* to enter the conntrack/NAT
- subsystem with all the consequences (no ICMP error tracking,
- no protocol helpers for the selected packets).
-
+ This option adds a "TCPOPTSTRIP" target, which allows you to strip
+ TCP options from TCP packets.
+
+# alphabetically ordered list of matches
+
+comment "Xtables matches"
+
+config NETFILTER_XT_MATCH_ADDRTYPE
+ tristate '"addrtype" address type match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option allows you to match what routing thinks of an address,
+ eg. UNICAST, LOCAL, BROADCAST, ...
+
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_BPF
+ tristate '"bpf" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ BPF matching applies a linux socket filter to each packet and
+ accepts those for which the filter returns non-zero.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_CGROUP
+ tristate '"control group" match support'
+ depends on NETFILTER_ADVANCED
+ depends on CGROUPS
+ select CGROUP_NET_CLASSID
+ ---help---
+ Socket/process control group matching allows you to match locally
+ generated packets based on which net_cls control group processes
+ belong to.
+
+config NETFILTER_XT_MATCH_CLUSTER
+ tristate '"cluster" match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option allows you to build work-load-sharing clusters of
+ network servers/stateful firewalls without having a dedicated
+ load-balancing router/server/switch. Basically, this match returns
+ true when the packet must be handled by this cluster node. Thus,
+ all nodes see all packets and this match decides which node handles
+ what packets. The work-load sharing algorithm is based on source
+ address hashing.
+
+ If you say Y or M here, try `iptables -m cluster --help` for
+ more information.
config NETFILTER_XT_MATCH_COMMENT
tristate '"comment" match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
This option adds a `comment' dummy-match, which allows you to put
comments in your iptables ruleset.
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
config NETFILTER_XT_MATCH_CONNBYTES
tristate '"connbytes" per-connection counter match support'
- depends on NETFILTER_XTABLES
- depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK)
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
help
This option adds a `connbytes' match, which allows you to match the
number of bytes and/or packets for each direction within a connection.
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNLABEL
+ tristate '"connlabel" match support'
+ select NF_CONNTRACK_LABELS
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match allows you to test and assign userspace-defined label names
+ to a connection. The kernel only stores bit values - mapping
+ names to bits is done by userspace.
+
+ Unlike connmark, more than 32 flag bits may be assigned to a
+ connection simultaneously.
+
+config NETFILTER_XT_MATCH_CONNLIMIT
+ tristate '"connlimit" match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match allows you to match against the number of parallel
+ connections to a server per client IP address (or address block).
config NETFILTER_XT_MATCH_CONNMARK
tristate '"connmark" connection mark match support'
- depends on NETFILTER_XTABLES
- depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK)
- help
- This option adds a `connmark' match, which allows you to match the
- connection mark value previously set for the session by `CONNMARK'.
-
- If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. The module will be called
- ipt_connmark.o. If unsure, say `N'.
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_CONNMARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
config NETFILTER_XT_MATCH_CONNTRACK
tristate '"conntrack" connection tracking match support'
- depends on NETFILTER_XTABLES
- depends on IP_NF_CONNTRACK || NF_CONNTRACK
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
help
This is a general conntrack match module, a superset of the state match.
@@ -220,30 +1018,144 @@ config NETFILTER_XT_MATCH_CONNTRACK
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_CPU
+ tristate '"cpu" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ CPU matching allows you to match packets based on the CPU
+ currently handling the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_DCCP
- tristate '"DCCP" protocol match support'
- depends on NETFILTER_XTABLES
+ tristate '"dccp" protocol match support'
+ depends on NETFILTER_ADVANCED
+ default IP_DCCP
help
With this option enabled, you will be able to use the iptables
`dccp' match in order to match on DCCP source/destination ports
and DCCP flags.
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_DEVGROUP
+ tristate '"devgroup" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `devgroup' match, which allows you to match on the
+ device group a network device is assigned to.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_DSCP
+ tristate '"dscp" and "tos" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `DSCP' match, which allows you to match against
+ the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+ The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+ It will also add a "tos" match, which allows you to match packets
+ based on the Type Of Service fields of the IPv4 packet (which share
+ the same bits as DSCP).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_ECN
+ tristate '"ecn" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds an "ECN" match, which allows you to match against
+ the IPv4 and TCP header ECN fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_ESP
+ tristate '"esp" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of SPIs
+ inside ESP header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_HASHLIMIT
+ tristate '"hashlimit" match support'
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `hashlimit' match.
+
+ As opposed to `limit', this match dynamically creates a hash table
+ of limit buckets, based on your selection of source/destination
+ addresses and/or ports.
+
+ It enables you to express policies like `10kpps for any given
+ destination address' or `500pps from any given source address'
+ with a single rule.
config NETFILTER_XT_MATCH_HELPER
tristate '"helper" match support'
- depends on NETFILTER_XTABLES
- depends on IP_NF_CONNTRACK || NF_CONNTRACK
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
help
Helper matching allows you to match packets in dynamic connections
tracked by a conntrack-helper, ie. ip_conntrack_ftp
To compile it as a module, choose M here. If unsure, say Y.
+config NETFILTER_XT_MATCH_HL
+ tristate '"hl" hoplimit/TTL match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ HL matching allows you to match packets based on the hoplimit
+ in the IPv6 header, or the time-to-live field in the IPv4
+ header of the packet.
+
+config NETFILTER_XT_MATCH_IPCOMP
+ tristate '"ipcomp" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of CPIs(16 bits)
+ inside IPComp header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_IPRANGE
+ tristate '"iprange" address range match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a "iprange" match, which allows you to match based on
+ an IP address range. (Normal iptables only matches on single addresses
+ with an optional mask.)
+
+ If unsure, say M.
+
+config NETFILTER_XT_MATCH_IPVS
+ tristate '"ipvs" match support'
+ depends on IP_VS
+ depends on NETFILTER_ADVANCED
+ depends on NF_CONNTRACK
+ help
+ This option allows you to match against IPVS properties of a packet.
+
+ If unsure, say N.
+
+config NETFILTER_XT_MATCH_L2TP
+ tristate '"l2tp" match support'
+ depends on NETFILTER_ADVANCED
+ default L2TP
+ ---help---
+ This option adds an "L2TP" match, which allows you to match against
+ L2TP protocol header fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_LENGTH
tristate '"length" match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
This option allows you to match the length of a packet against a
specific value or range of values.
@@ -252,7 +1164,7 @@ config NETFILTER_XT_MATCH_LENGTH
config NETFILTER_XT_MATCH_LIMIT
tristate '"limit" match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
limit matching allows you to control the rate at which a rule can be
matched: mainly useful in combination with the LOG target ("LOG
@@ -262,7 +1174,7 @@ config NETFILTER_XT_MATCH_LIMIT
config NETFILTER_XT_MATCH_MAC
tristate '"mac" address match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
MAC matching allows you to match packets based on the source
Ethernet address of the packet.
@@ -271,17 +1183,69 @@ config NETFILTER_XT_MATCH_MAC
config NETFILTER_XT_MATCH_MARK
tristate '"mark" match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+
+config NETFILTER_XT_MATCH_MULTIPORT
+ tristate '"multiport" Multiple port match support'
+ depends on NETFILTER_ADVANCED
+ help
+ Multiport matching allows you to match TCP or UDP packets based on
+ a series of source or destination ports: normally a rule can only
+ match a single range of ports.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_NFACCT
+ tristate '"nfacct" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_ACCT
+ help
+ This option allows you to use the extended accounting through
+ nfnetlink_acct.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_OSF
+ tristate '"osf" Passive OS fingerprint match'
+ depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+ help
+ This option selects the Passive OS Fingerprinting match module
+ that allows to passively match the remote operating system by
+ analyzing incoming TCP SYN packets.
+
+ Rules and loading software can be downloaded from
+ http://www.ioremap.net/projects/osf
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_OWNER
+ tristate '"owner" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ Socket owner matching allows you to match locally-generated packets
+ based on who created the socket: the user or group. It is also
+ possible to check whether a socket actually exists.
+
+config NETFILTER_XT_MATCH_POLICY
+ tristate 'IPsec "policy" match support'
+ depends on XFRM
+ default m if NETFILTER_ADVANCED=n
help
- Netfilter mark matching allows you to match packets based on the
- `nfmark' value in the packet. This can be set by the MARK target
- (see below).
+ Policy matching allows you to match packets based on the
+ IPsec policy that was used during decapsulation/will
+ be used during encapsulation.
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_MATCH_PHYSDEV
tristate '"physdev" match support'
- depends on NETFILTER_XTABLES && BRIDGE_NETFILTER
+ depends on BRIDGE && BRIDGE_NETFILTER
+ depends on NETFILTER_ADVANCED
help
Physdev packet matching matches against the physical bridge ports
the IP packet arrived on or will leave by.
@@ -290,7 +1254,7 @@ config NETFILTER_XT_MATCH_PHYSDEV
config NETFILTER_XT_MATCH_PKTTYPE
tristate '"pkttype" packet type match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
Packet type matching allows you to match a packet by
its "class", eg. BROADCAST, MULTICAST, ...
@@ -300,35 +1264,82 @@ config NETFILTER_XT_MATCH_PKTTYPE
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_QUOTA
+ tristate '"quota" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `quota' match, which allows to match on a
+ byte counter.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_RATEEST
+ tristate '"rateest" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_RATEEST
+ help
+ This option adds a `rateest' match, which allows to match on the
+ rate estimated by the RATEEST target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_REALM
tristate '"realm" match support'
- depends on NETFILTER_XTABLES
- select NET_CLS_ROUTE
+ depends on NETFILTER_ADVANCED
+ select IP_ROUTE_CLASSID
help
This option adds a `realm' match, which allows you to use the realm
key from the routing subsystem inside iptables.
-
+
This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
in tc world.
-
+
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_RECENT
+ tristate '"recent" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match is used for creating one or many lists of recently
+ used addresses and then matching against that/those list(s).
+
+ Short options are available by using 'iptables -m recent -h'
+ Official Website: <http://snowman.net/projects/ipt_recent/>
config NETFILTER_XT_MATCH_SCTP
tristate '"sctp" protocol match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ default IP_SCTP
help
With this option enabled, you will be able to use the
`sctp' match in order to match on SCTP source/destination ports
and SCTP chunk types.
If you want to compile it as a module, say M here and read
- <file:Documentation/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_SOCKET
+ tristate '"socket" match support'
+ depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ depends on (IPV6 || IPV6=n)
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ help
+ This option adds a `socket' match, which can be used to match
+ packets for which a TCP or UDP socket lookup finds a valid socket.
+ It can be used in combination with the MARK target and policy
+ routing to implement full featured non-locally bound sockets.
+
+ To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_MATCH_STATE
tristate '"state" match support'
- depends on NETFILTER_XTABLES
- depends on IP_NF_CONNTRACK || NF_CONNTRACK
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
help
Connection state matching allows you to match packets based on their
relationship to a tracked connection (ie. previous packets). This
@@ -336,9 +1347,18 @@ config NETFILTER_XT_MATCH_STATE
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_STATISTIC
+ tristate '"statistic" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `statistic' match, which allows you to match
+ on packets periodically or randomly with a given percentage.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_STRING
tristate '"string" match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
select TEXTSEARCH
select TEXTSEARCH_KMP
select TEXTSEARCH_BM
@@ -351,7 +1371,7 @@ config NETFILTER_XT_MATCH_STRING
config NETFILTER_XT_MATCH_TCPMSS
tristate '"tcpmss" match support'
- depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
This option adds a `tcpmss' match, which allows you to examine the
MSS value of TCP SYN packets, which control the maximum packet size
@@ -359,5 +1379,37 @@ config NETFILTER_XT_MATCH_TCPMSS
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_TIME
+ tristate '"time" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a "time" match, which allows you to match based on
+ the packet arrival time (at the machine which netfilter is running
+ on) or departure time/date (for locally generated packets).
+
+ If you say Y here, try `iptables -m time --help` for
+ more information.
+
+ If you want to compile it as a module, say M here.
+ If unsure, say N.
+
+config NETFILTER_XT_MATCH_U32
+ tristate '"u32" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ u32 allows you to extract quantities of up to 4 bytes from a packet,
+ AND them with specified masks, shift them by specified amounts and
+ test whether the results are in any of a set of specified ranges.
+ The specification of what to extract is general enough to skip over
+ headers with lengths stored in the packet, as in IP or TCP header
+ lengths.
+
+ Details and examples are in the kernel module source.
+
+endif # NETFILTER_XTABLES
+
endmenu
+source "net/netfilter/ipset/Kconfig"
+
+source "net/netfilter/ipvs/Kconfig"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 746172ebc91..bffdad774da 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,9 +1,17 @@
netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
-nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o
+
+nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
obj-$(CONFIG_NETFILTER) = netfilter.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
+nfnetlink_queue-y := nfnetlink_queue_core.o
+nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
@@ -11,39 +19,153 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
# SCTP protocol connection tracking
+obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
# netlink interface for nf_conntrack
obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
+obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o
+obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o
# connection tracking helpers
+nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o
+
+obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
+obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
+obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
+obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
+obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
+obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
+obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
+obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
+
+nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
+ nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
+
+obj-$(CONFIG_NF_NAT) += nf_nat.o
+
+# NAT protocols (nf_nat)
+obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
+obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+
+# NAT helpers
+obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
+obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
+obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
+obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
+obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
+
+# SYNPROXY
+obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
+
+# nf_tables
+nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
+nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+
+obj-$(CONFIG_NF_TABLES) += nf_tables.o
+obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
+obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
+obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
+obj-$(CONFIG_NFT_META) += nft_meta.o
+obj-$(CONFIG_NFT_CT) += nft_ct.o
+obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
+obj-$(CONFIG_NFT_NAT) += nft_nat.o
+obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
+obj-$(CONFIG_NFT_REJECT) += nft_reject.o
+obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o
+obj-$(CONFIG_NFT_HASH) += nft_hash.o
+obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
+obj-$(CONFIG_NFT_LOG) += nft_log.o
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
+# combos
+obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
+obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
+obj-$(CONFIG_NF_NAT) += xt_nat.o
+
# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
-obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
-obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
-obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
# matches
+obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
-obj-$(CONFIG_NETFILTER_XT_MATCH_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ECN) += xt_ecn.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
-obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o
obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
-obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+
+# ipset
+obj-$(CONFIG_IP_SET) += ipset/
+
+# IPVS
+obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 1ceb1a6c254..1fbab0cdd30 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -1,16 +1,12 @@
-/* netfilter.c: look after the filters for various protocols.
+/* netfilter.c: look after the filters for various protocols.
* Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
*
* Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
* way.
*
* Rusty Russell (C)2000 -- This code is GPL.
- *
- * February 2000: Modified by James Morris to have 1 queue per protocol.
- * 15-Mar-2000: Added NF_REPEAT --RR.
- * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
+ * Patrick McHardy (c) 2006-2012
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <net/protocol.h>
@@ -23,52 +19,117 @@
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include "nf_internals.h"
-/* In this code, we can be waiting indefinitely for userspace to
- * service a packet if a hook returns NF_QUEUE. We could keep a count
- * of skbuffs queued for userspace, and not deregister a hook unless
- * this is zero, but that sucks. Now, we simply check when the
- * packets come back: if the hook is gone, the packet is discarded. */
-struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
+static DEFINE_MUTEX(afinfo_mutex);
+
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+EXPORT_SYMBOL(nf_afinfo);
+const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ipv6_ops);
+
+int nf_register_afinfo(const struct nf_afinfo *afinfo)
+{
+ int err;
+
+ err = mutex_lock_interruptible(&afinfo_mutex);
+ if (err < 0)
+ return err;
+ RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
+ mutex_unlock(&afinfo_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_register_afinfo);
+
+void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
+{
+ mutex_lock(&afinfo_mutex);
+ RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL);
+ mutex_unlock(&afinfo_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
+
+struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
EXPORT_SYMBOL(nf_hooks);
-static DEFINE_SPINLOCK(nf_hook_lock);
+
+#if defined(CONFIG_JUMP_LABEL)
+struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+EXPORT_SYMBOL(nf_hooks_needed);
+#endif
+
+static DEFINE_MUTEX(nf_hook_mutex);
int nf_register_hook(struct nf_hook_ops *reg)
{
- struct list_head *i;
+ struct nf_hook_ops *elem;
+ int err;
- spin_lock_bh(&nf_hook_lock);
- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
- if (reg->priority < ((struct nf_hook_ops *)i)->priority)
+ err = mutex_lock_interruptible(&nf_hook_mutex);
+ if (err < 0)
+ return err;
+ list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+ if (reg->priority < elem->priority)
break;
}
- list_add_rcu(&reg->list, i->prev);
- spin_unlock_bh(&nf_hook_lock);
-
- synchronize_net();
+ list_add_rcu(&reg->list, elem->list.prev);
+ mutex_unlock(&nf_hook_mutex);
+#if defined(CONFIG_JUMP_LABEL)
+ static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
+#endif
return 0;
}
EXPORT_SYMBOL(nf_register_hook);
void nf_unregister_hook(struct nf_hook_ops *reg)
{
- spin_lock_bh(&nf_hook_lock);
+ mutex_lock(&nf_hook_mutex);
list_del_rcu(&reg->list);
- spin_unlock_bh(&nf_hook_lock);
-
+ mutex_unlock(&nf_hook_mutex);
+#if defined(CONFIG_JUMP_LABEL)
+ static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
+#endif
synchronize_net();
}
EXPORT_SYMBOL(nf_unregister_hook);
+int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = nf_register_hook(&reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ nf_unregister_hooks(reg, i);
+ return err;
+}
+EXPORT_SYMBOL(nf_register_hooks);
+
+void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ while (n-- > 0)
+ nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_hooks);
+
unsigned int nf_iterate(struct list_head *head,
- struct sk_buff **skb,
- int hook,
+ struct sk_buff *skb,
+ unsigned int hook,
const struct net_device *indev,
const struct net_device *outdev,
- struct list_head **i,
+ struct nf_hook_ops **elemp,
int (*okfn)(struct sk_buff *),
int hook_thresh)
{
@@ -78,27 +139,26 @@ unsigned int nf_iterate(struct list_head *head,
* The caller must not block between calls to this
* function because of risk of continuing from deleted element.
*/
- list_for_each_continue_rcu(*i, head) {
- struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
-
- if (hook_thresh > elem->priority)
+ list_for_each_entry_continue_rcu((*elemp), head, list) {
+ if (hook_thresh > (*elemp)->priority)
continue;
/* Optimization: we don't need to hold module
- reference here, since function can't sleep. --RR */
- verdict = elem->hook(hook, skb, indev, outdev, okfn);
+ reference here, since function can't sleep. --RR */
+repeat:
+ verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
> NF_MAX_VERDICT)) {
NFDEBUG("Evil return from %p(%u).\n",
- elem->hook, hook);
+ (*elemp)->hook, hook);
continue;
}
#endif
if (verdict != NF_REPEAT)
return verdict;
- *i = (*i)->prev;
+ goto repeat;
}
}
return NF_ACCEPT;
@@ -107,110 +167,164 @@ unsigned int nf_iterate(struct list_head *head,
/* Returns 1 if okfn() needs to be executed by the caller,
* -EPERM for NF_DROP, 0 otherwise. */
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
+int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
struct net_device *indev,
struct net_device *outdev,
int (*okfn)(struct sk_buff *),
int hook_thresh)
{
- struct list_head *elem;
+ struct nf_hook_ops *elem;
unsigned int verdict;
int ret = 0;
/* We may already have this, but read-locks nest anyway */
rcu_read_lock();
- elem = &nf_hooks[pf][hook];
+ elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list);
next_hook:
- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
+ verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
outdev, &elem, okfn, hook_thresh);
if (verdict == NF_ACCEPT || verdict == NF_STOP) {
ret = 1;
- goto unlock;
- } else if (verdict == NF_DROP) {
- kfree_skb(*pskb);
- ret = -EPERM;
- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- NFDEBUG("nf_hook: Verdict = QUEUE.\n");
- if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
+ kfree_skb(skb);
+ ret = NF_DROP_GETERR(verdict);
+ if (ret == 0)
+ ret = -EPERM;
+ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+ int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
}
-unlock:
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(nf_hook_slow);
-int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
+int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
{
- struct sk_buff *nskb;
-
- if (writable_len > (*pskb)->len)
+ if (writable_len > skb->len)
return 0;
/* Not exclusive use of packet? Must copy. */
- if (skb_shared(*pskb) || skb_cloned(*pskb))
- goto copy_skb;
+ if (!skb_cloned(skb)) {
+ if (writable_len <= skb_headlen(skb))
+ return 1;
+ } else if (skb_clone_writable(skb, writable_len))
+ return 1;
- return pskb_may_pull(*pskb, writable_len);
+ if (writable_len <= skb_headlen(skb))
+ writable_len = 0;
+ else
+ writable_len -= skb_headlen(skb);
-copy_skb:
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return 0;
- BUG_ON(skb_is_nonlinear(nskb));
-
- /* Rest of kernel will get very unhappy if we pass it a
- suddenly-orphaned skbuff */
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- return 1;
+ return !!__pskb_pull_tail(skb, writable_len);
}
EXPORT_SYMBOL(skb_make_writable);
-
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
+ __rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
-void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
- void (*attach)(struct sk_buff *, struct sk_buff *);
+ void (*attach)(struct sk_buff *, const struct sk_buff *);
- if (skb->nfct && (attach = ip_ct_attach) != NULL) {
- mb(); /* Just to be sure: must be read before executing this */
- attach(new, skb);
+ if (skb->nfct) {
+ rcu_read_lock();
+ attach = rcu_dereference(ip_ct_attach);
+ if (attach)
+ attach(new, skb);
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(nf_ct_attach);
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
+EXPORT_SYMBOL(nf_ct_destroy);
+
+void nf_conntrack_destroy(struct nf_conntrack *nfct)
+{
+ void (*destroy)(struct nf_conntrack *);
+
+ rcu_read_lock();
+ destroy = rcu_dereference(nf_ct_destroy);
+ BUG_ON(destroy == NULL);
+ destroy(nfct);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_conntrack_destroy);
+
+struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_hook);
+
+struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_nat_hook);
+
+#endif /* CONFIG_NF_CONNTRACK */
+
+#ifdef CONFIG_NF_NAT_NEEDED
+void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
+EXPORT_SYMBOL(nf_nat_decode_session_hook);
+#endif
+
+static int __net_init netfilter_net_init(struct net *net)
+{
#ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_net_netfilter;
-EXPORT_SYMBOL(proc_net_netfilter);
+ net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
+ net->proc_net);
+ if (!net->nf.proc_netfilter) {
+ if (!net_eq(net, &init_net))
+ pr_err("cannot create netfilter proc entry");
+
+ return -ENOMEM;
+ }
#endif
+ return 0;
+}
-void __init netfilter_init(void)
+static void __net_exit netfilter_net_exit(struct net *net)
{
- int i, h;
- for (i = 0; i < NPROTO; i++) {
+ remove_proc_entry("netfilter", net->proc_net);
+}
+
+static struct pernet_operations netfilter_net_ops = {
+ .init = netfilter_net_init,
+ .exit = netfilter_net_exit,
+};
+
+int __init netfilter_init(void)
+{
+ int i, h, ret;
+
+ for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
for (h = 0; h < NF_MAX_HOOKS; h++)
INIT_LIST_HEAD(&nf_hooks[i][h]);
}
-#ifdef CONFIG_PROC_FS
- proc_net_netfilter = proc_mkdir("netfilter", proc_net);
- if (!proc_net_netfilter)
- panic("cannot create netfilter proc entry");
-#endif
+ ret = register_pernet_subsys(&netfilter_net_ops);
+ if (ret < 0)
+ goto err;
- if (netfilter_queue_init() < 0)
- panic("cannot initialize nf_queue");
- if (netfilter_log_init() < 0)
- panic("cannot initialize nf_log");
+ ret = netfilter_log_init();
+ if (ret < 0)
+ goto err_pernet;
+
+ return 0;
+err_pernet:
+ unregister_pernet_subsys(&netfilter_net_ops);
+err:
+ return ret;
}
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
new file mode 100644
index 00000000000..2f7f5c32c6f
--- /dev/null
+++ b/net/netfilter/ipset/Kconfig
@@ -0,0 +1,159 @@
+menuconfig IP_SET
+ tristate "IP set support"
+ depends on INET && NETFILTER
+ select NETFILTER_NETLINK
+ help
+ This option adds IP set support to the kernel.
+ In order to define and use the sets, you need the userspace utility
+ ipset(8). You can use the sets in netfilter via the "set" match
+ and "SET" target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_SET
+
+config IP_SET_MAX
+ int "Maximum number of IP sets"
+ default 256
+ range 2 65534
+ depends on IP_SET
+ help
+ You can define here the default value of the maximum number
+ of IP sets for the kernel.
+
+ The value can be overridden by the 'max_sets' module
+ parameter of the 'ip_set' module.
+
+config IP_SET_BITMAP_IP
+ tristate "bitmap:ip set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip set type support, by which one
+ can store IPv4 addresses (or network addresses) from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_IPMAC
+ tristate "bitmap:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip,mac set type support, by which one
+ can store IPv4 address and (source) MAC address pairs from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_PORT
+ tristate "bitmap:port set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:port set type support, by which one
+ can store TCP/UDP port numbers from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IP
+ tristate "hash:ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip set type support, by which one
+ can store arbitrary IPv4 or IPv6 addresses (or network addresses)
+ in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPMARK
+ tristate "hash:ip,mark set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,mark set type support, by which one
+ can store IPv4/IPv6 address and mark pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORT
+ tristate "hash:ip,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port set type support, by which one
+ can store IPv4/IPv6 address and protocol/port pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTIP
+ tristate "hash:ip,port,ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,ip set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ address triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTNET
+ tristate "hash:ip,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,net set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ network address/prefix triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORTNET
+ tristate "hash:net,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port,net set type support, by which
+ one can store two IPv4/IPv6 subnets, and a protocol/port in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NET
+ tristate "hash:net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net set type support, by which
+ one can store IPv4/IPv6 network address/prefix elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETNET
+ tristate "hash:net,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,net set type support, by which
+ one can store IPv4/IPv6 network address/prefix pairs in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORT
+ tristate "hash:net,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ protocol/port pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETIFACE
+ tristate "hash:net,iface set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,iface set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ interface name pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_LIST_SET
+ tristate "list:set set support"
+ depends on IP_SET
+ help
+ This option adds the list:set set type support. In this
+ kind of set one can store the names of other sets and it forms
+ an ordered union of the member sets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
new file mode 100644
index 00000000000..231f10196cb
--- /dev/null
+++ b/net/netfilter/ipset/Makefile
@@ -0,0 +1,28 @@
+#
+# Makefile for the ipset modules
+#
+
+ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o
+
+# ipset core
+obj-$(CONFIG_IP_SET) += ip_set.o
+
+# bitmap types
+obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o
+obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o
+obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
+
+# hash types
+obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
+obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
+obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
+obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o
+obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o
+obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o
+
+# list types
+obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
new file mode 100644
index 00000000000..f2c7d83dc23
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -0,0 +1,289 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __IP_SET_BITMAP_IP_GEN_H
+#define __IP_SET_BITMAP_IP_GEN_H
+
+#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test)
+#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test)
+#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled)
+#define mtype_do_add IPSET_TOKEN(MTYPE, _do_add)
+#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
+#define mtype_do_del IPSET_TOKEN(MTYPE, _do_del)
+#define mtype_do_list IPSET_TOKEN(MTYPE, _do_list)
+#define mtype_do_head IPSET_TOKEN(MTYPE, _do_head)
+#define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem)
+#define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
+#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
+#define mtype_head IPSET_TOKEN(MTYPE, _head)
+#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_add IPSET_TOKEN(MTYPE, _add)
+#define mtype_del IPSET_TOKEN(MTYPE, _del)
+#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype MTYPE
+
+#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id))
+
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct mtype *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+mtype_ext_cleanup(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+ u32 id;
+
+ for (id = 0; id < map->elements; id++)
+ if (test_bit(id, map->members))
+ ip_set_ext_destroy(set, get_ext(set, map, id));
+}
+
+static void
+mtype_destroy(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ if (set->dsize) {
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ ip_set_free(map->extensions);
+ }
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+mtype_flush(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ memset(map->members, 0, map->memsize);
+}
+
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct mtype *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (mtype_do_head(skb, map) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) +
+ map->memsize +
+ set->dsize * map->elements)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+ int ret = mtype_do_test(e, map, set->dsize);
+
+ if (ret <= 0)
+ return ret;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ return 0;
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(x, set), ext, mext, flags);
+ return 1;
+}
+
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+ int ret = mtype_do_add(e, map, flags, set->dsize);
+
+ if (ret == IPSET_ADD_FAILED) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ ret = 0;
+ else if (!(flags & IPSET_FLAG_EXIST))
+ return -IPSET_ERR_EXIST;
+ /* Element is re-added, cleanup extensions */
+ ip_set_ext_destroy(set, x);
+ }
+
+ if (SET_WITH_TIMEOUT(set))
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret);
+#else
+ ip_set_timeout_set(ext_timeout(x, set), ext->timeout);
+#endif
+
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(x, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(x, set), ext);
+ return 0;
+}
+
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+
+ if (mtype_do_del(e, map))
+ return -IPSET_ERR_EXIST;
+
+ ip_set_ext_destroy(set, x);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+#ifndef IP_SET_BITMAP_STORED_TIMEOUT
+static inline bool
+mtype_is_filled(const struct mtype_elem *x)
+{
+ return true;
+}
+#endif
+
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct mtype *map = set->data;
+ struct nlattr *adt, *nested;
+ void *x;
+ u32 id, first = cb->args[IPSET_CB_ARG0];
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[IPSET_CB_ARG0] < map->elements;
+ cb->args[IPSET_CB_ARG0]++) {
+ id = cb->args[IPSET_CB_ARG0];
+ x = get_ext(set, map, id);
+ if (!test_bit(id, map->members) ||
+ (SET_WITH_TIMEOUT(set) &&
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_is_filled((const struct mtype_elem *) x) &&
+#endif
+ ip_set_timeout_expired(ext_timeout(x, set))))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_do_list(skb, map, id, set->dsize))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, x,
+ mtype_is_filled((const struct mtype_elem *) x)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ if (unlikely(id == first)) {
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, adt);
+ return 0;
+}
+
+static void
+mtype_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct mtype *map = set->data;
+ void *x;
+ u32 id;
+
+ /* We run in parallel with other readers (test element),
+ * but adding/deleting entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id < map->elements; id++)
+ if (mtype_gc_test(id, map, set->dsize)) {
+ x = get_ext(set, map, id);
+ if (ip_set_timeout_expired(ext_timeout(x, set))) {
+ clear_bit(id, map->members);
+ ip_set_ext_destroy(set, x);
+ }
+ }
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static const struct ip_set_type_variant mtype = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .same_set = mtype_same_set,
+};
+
+#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
new file mode 100644
index 00000000000..6f1f9f49480
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -0,0 +1,377 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comment support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:ip");
+
+#define MTYPE bitmap_ip
+
+/* Type structure: private data of one bitmap:ip set.
+ * A pointer to it is stored in set->data by init_map_ip().
+ */
+struct bitmap_ip {
+ void *members; /* the set members: bitmap indexed by element id */
+ void *extensions; /* data extensions, allocated only when set->dsize != 0 */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ u32 hosts; /* number of hosts in a subnet (divisor used by ip_to_id()) */
+ size_t memsize; /* members size, as passed to ip_set_alloc() */
+ u8 netmask; /* subnet netmask, 32 when elements are single addresses */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* ADT structure for generic function args:
+ * carries the element to add, delete or test.
+ */
+struct bitmap_ip_adt_elem {
+ u16 id; /* bit index into map->members, computed by ip_to_id() */
+};
+
+/* Map a host byte order address to its element index: mask the address
+ * with the set netmask, take the offset from first_ip and divide it by
+ * the number of hosts per element.
+ */
+static inline u32
+ip_to_id(const struct bitmap_ip *m, u32 ip)
+{
+	u32 offset = (ip & ip_set_hostmask(m->netmask)) - m->first_ip;
+
+	return offset / m->hosts;
+}
+
+/* Common functions */
+
+/* Return 1 if the element's bit is set in the members bitmap, else 0 */
+static inline int
+bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e,
+		  struct bitmap_ip *map, size_t dsize)
+{
+	return test_bit(e->id, map->members) ? 1 : 0;
+}
+
+/* Garbage collector check: 1 if element @id is present, else 0 */
+static inline int
+bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize)
+{
+	int present = !!test_bit(id, map->members);
+
+	return present;
+}
+
+/* Set the element's bit; returns 1 if it was already set, 0 otherwise */
+static inline int
+bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
+		 u32 flags, size_t dsize)
+{
+	return test_and_set_bit(e->id, map->members) ? 1 : 0;
+}
+
+/* Clear the element's bit; returns 1 if it was not set, 0 otherwise */
+static inline int
+bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
+{
+	return test_and_clear_bit(e->id, map->members) ? 0 : 1;
+}
+
+/* Emit the address of element @id as an IPSET_ATTR_IP attribute */
+static inline int
+bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
+		  size_t dsize)
+{
+	__be32 addr = htonl(map->first_ip + id * map->hosts);
+
+	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, addr);
+}
+
+/* Emit the set header attributes: first and last address, plus the
+ * netmask when it differs from 32. Returns 0 on success and 1 as soon
+ * as one of the puts fails.
+ */
+static inline int
+bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map)
+{
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)))
+		return 1;
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)))
+		return 1;
+	if (map->netmask == 32)
+		return 0;
+	return nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask) ? 1 : 0;
+}
+
+/* Kernel side add/del/test: take the source or destination address of
+ * the packet (selected by IPSET_DIM_ONE_SRC) and run the requested
+ * operation on the matching element.
+ */
+static int
+bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       const struct xt_action_param *par,
+	       enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct bitmap_ip *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct bitmap_ip_adt_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	u32 addr = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+
+	/* Addresses outside the range of the set cannot be stored */
+	if (!(addr >= map->first_ip && addr <= map->last_ip))
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	e.id = ip_to_id(map, addr);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/* Userspace add/del/test of a single address, an address range
+ * (IPSET_ATTR_IP_TO) or a CIDR block (IPSET_ATTR_CIDR).
+ *
+ * IPSET_TEST only checks the address given in IPSET_ATTR_IP. For add and
+ * del the operation is applied to every element of the range; "already
+ * exists"/"does not exist" results are ignored when ip_set_eexist() says so.
+ *
+ * Returns 0 on success, otherwise the (negative) error code of the failing
+ * step.
+ */
+static int
+bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
+	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	struct bitmap_ip *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 ip = 0, ip_to = 0;
+	struct bitmap_ip_adt_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Call the parsers one by one: "a() || b()" would turn any
+	 * error code into a plain 1.
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (ip < map->first_ip || ip > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	if (adt == IPSET_TEST) {
+		e.id = ip_to_id(map, ip);
+		return adtfn(set, &e, &ext, &ext, flags);
+	}
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(ip, ip_to, cidr);
+	} else
+		ip_to = ip;
+
+	/* Both the swap and the CIDR masking can move the start of the
+	 * range below first_ip, which would make ip_to_id() wrap around
+	 * and address a bit outside of the bitmap.
+	 */
+	if (ip < map->first_ip || ip_to > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	/* before() keeps the loop finite if ip wraps past 0xFFFFFFFF */
+	for (; !before(ip_to, ip); ip += map->hosts) {
+		e.id = ip_to_id(map, ip);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* Two bitmap:ip sets are "the same" when range, netmask, timeout and
+ * extension flags all agree.
+ */
+static bool
+bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_ip *x = a->data;
+	const struct bitmap_ip *y = b->data;
+
+	if (x->first_ip != y->first_ip || x->last_ip != y->last_ip)
+		return false;
+	if (x->netmask != y->netmask)
+		return false;
+	if (a->timeout != b->timeout)
+		return false;
+	return a->extensions == b->extensions;
+}
+
+/* Plain variant: the element structure is empty, and
+ * bitmap_ip_create() passes 0 as element length to ip_set_elem_len().
+ */
+
+struct bitmap_ip_elem {
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip type of sets */
+
+/* Allocate the members bitmap (and the extension area when set->dsize is
+ * non-zero), fill in @map and attach it to @set.
+ *
+ * map->memsize must already be set by the caller. Returns false when an
+ * allocation fails; nothing allocated here is left behind in that case,
+ * while @map itself still belongs to the caller.
+ */
+static bool
+init_map_ip(struct ip_set *set, struct bitmap_ip *map,
+	    u32 first_ip, u32 last_ip,
+	    u32 elements, u32 hosts, u8 netmask)
+{
+	map->members = ip_set_alloc(map->memsize);
+	if (!map->members)
+		return false;
+	if (set->dsize) {
+		map->extensions = ip_set_alloc(set->dsize * elements);
+		if (!map->extensions) {
+			/* Memory from ip_set_alloc() must be released by
+			 * ip_set_free(), not kfree(): the allocator is not
+			 * guaranteed to have used kmalloc.
+			 */
+			ip_set_free(map->members);
+			return false;
+		}
+	}
+	map->first_ip = first_ip;
+	map->last_ip = last_ip;
+	map->elements = elements;
+	map->hosts = hosts;
+	map->netmask = netmask;
+	set->timeout = IPSET_NO_TIMEOUT;
+
+	set->data = map;
+	set->family = NFPROTO_IPV4;
+
+	return true;
+}
+
+/* Create a bitmap:ip set from the netlink attributes in @tb.
+ *
+ * The range is given by IPSET_ATTR_IP together with either
+ * IPSET_ATTR_IP_TO or IPSET_ATTR_CIDR. With IPSET_ATTR_NETMASK below 32
+ * every element stands for a whole subnet of that size.
+ *
+ * Returns 0 on success, a negative ipset error code for invalid
+ * attributes or ranges, the error of ip_set_get_hostipaddr4(), or
+ * -ENOMEM.
+ */
+static int
+bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		 u32 flags)
+{
+	struct bitmap_ip *map;
+	u32 first_ip = 0, last_ip = 0, hosts;
+	u64 elements;
+	u8 netmask = 32;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+		if (ret)
+			return ret;
+		if (first_ip > last_ip)
+			swap(first_ip, last_ip);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr >= 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(first_ip, last_ip, cidr);
+	} else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_NETMASK]) {
+		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+		if (netmask > 32)
+			return -IPSET_ERR_INVALID_NETMASK;
+
+		/* Widen the range to whole subnets */
+		first_ip &= ip_set_hostmask(netmask);
+		last_ip |= ~ip_set_hostmask(netmask);
+	}
+
+	if (netmask == 32) {
+		hosts = 1;
+		elements = (u64)last_ip - first_ip + 1;
+	} else {
+		u8 mask_bits;
+		u32 mask;
+
+		mask = range_to_mask(first_ip, last_ip, &mask_bits);
+
+		if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) ||
+		    netmask <= mask_bits)
+			return -IPSET_ERR_BITMAP_RANGE;
+
+		pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
+		/* Shift unsigned constants: the shift count can reach 30
+		 * here and "2 << 30" does not fit into a signed int.
+		 */
+		hosts = 2U << (32 - netmask - 1);
+		elements = 2ULL << (netmask - mask_bits - 1);
+	}
+	if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+		return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+	pr_debug("hosts %u, elements %llu\n",
+		 hosts, (unsigned long long)elements);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	map->memsize = bitmap_bytes(0, elements - 1);
+	set->variant = &bitmap_ip;
+	set->dsize = ip_set_elem_len(set, tb, 0);
+	if (!init_map_ip(set, map, first_ip, last_ip,
+			 elements, hosts, netmask)) {
+		kfree(map);
+		return -ENOMEM;
+	}
+	/* The garbage collector is only needed for sets with timeout */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		bitmap_ip_gc_init(set, bitmap_ip_gc);
+	}
+	return 0;
+}
+
+static struct ip_set_type bitmap_ip_type __read_mostly = { /* type descriptor registered by bitmap_ip_init() */
+ .name = "bitmap:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_IPV4,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_ip_create,
+ .create_policy = { /* netlink attribute types for the attributes read by bitmap_ip_create() */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* netlink attribute types for add/del/test requests */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the bitmap:ip set type */
+static int __init
+bitmap_ip_init(void)
+{
+	int err = ip_set_type_register(&bitmap_ip_type);
+
+	return err;
+}
+
+/* Module exit: unregister the bitmap:ip set type */
+static void __exit
+bitmap_ip_fini(void)
+{
+	ip_set_type_unregister(&bitmap_ip_type);
+}
+
+module_init(bitmap_ip_init);
+module_exit(bitmap_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
new file mode 100644
index 00000000000..740eabededd
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -0,0 +1,414 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comment support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:ip,mac");
+
+#define MTYPE bitmap_ipmac
+#define IP_SET_BITMAP_STORED_TIMEOUT
+
+enum { /* values stored in bitmap_ipmac_elem.filled */
+ MAC_UNSET, /* element is set, without MAC */
+ MAC_FILLED, /* element is set with MAC */
+};
+
+/* Type structure: private data of one bitmap:ip,mac set.
+ * A pointer to it is stored in set->data by init_map_ipmac().
+ */
+struct bitmap_ipmac {
+ void *members; /* the set members: bitmap indexed by element id */
+ void *extensions; /* MAC + data extensions, one slot of set->dsize bytes per element */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ size_t memsize; /* members size, as passed to ip_set_alloc() */
+ struct timer_list gc; /* garbage collector */
+};
+
+/* ADT structure for generic function args:
+ * carries the element to add, delete or test.
+ */
+struct bitmap_ipmac_adt_elem {
+ u16 id; /* element index, computed by ip_to_id() */
+ unsigned char *ether; /* MAC address to use, or NULL when none was given */
+};
+
+struct bitmap_ipmac_elem { /* per-element data at the start of each extension slot, see get_elem() */
+ unsigned char ether[ETH_ALEN]; /* stored MAC address, meaningful when filled == MAC_FILLED */
+ unsigned char filled; /* MAC_UNSET or MAC_FILLED */
+} __attribute__ ((aligned));
+
+/* Element index of a host byte order address: its offset from first_ip */
+static inline u32
+ip_to_id(const struct bitmap_ipmac *m, u32 ip)
+{
+	u32 offset = ip - m->first_ip;
+
+	return offset;
+}
+
+/* Locate the slot of element @id in the extension area; every slot is
+ * @dsize bytes long.
+ */
+static inline struct bitmap_ipmac_elem *
+get_elem(void *extensions, u16 id, size_t dsize)
+{
+	void *slot = extensions + id * dsize;
+
+	return (struct bitmap_ipmac_elem *)slot;
+}
+
+/* Common functions */
+
+/* Test an element.
+ *
+ * Returns 0 when the element is not in the set, -EAGAIN when it is in
+ * the set but its MAC address is not filled in yet, and otherwise 1/0
+ * depending on whether the MAC matches (no MAC in @e always matches).
+ */
+static inline int
+bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
+		     const struct bitmap_ipmac *map, size_t dsize)
+{
+	const struct bitmap_ipmac_elem *stored;
+
+	if (!test_bit(e->id, map->members))
+		return 0;
+
+	stored = get_elem(map->extensions, e->id, dsize);
+	if (stored->filled != MAC_FILLED)
+		/* Trigger kernel to fill out the ethernet address */
+		return -EAGAIN;
+
+	if (e->ether == NULL)
+		return 1;
+	return ether_addr_equal(e->ether, stored->ether);
+}
+
+/* Garbage collector check: only elements which are present and have
+ * their MAC filled in are considered.
+ */
+static inline int
+bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
+{
+	/* Timer not started for the incomplete elements */
+	return test_bit(id, map->members) &&
+	       get_elem(map->extensions, id, dsize)->filled == MAC_FILLED;
+}
+
+/* Return 1 if the element has its MAC address stored, else 0 */
+static inline int
+bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)
+{
+	return elem->filled == MAC_FILLED ? 1 : 0;
+}
+
+/* Store or start the timeout of an element; always returns 0.
+ *
+ * With @mode == IPSET_ADD_START_STORED_TIMEOUT the timeout is started,
+ * using the previously stored plain value when the request carries the
+ * set default. Otherwise the timeout is started when a MAC is given and
+ * only stored as a plain value when it is not.
+ */
+static inline int
+bitmap_ipmac_add_timeout(unsigned long *timeout,
+			 const struct bitmap_ipmac_adt_elem *e,
+			 const struct ip_set_ext *ext, struct ip_set *set,
+			 struct bitmap_ipmac *map, int mode)
+{
+	int start_stored = (mode == IPSET_ADD_START_STORED_TIMEOUT);
+	u32 t = ext->timeout;
+
+	if (!start_stored && !e->ether) {
+		/* If MAC is unset yet, we store plain timeout value
+		 * because the timer is not activated yet
+		 * and we can reuse it later when MAC is filled out,
+		 * possibly by the kernel */
+		*timeout = t;
+		return 0;
+	}
+
+	if (start_stored && t == set->timeout)
+		/* Timeout was not specified, get stored one */
+		t = *timeout;
+
+	ip_set_timeout_set(timeout, t);
+	return 0;
+}
+
+/* Add an element, with or without MAC address.
+ *
+ * Returns 0 for a new element with MAC, IPSET_ADD_STORE_PLAIN_TIMEOUT
+ * for a new element without MAC, IPSET_ADD_START_STORED_TIMEOUT when
+ * the MAC of an incomplete element gets filled in and IPSET_ADD_FAILED
+ * when the element already exists.
+ */
+static inline int
+bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
+		    struct bitmap_ipmac *map, u32 flags, size_t dsize)
+{
+	struct bitmap_ipmac_elem *slot = get_elem(map->extensions, e->id, dsize);
+
+	if (!test_and_set_bit(e->id, map->members)) {
+		/* Brand new element */
+		if (!e->ether) {
+			slot->filled = MAC_UNSET;
+			/* MAC is not stored yet, don't start timer */
+			return IPSET_ADD_STORE_PLAIN_TIMEOUT;
+		}
+		/* We can store MAC too */
+		memcpy(slot->ether, e->ether, ETH_ALEN);
+		slot->filled = MAC_FILLED;
+		return 0;
+	}
+
+	/* The element was already in the set */
+	if (slot->filled == MAC_FILLED) {
+		if (e->ether && (flags & IPSET_FLAG_EXIST))
+			memcpy(slot->ether, e->ether, ETH_ALEN);
+		return IPSET_ADD_FAILED;
+	}
+	if (!e->ether)
+		/* Already added without ethernet address */
+		return IPSET_ADD_FAILED;
+
+	/* Fill the MAC address and trigger the timer activation */
+	memcpy(slot->ether, e->ether, ETH_ALEN);
+	slot->filled = MAC_FILLED;
+	return IPSET_ADD_START_STORED_TIMEOUT;
+}
+
+/* Clear the element's bit; returns 1 if it was not set, 0 otherwise */
+static inline int
+bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,
+		    struct bitmap_ipmac *map)
+{
+	return test_and_clear_bit(e->id, map->members) ? 0 : 1;
+}
+
+/* Emit element @id: its address and, when filled in, its MAC address.
+ * Returns 0 on success, 1 when a put fails.
+ */
+static inline int
+bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
+		     u32 id, size_t dsize)
+{
+	const struct bitmap_ipmac_elem *stored =
+		get_elem(map->extensions, id, dsize);
+
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)))
+		return 1;
+	if (stored->filled != MAC_FILLED)
+		return 0;
+	return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, stored->ether) ? 1 : 0;
+}
+
+/* Emit the set header attributes: first and last address of the range.
+ * Returns 0 on success, 1 when a put fails.
+ */
+static inline int
+bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map)
+{
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)))
+		return 1;
+	return nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO,
+			       htonl(map->last_ip)) ? 1 : 0;
+}
+
+/* Kernel side add/del/test: the address comes from the IP header
+ * (source or destination, selected by IPSET_DIM_ONE_SRC), the MAC is
+ * always the source address of the ethernet header.
+ */
+static int
+bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct bitmap_ipmac *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct bitmap_ipmac_adt_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	unsigned char *mac;
+	u32 addr;
+
+	/* MAC can be src only */
+	if (!(opt->flags & IPSET_DIM_TWO_SRC))
+		return 0;
+
+	addr = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+	if (!(addr >= map->first_ip && addr <= map->last_ip))
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	/* Backward compatibility: we don't check the second flag */
+	mac = skb_mac_header(skb);
+	if (mac < skb->head || (mac + ETH_HLEN) > skb->data)
+		return -EINVAL;
+
+	e.id = ip_to_id(map, addr);
+	e.ether = eth_hdr(skb)->h_source;
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/* Userspace add/del/test of one address with an optional MAC address
+ * (IPSET_ATTR_ETHER).
+ *
+ * Returns 0 on success (including results ip_set_eexist() tells us to
+ * ignore), otherwise the (negative) error code of the failing step.
+ */
+static int
+bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct bitmap_ipmac *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct bitmap_ipmac_adt_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip = 0;
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Call the parsers one by one: "a() || b()" would turn any
+	 * error code into a plain 1.
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (ip < map->first_ip || ip > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	e.id = ip_to_id(map, ip);
+	if (tb[IPSET_ATTR_ETHER]) {
+		/* The NLA_BINARY policy only limits the maximal length,
+		 * but ETH_ALEN bytes are read from the attribute later.
+		 */
+		if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
+			return -IPSET_ERR_PROTOCOL;
+		e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+	} else
+		e.ether = NULL;
+
+	ret = adtfn(set, &e, &ext, &ext, flags);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Two bitmap:ip,mac sets are "the same" when range, timeout and
+ * extension flags all agree.
+ */
+static bool
+bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_ipmac *x = a->data;
+	const struct bitmap_ipmac *y = b->data;
+
+	if (x->first_ip != y->first_ip || x->last_ip != y->last_ip)
+		return false;
+	if (a->timeout != b->timeout)
+		return false;
+	return a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip,mac type of sets */
+
+/* Allocate the members bitmap (and the extension area when set->dsize is
+ * non-zero), fill in @map and attach it to @set.
+ *
+ * map->memsize must already be set by the caller. Returns false when an
+ * allocation fails; nothing allocated here is left behind in that case,
+ * while @map itself still belongs to the caller.
+ */
+static bool
+init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
+	       u32 first_ip, u32 last_ip, u32 elements)
+{
+	map->members = ip_set_alloc(map->memsize);
+	if (!map->members)
+		return false;
+	if (set->dsize) {
+		map->extensions = ip_set_alloc(set->dsize * elements);
+		if (!map->extensions) {
+			/* Memory from ip_set_alloc() must be released by
+			 * ip_set_free(), not kfree(): the allocator is not
+			 * guaranteed to have used kmalloc.
+			 */
+			ip_set_free(map->members);
+			return false;
+		}
+	}
+	map->first_ip = first_ip;
+	map->last_ip = last_ip;
+	map->elements = elements;
+	set->timeout = IPSET_NO_TIMEOUT;
+
+	set->data = map;
+	set->family = NFPROTO_IPV4;
+
+	return true;
+}
+
+/* Create a bitmap:ip,mac set from the netlink attributes in @tb.
+ *
+ * The range is given by IPSET_ATTR_IP together with either
+ * IPSET_ATTR_IP_TO or IPSET_ATTR_CIDR. Returns 0 on success, a negative
+ * ipset error code for invalid attributes or ranges, the error of
+ * ip_set_get_hostipaddr4(), or -ENOMEM.
+ */
+static int
+bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		    u32 flags)
+{
+	u32 first_ip = 0, last_ip = 0;
+	u64 elements;
+	struct bitmap_ipmac *map;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+		if (ret)
+			return ret;
+		/* Accept the two ends of the range in any order */
+		if (last_ip < first_ip) {
+			u32 lower = last_ip;
+
+			last_ip = first_ip;
+			first_ip = lower;
+		}
+	} else if (!tb[IPSET_ATTR_CIDR]) {
+		return -IPSET_ERR_PROTOCOL;
+	} else {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr >= 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(first_ip, last_ip, cidr);
+	}
+
+	elements = (u64)last_ip - first_ip + 1;
+	if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+		return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	map->memsize = bitmap_bytes(0, elements - 1);
+	set->variant = &bitmap_ipmac;
+	set->dsize = ip_set_elem_len(set, tb,
+				     sizeof(struct bitmap_ipmac_elem));
+	if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
+		kfree(map);
+		return -ENOMEM;
+	}
+
+	/* The garbage collector is only needed for sets with timeout */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		bitmap_ipmac_gc_init(set, bitmap_ipmac_gc);
+	}
+	return 0;
+}
+
+static struct ip_set_type bitmap_ipmac_type = { /* type descriptor registered by bitmap_ipmac_init() */
+ .name = "bitmap:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_IPV4,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_ipmac_create,
+ .create_policy = { /* netlink attribute types for the attributes read by bitmap_ipmac_create() */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* netlink attribute types for add/del/test requests */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the bitmap:ip,mac set type */
+static int __init
+bitmap_ipmac_init(void)
+{
+	int err = ip_set_type_register(&bitmap_ipmac_type);
+
+	return err;
+}
+
+/* Module exit: unregister the bitmap:ip,mac set type */
+static void __exit
+bitmap_ipmac_fini(void)
+{
+	ip_set_type_unregister(&bitmap_ipmac_type);
+}
+
+module_init(bitmap_ipmac_init);
+module_exit(bitmap_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
new file mode 100644
index 00000000000..cf99676e69f
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -0,0 +1,311 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:port type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comment support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:port");
+
+#define MTYPE bitmap_port
+
+/* Type structure: private data of one bitmap:port set.
+ * A pointer to it is stored in set->data by init_map_port().
+ */
+struct bitmap_port {
+ void *members; /* the set members: bitmap indexed by element id */
+ void *extensions; /* data extensions, allocated only when set->dsize != 0 */
+ u16 first_port; /* host byte order, included in range */
+ u16 last_port; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ size_t memsize; /* members size, as passed to ip_set_alloc() */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* ADT structure for generic function args:
+ * carries the element to add, delete or test.
+ */
+struct bitmap_port_adt_elem {
+ u16 id; /* bit index into map->members, computed by port_to_id() */
+};
+
+/* Element index of a host byte order port: its offset from first_port */
+static inline u16
+port_to_id(const struct bitmap_port *m, u16 port)
+{
+	u16 offset = port - m->first_port;
+
+	return offset;
+}
+
+/* Common functions */
+
+/* Return 1 if the element's bit is set in the members bitmap, else 0 */
+static inline int
+bitmap_port_do_test(const struct bitmap_port_adt_elem *e,
+		    const struct bitmap_port *map, size_t dsize)
+{
+	return test_bit(e->id, map->members) ? 1 : 0;
+}
+
+/* Garbage collector check: 1 if element @id is present, else 0 */
+static inline int
+bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize)
+{
+	int present = !!test_bit(id, map->members);
+
+	return present;
+}
+
+/* Set the element's bit; returns 1 if it was already set, 0 otherwise */
+static inline int
+bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
+		   struct bitmap_port *map, u32 flags, size_t dsize)
+{
+	return test_and_set_bit(e->id, map->members) ? 1 : 0;
+}
+
+/* Clear the element's bit; returns 1 if it was not set, 0 otherwise */
+static inline int
+bitmap_port_do_del(const struct bitmap_port_adt_elem *e,
+		   struct bitmap_port *map)
+{
+	return test_and_clear_bit(e->id, map->members) ? 0 : 1;
+}
+
+/* Emit the port of element @id as an IPSET_ATTR_PORT attribute */
+static inline int
+bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
+		    size_t dsize)
+{
+	__be16 port = htons(map->first_port + id);
+
+	return nla_put_net16(skb, IPSET_ATTR_PORT, port);
+}
+
+/* Emit the set header attributes: first and last port of the range.
+ * Returns 0 on success, 1 when a put fails.
+ */
+static inline int
+bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
+{
+	if (nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)))
+		return 1;
+	return nla_put_net16(skb, IPSET_ATTR_PORT_TO,
+			     htons(map->last_port)) ? 1 : 0;
+}
+
+/* Kernel side add/del/test: extract the source or destination port of
+ * the packet (selected by IPSET_DIM_ONE_SRC) and run the requested
+ * operation on the matching element.
+ */
+static int
+bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+		 const struct xt_action_param *par,
+		 enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct bitmap_port *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct bitmap_port_adt_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	__be16 be_port;
+	u16 port;
+
+	if (!ip_set_get_ip_port(skb, opt->family,
+				opt->flags & IPSET_DIM_ONE_SRC, &be_port))
+		return -EINVAL;
+
+	port = ntohs(be_port);
+	if (!(port >= map->first_port && port <= map->last_port))
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	e.id = port_to_id(map, port);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/* Userspace add/del/test of a single port or a port range
+ * (IPSET_ATTR_PORT_TO).
+ *
+ * IPSET_TEST only checks the port given in IPSET_ATTR_PORT. For add and
+ * del the operation is applied to every port of the range; results that
+ * ip_set_eexist() tells us to ignore do not stop the loop.
+ */
+static int
+bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
+		 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	struct bitmap_port *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct bitmap_port_adt_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 port;	/* wraparound */
+	u16 port_to;
+	int ret = 0;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	if (!(port >= map->first_port && port <= map->last_port))
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (adt == IPSET_TEST) {
+		e.id = port_to_id(map, port);
+		return adtfn(set, &e, &ext, &ext, flags);
+	}
+
+	/* Without an explicit end the range is the single port */
+	port_to = tb[IPSET_ATTR_PORT_TO] ?
+		  ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]) : port;
+	if (port > port_to) {
+		swap(port, port_to);
+		if (port < map->first_port)
+			return -IPSET_ERR_BITMAP_RANGE;
+	}
+
+	if (port_to > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	while (port <= port_to) {
+		e.id = port_to_id(map, port);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		port++;
+	}
+	return 0;
+}
+
+/* Two bitmap:port sets are "the same" when range, timeout and
+ * extension flags all agree.
+ */
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_port *x = a->data;
+	const struct bitmap_port *y = b->data;
+
+	if (x->first_port != y->first_port || x->last_port != y->last_port)
+		return false;
+	if (a->timeout != b->timeout)
+		return false;
+	return a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+struct bitmap_port_elem { /* empty: bitmap_port_create() passes 0 as element length to ip_set_elem_len() */
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:port type of sets */
+
+/* Allocate the members bitmap (and the extension area when set->dsize is
+ * non-zero), fill in @map and attach it to @set.
+ *
+ * map->memsize and map->elements must already be set by the caller.
+ * Returns false when an allocation fails; nothing allocated here is left
+ * behind in that case, while @map itself still belongs to the caller.
+ */
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+	      u16 first_port, u16 last_port)
+{
+	map->members = ip_set_alloc(map->memsize);
+	if (!map->members)
+		return false;
+	if (set->dsize) {
+		map->extensions = ip_set_alloc(set->dsize * map->elements);
+		if (!map->extensions) {
+			/* Memory from ip_set_alloc() must be released by
+			 * ip_set_free(), not kfree(): the allocator is not
+			 * guaranteed to have used kmalloc.
+			 */
+			ip_set_free(map->members);
+			return false;
+		}
+	}
+	map->first_port = first_port;
+	map->last_port = last_port;
+	set->timeout = IPSET_NO_TIMEOUT;
+
+	set->data = map;
+	set->family = NFPROTO_UNSPEC;
+
+	return true;
+}
+
+/* Create a bitmap:port set from the netlink attributes in @tb.
+ *
+ * Both IPSET_ATTR_PORT and IPSET_ATTR_PORT_TO are mandatory and may be
+ * given in any order. Returns 0 on success, -IPSET_ERR_PROTOCOL for
+ * missing or malformed attributes, or -ENOMEM.
+ */
+static int
+bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		   u32 flags)
+{
+	struct bitmap_port *map;
+	u16 first_port, last_port;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	/* Accept the two ends of the range in any order */
+	if (last_port < first_port) {
+		u16 lower = last_port;
+
+		last_port = first_port;
+		first_port = lower;
+	}
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	map->elements = last_port - first_port + 1;
+	map->memsize = bitmap_bytes(0, map->elements);
+	set->variant = &bitmap_port;
+	set->dsize = ip_set_elem_len(set, tb, 0);
+	if (!init_map_port(set, map, first_port, last_port)) {
+		kfree(map);
+		return -ENOMEM;
+	}
+
+	/* The garbage collector is only needed for sets with timeout */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		bitmap_port_gc_init(set, bitmap_port_gc);
+	}
+	return 0;
+}
+
+static struct ip_set_type bitmap_port_type = { /* type descriptor registered by bitmap_port_init() */
+ .name = "bitmap:port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_port_create,
+ .create_policy = { /* netlink attribute types for the attributes read by bitmap_port_create() */
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* netlink attribute types for add/del/test requests */
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the bitmap:port set type */
+static int __init
+bitmap_port_init(void)
+{
+	int err = ip_set_type_register(&bitmap_port_type);
+
+	return err;
+}
+
+/* Module exit: unregister the bitmap:port set type */
+static void __exit
+bitmap_port_fini(void)
+{
+	ip_set_type_unregister(&bitmap_port_type);
+}
+
+module_init(bitmap_port_init);
+module_exit(bitmap_port_fini);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
new file mode 100644
index 00000000000..ec8114fae50
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -0,0 +1,2011 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module for IP set management */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/rculist.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+static LIST_HEAD(ip_set_type_list); /* all registered set types */
+static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */
+static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */
+
+struct ip_set_net { /* per network namespace state, fetched with ip_set_pernet() */
+ struct ip_set * __rcu *ip_set_list; /* all individual sets: RCU-published array indexed by set id */
+ ip_set_id_t ip_set_max; /* max number of sets, i.e. the number of slots in ip_set_list */
+ int is_deleted; /* deleted by ip_set_net_exit; checked by ip_set_nfnl_put() */
+};
+static int ip_set_net_id __read_mostly;
+
+/* Return the ipset state that belongs to network namespace @net. */
+static inline struct ip_set_net *ip_set_pernet(struct net *net)
+{
+	struct ip_set_net *inst = net_generic(net, ip_set_net_id);
+
+	return inst;
+}
+
+#define IP_SET_INC 64
+#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+
+/* Module parameter "max_sets" (mode 0600): maximal number of sets. */
+static unsigned int max_sets;
+
+/* The backing variable is unsigned, so it is exposed as "uint"; the
+ * "int" parameter type would accept negative input and store it as a
+ * huge unsigned value. */
+module_param(max_sets, uint, 0600);
+MODULE_PARM_DESC(max_sets, "maximal number of sets");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("core IP set support");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
+
+/* When the nfnl mutex is held:
+ * ip_set_dereference() reads an RCU pointer without rcu_read_lock().
+ * The condition handed to rcu_dereference_protected() is the constant
+ * 1, so nothing verifies that the mutex really is held; callers must
+ * guarantee it. ip_set(inst, id) names slot @id of the set array and
+ * is also used as an assignment target in this file. */
+#define ip_set_dereference(p) \
+ rcu_dereference_protected(p, 1)
+#define ip_set(inst, id) \
+ ip_set_dereference((inst)->ip_set_list)[id]
+
+/*
+ * The set types are implemented in modules and registered set types
+ * can be found in ip_set_type_list. Adding/deleting types is
+ * serialized by ip_set_type_mutex.
+ *
+ * ip_set_type_lock() and ip_set_type_unlock() are thin wrappers that
+ * take and release that mutex; they may sleep, so they must not be
+ * called from atomic context.
+ */
+
+static inline void
+ip_set_type_lock(void)
+{
+ mutex_lock(&ip_set_type_mutex);
+}
+
+static inline void
+ip_set_type_unlock(void)
+{
+ mutex_unlock(&ip_set_type_mutex);
+}
+
+/* Register and deregister settype */
+
+/* Look up a registered set type.
+ *
+ * A type matches when its name equals @name, it serves @family or is
+ * family independent (NFPROTO_UNSPEC), and @revision lies within its
+ * [revision_min, revision_max] range. Returns the first match on
+ * ip_set_type_list or NULL. The list is walked with the RCU iterator;
+ * the callers in this file hold rcu_read_lock() or ip_set_type_mutex.
+ */
+static struct ip_set_type *
+find_set_type(const char *name, u8 family, u8 revision)
+{
+	struct ip_set_type *entry;
+
+	list_for_each_entry_rcu(entry, &ip_set_type_list, list) {
+		if (!STREQ(entry->name, name))
+			continue;
+		if (entry->family != family &&
+		    entry->family != NFPROTO_UNSPEC)
+			continue;
+		if (revision < entry->revision_min ||
+		    revision > entry->revision_max)
+			continue;
+		return entry;
+	}
+	return NULL;
+}
+
+/* Try to load the module that provides set type @name.
+ *
+ * The nfnl mutex of the ipset subsystem is released around
+ * request_module() and taken again before returning, on success and
+ * on failure alike. Returns true unless request_module() reported an
+ * error.
+ */
+static bool
+load_settype(const char *name)
+{
+	bool loaded;
+
+	nfnl_unlock(NFNL_SUBSYS_IPSET);
+	pr_debug("try to load ip_set_%s\n", name);
+	loaded = request_module("ip_set_%s", name) >= 0;
+	if (!loaded)
+		pr_warning("Can't find ip_set type %s\n", name);
+	nfnl_lock(NFNL_SUBSYS_IPSET);
+
+	return loaded;
+}
+
+/* Find a set type and reference it
+ *
+ * On success 0 is returned, *found points to the type and a reference
+ * on its module is held. When no type with this name is registered at
+ * all, the lookup is repeated once after trying to load the module
+ * (second pass, retry == true). Failures are reported as
+ * -IPSET_ERR_FIND_TYPE, or -EFAULT when the module reference cannot be
+ * taken. */
+#define find_set_type_get(name, family, revision, found) \
+ __find_set_type_get(name, family, revision, found, false)
+
+static int
+__find_set_type_get(const char *name, u8 family, u8 revision,
+ struct ip_set_type **found, bool retry)
+{
+ struct ip_set_type *type;
+ int err;
+
+ if (retry && !load_settype(name))
+ return -IPSET_ERR_FIND_TYPE;
+
+ rcu_read_lock();
+ *found = find_set_type(name, family, revision);
+ if (*found) {
+ err = !try_module_get((*found)->me) ? -EFAULT : 0; /* *found stays set even if this fails */
+ goto unlock;
+ }
+ /* Make sure the type is already loaded
+ * but we don't support the revision.
+ * The match below is by name only, so a family mismatch
+ * ends up here as well; loading a module is pointless then. */
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name)) {
+ err = -IPSET_ERR_FIND_TYPE;
+ goto unlock;
+ }
+ rcu_read_unlock();
+
+ return retry ? -IPSET_ERR_FIND_TYPE :
+ __find_set_type_get(name, family, revision, found, true); /* recursion depth is at most one */
+
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+/* Find a given set type by name and family and report which
+ * revisions are supported.
+ *
+ * On success 0 is returned and *min / *max hold the smallest
+ * revision_min and the largest revision_max over all registered types
+ * that match. If nothing matches, the type module is requested once
+ * and the search is repeated; -IPSET_ERR_FIND_TYPE is returned when
+ * that does not help either.
+ */
+#define find_set_type_minmax(name, family, min, max) \
+	__find_set_type_minmax(name, family, min, max, false)
+
+static int
+__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
+		       bool retry)
+{
+	struct ip_set_type *entry;
+	bool matched = false;
+
+	if (retry && !load_settype(name))
+		return -IPSET_ERR_FIND_TYPE;
+
+	/* Start from an empty range and widen it with every match */
+	*min = 255;
+	*max = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &ip_set_type_list, list) {
+		if (!STREQ(entry->name, name))
+			continue;
+		if (entry->family != family &&
+		    entry->family != NFPROTO_UNSPEC)
+			continue;
+		matched = true;
+		if (entry->revision_min < *min)
+			*min = entry->revision_min;
+		if (entry->revision_max > *max)
+			*max = entry->revision_max;
+	}
+	rcu_read_unlock();
+
+	if (matched)
+		return 0;
+	if (retry)
+		return -IPSET_ERR_FIND_TYPE;
+
+	return __find_set_type_minmax(name, family, min, max, true);
+}
+
+/* Printable name of a netfilter protocol family, for log messages */
+#define family_name(f)	((f) == NFPROTO_IPV4 ? "inet" : \
+			 (f) == NFPROTO_IPV6 ? "inet6" : "any")
+
+/* Register a set type structure. The type is identified by
+ * the unique triple of name, family and revision.
+ *
+ * Returns 0 on success. -EINVAL is returned when the type was built
+ * for a protocol version other than IPSET_PROTOCOL, or when a type
+ * matching the same name, family and minimum revision is already on
+ * the list.
+ */
+int
+ip_set_type_register(struct ip_set_type *type)
+{
+	int ret = 0;
+
+	if (type->protocol != IPSET_PROTOCOL) {
+		pr_warning("ip_set type %s, family %s, revision %u:%u uses "
+			   "wrong protocol version %u (want %u)\n",
+			   type->name, family_name(type->family),
+			   type->revision_min, type->revision_max,
+			   type->protocol, IPSET_PROTOCOL);
+		return -EINVAL;
+	}
+
+	ip_set_type_lock();
+	if (find_set_type(type->name, type->family, type->revision_min)) {
+		/* Duplicate! */
+		pr_warning("ip_set type %s, family %s with revision min %u "
+			   "already registered!\n", type->name,
+			   family_name(type->family), type->revision_min);
+		ret = -EINVAL;
+	} else {
+		list_add_rcu(&type->list, &ip_set_type_list);
+		pr_debug("type %s, family %s, revision %u:%u registered.\n",
+			 type->name, family_name(type->family),
+			 type->revision_min, type->revision_max);
+	}
+	ip_set_type_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_type_register);
+
+/* Unregister a set type. There's a small race with ip_set_create.
+ *
+ * A type that is not on the list is only reported with a warning.
+ * In every case the function waits for an RCU grace period before it
+ * returns, so it must be called from a context that may sleep.
+ */
+void
+ip_set_type_unregister(struct ip_set_type *type)
+{
+	ip_set_type_lock();
+	if (find_set_type(type->name, type->family, type->revision_min)) {
+		list_del_rcu(&type->list);
+		pr_debug("type %s, family %s with revision min %u unregistered.\n",
+			 type->name, family_name(type->family),
+			 type->revision_min);
+	} else {
+		pr_warning("ip_set type %s, family %s with revision min %u "
+			   "not registered\n", type->name,
+			   family_name(type->family), type->revision_min);
+	}
+	ip_set_type_unlock();
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ip_set_type_unregister);
+
+/* Utility functions */
+/* Allocate @size bytes of zeroed memory.
+ *
+ * kzalloc() is tried first for sizes below KMALLOC_MAX_SIZE (without
+ * an allocation failure warning); if that is skipped or fails,
+ * vzalloc() is used. Returns NULL when both fail. Release the result
+ * with ip_set_free().
+ */
+void *
+ip_set_alloc(size_t size)
+{
+	void *mem = NULL;
+
+	if (size < KMALLOC_MAX_SIZE)
+		mem = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+	if (mem) {
+		pr_debug("%p: allocated with kmalloc\n", mem);
+		return mem;
+	}
+
+	mem = vzalloc(size);
+	if (mem)
+		pr_debug("%p: allocated with vmalloc\n", mem);
+
+	return mem;
+}
+EXPORT_SYMBOL_GPL(ip_set_alloc);
+
+/* Release memory obtained from ip_set_alloc(). kvfree() copes with
+ * both the kmalloc and the vmalloc case; the check below only feeds
+ * the debug message. */
+void
+ip_set_free(void *members)
+{
+	const char *how = is_vmalloc_addr(members) ? "vfree" : "kfree";
+
+	pr_debug("%p: free with %s\n", members, how);
+	kvfree(members);
+}
+EXPORT_SYMBOL_GPL(ip_set_free);
+
+/* True when attribute @nla carries the NLA_F_NESTED flag in its type. */
+static inline bool flag_nested(const struct nlattr *nla)
+{
+	return (nla->nla_type & NLA_F_NESTED) != 0;
+}
+
+static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { /* for the nested address attribute parsed by ip_set_get_ipaddr4/6() */
+ [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
+ [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) }, /* same size that ip_set_get_ipaddr6() copies out */
+};
+
+/* Extract an IPv4 address from the nested attribute @nla.
+ *
+ * @nla must carry the nested flag, parse cleanly against
+ * ipaddr_policy and contain an IPSET_ATTR_IPADDR_IPV4 member that
+ * passes ip_set_attr_netorder(). On success 0 is returned and the
+ * address is stored in *ipaddr as __be32; every violation yields
+ * -IPSET_ERR_PROTOCOL and leaves *ipaddr untouched.
+ */
+int
+ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
+{
+	struct nlattr *attrs[IPSET_ATTR_IPADDR_MAX + 1];
+
+	if (unlikely(!flag_nested(nla)) ||
+	    nla_parse_nested(attrs, IPSET_ATTR_IPADDR_MAX, nla,
+			     ipaddr_policy) ||
+	    unlikely(!ip_set_attr_netorder(attrs, IPSET_ATTR_IPADDR_IPV4)))
+		return -IPSET_ERR_PROTOCOL;
+
+	*ipaddr = nla_get_be32(attrs[IPSET_ATTR_IPADDR_IPV4]);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
+
+/* Extract an IPv6 address from the nested attribute @nla.
+ *
+ * Same checks as ip_set_get_ipaddr4(), but for the
+ * IPSET_ATTR_IPADDR_IPV6 member. On success 0 is returned and
+ * sizeof(struct in6_addr) bytes are copied to the start of *ipaddr;
+ * every violation yields -IPSET_ERR_PROTOCOL.
+ */
+int
+ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
+{
+	struct nlattr *attrs[IPSET_ATTR_IPADDR_MAX + 1];
+
+	if (unlikely(!flag_nested(nla)) ||
+	    nla_parse_nested(attrs, IPSET_ATTR_IPADDR_MAX, nla,
+			     ipaddr_policy) ||
+	    unlikely(!ip_set_attr_netorder(attrs, IPSET_ATTR_IPADDR_IPV6)))
+		return -IPSET_ERR_PROTOCOL;
+
+	memcpy(ipaddr, nla_data(attrs[IPSET_ATTR_IPADDR_IPV6]),
+	       sizeof(struct in6_addr));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+
+typedef void (*destroyer)(void *); /* type the .destroy initializer below is cast to */
+/* ipset data extension types, in size order.
+ * The table is indexed by extension id; ip_set_elem_len() walks it in
+ * index order to lay the selected extensions out after the element. */
+
+const struct ip_set_ext_type ip_set_extensions[] = {
+ [IPSET_EXT_ID_COUNTER] = {
+ .type = IPSET_EXT_COUNTER,
+ .flag = IPSET_FLAG_WITH_COUNTERS,
+ .len = sizeof(struct ip_set_counter),
+ .align = __alignof__(struct ip_set_counter),
+ },
+ [IPSET_EXT_ID_TIMEOUT] = {
+ .type = IPSET_EXT_TIMEOUT, /* no .flag: add_extension() keys on IPSET_ATTR_TIMEOUT instead */
+ .len = sizeof(unsigned long),
+ .align = __alignof__(unsigned long),
+ },
+ [IPSET_EXT_ID_COMMENT] = {
+ .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY,
+ .flag = IPSET_FLAG_WITH_COMMENT,
+ .len = sizeof(struct ip_set_comment),
+ .align = __alignof__(struct ip_set_comment),
+ .destroy = (destroyer) ip_set_comment_free, /* the only extension here with a destroy hook */
+ },
+};
+EXPORT_SYMBOL_GPL(ip_set_extensions);
+
+/* Decide whether extension @id is requested for the set being created.
+ * Extensions that define a flag are selected by that bit in @flags;
+ * an extension without a flag is selected by the presence of
+ * IPSET_ATTR_TIMEOUT in @tb. */
+static inline bool
+add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
+{
+	if (ip_set_extensions[id].flag)
+		return flags & ip_set_extensions[id].flag;
+
+	return !!tb[IPSET_ATTR_TIMEOUT];
+}
+
+/* Work out the element size of a set including its extensions.
+ *
+ * @len is the size supplied by the set type for its own element data.
+ * For every extension selected by the create attributes in @tb the
+ * function records an offset in set->offset[], ORs the extension type
+ * into set->extensions and accounts for the extension length. The
+ * FORCEADD create flag is translated into set->flags on the way.
+ * Returns @len plus the accumulated extension offset.
+ */
+size_t
+ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
+{
+	const struct ip_set_ext_type *ext;
+	enum ip_set_ext_id id;
+	size_t offset = 0;
+	u32 cadt_flags = tb[IPSET_ATTR_CADT_FLAGS] ?
+			 ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]) : 0;
+
+	if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
+		set->flags |= IPSET_CREATE_FLAG_FORCEADD;
+
+	for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
+		if (!add_extension(id, cadt_flags, tb))
+			continue;
+		ext = &ip_set_extensions[id];
+		/* NOTE(review): this adds the aligned value of
+		 * (len + offset) on top of the running offset, so from
+		 * the second selected extension on the offset grows by
+		 * far more than the alignment padding, and the return
+		 * value counts @len again. Confirm the intended layout
+		 * before changing it; users of set->offset[] depend on
+		 * the current values. */
+		offset += ALIGN(len + offset, ext->align);
+		set->offset[id] = offset;
+		set->extensions |= ext->type;
+		offset += ext->len;
+	}
+
+	return len + offset;
+}
+EXPORT_SYMBOL_GPL(ip_set_elem_len);
+
+/* Copy the extension values of an add/del/test request into @ext.
+ *
+ * Each attribute present in @tb must correspond to an extension that
+ * is enabled on @set, otherwise -IPSET_ERR_TIMEOUT,
+ * -IPSET_ERR_COUNTER or -IPSET_ERR_COMMENT is returned. Fields of
+ * @ext whose attribute is absent are left untouched. Returns 0 on
+ * success.
+ */
+int
+ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
+		      struct ip_set_ext *ext)
+{
+	struct nlattr *timeout = tb[IPSET_ATTR_TIMEOUT];
+	struct nlattr *bytes = tb[IPSET_ATTR_BYTES];
+	struct nlattr *packets = tb[IPSET_ATTR_PACKETS];
+	struct nlattr *comment = tb[IPSET_ATTR_COMMENT];
+
+	if (timeout) {
+		if (!(set->extensions & IPSET_EXT_TIMEOUT))
+			return -IPSET_ERR_TIMEOUT;
+		ext->timeout = ip_set_timeout_uget(timeout);
+	}
+
+	if (bytes || packets) {
+		if (!(set->extensions & IPSET_EXT_COUNTER))
+			return -IPSET_ERR_COUNTER;
+		/* Counters travel as 64 bit big endian values */
+		if (bytes)
+			ext->bytes = be64_to_cpu(nla_get_be64(bytes));
+		if (packets)
+			ext->packets = be64_to_cpu(nla_get_be64(packets));
+	}
+
+	if (comment) {
+		if (!(set->extensions & IPSET_EXT_COMMENT))
+			return -IPSET_ERR_COMMENT;
+		ext->comment = ip_set_comment_uget(comment);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_extensions);
+
+/*
+ * Creating/destroying/renaming/swapping affect the existence and
+ * the properties of a set. All of these can be executed from userspace
+ * only and serialized by the nfnl mutex indirectly from nfnetlink.
+ *
+ * Sets are identified by their index in ip_set_list and the index
+ * is used by the external references (set/SET netfilter modules).
+ *
+ * The set behind an index may change by swapping only, from userspace.
+ */
+
+/* Take one reference on @set. The counter is a plain integer that is
+ * only modified under ip_set_ref_lock held for writing. */
+static inline void __ip_set_get(struct ip_set *set)
+{
+	write_lock_bh(&ip_set_ref_lock);
+	set->ref++;
+	write_unlock_bh(&ip_set_ref_lock);
+}
+
+/* Drop one reference on @set under ip_set_ref_lock. Dropping a
+ * reference that was never taken is treated as a fatal bug. */
+static inline void __ip_set_put(struct ip_set *set)
+{
+	write_lock_bh(&ip_set_ref_lock);
+	BUG_ON(set->ref == 0);
+	set->ref--;
+	write_unlock_bh(&ip_set_ref_lock);
+}
+
+/*
+ * Add, del and test set entries from kernel.
+ *
+ * The set behind the index must exist and must be referenced
+ * so it can't be destroyed (or changed) under our feet.
+ */
+
+/* Return the set stored at @index in the set array of @net.
+ *
+ * The RCU read side section covers only the array lookup. The
+ * returned pointer is used after the section ends, so the caller must
+ * already hold a reference that keeps the set alive. @index is not
+ * range checked here.
+ */
+static inline struct ip_set *
+ip_set_rcu_get(struct net *net, ip_set_id_t index)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set *found;
+
+	rcu_read_lock();
+	/* ip_set_list itself needs to be protected */
+	found = rcu_dereference(inst->ip_set_list)[index];
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Test from the packet path whether @skb matches the set at @index.
+ *
+ * Returns 0 for "no match": when the caller supplies fewer dimensions
+ * than the set type needs, when the families differ, or when the type
+ * reports an error. A positive value means match. If the type answers
+ * -EAGAIN the element is added under the write lock and 1 is
+ * returned. With IPSET_FLAG_RETURN_NOMATCH on a type that supports
+ * nomatch, a positive result and -ENOTEMPTY are negated before the
+ * final conversion.
+ */
+int
+ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+	struct net *net = dev_net(par->in ? par->in : par->out);
+	struct ip_set *set = ip_set_rcu_get(net, index);
+	int ret;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension)
+		return 0;
+	if (opt->family != set->family && set->family != NFPROTO_UNSPEC)
+		return 0;
+
+	read_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
+	read_unlock_bh(&set->lock);
+
+	if (ret == -EAGAIN) {
+		/* Type requests element to be completed */
+		pr_debug("element must be competed, ADD is triggered\n");
+		write_lock_bh(&set->lock);
+		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+		write_unlock_bh(&set->lock);
+		return 1;
+	}
+
+	/* --return-nomatch: invert matched element */
+	if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) &&
+	    (set->type->features & IPSET_TYPE_NOMATCH) &&
+	    (ret > 0 || ret == -ENOTEMPTY))
+		ret = -ret;
+
+	/* Convert error codes to nomatch */
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+/* Add the element described by @skb to the set at @index, from the
+ * packet path. Returns -IPSET_ERR_TYPE_MISMATCH when the caller
+ * supplies too few dimensions or a different family, otherwise the
+ * result of the type's kadt() run under the set's write lock.
+ */
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+	struct net *net = dev_net(par->in ? par->in : par->out);
+	struct ip_set *set = ip_set_rcu_get(net, index);
+	int ret;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension)
+		return -IPSET_ERR_TYPE_MISMATCH;
+	if (opt->family != set->family && set->family != NFPROTO_UNSPEC)
+		return -IPSET_ERR_TYPE_MISMATCH;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+/* Delete the element described by @skb from the set at @index, from
+ * the packet path. Returns -IPSET_ERR_TYPE_MISMATCH when the caller
+ * supplies too few dimensions or a different family, otherwise the
+ * result of the type's kadt() run under the set's write lock.
+ */
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+	struct net *net = dev_net(par->in ? par->in : par->out);
+	struct ip_set *set = ip_set_rcu_get(net, index);
+	int ret;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension)
+		return -IPSET_ERR_TYPE_MISMATCH;
+	if (opt->family != set->family && set->family != NFPROTO_UNSPEC)
+		return -IPSET_ERR_TYPE_MISMATCH;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * Returns the index of the set and stores the set in *set, or returns
+ * IPSET_INVALID_ID and leaves *set untouched when no set has that
+ * name.
+ */
+ip_set_id_t
+ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	ip_set_id_t found = IPSET_INVALID_ID;
+	struct ip_set *cur;
+	ip_set_id_t i;
+
+	rcu_read_lock();
+	for (i = 0; i < inst->ip_set_max; i++) {
+		cur = rcu_dereference(inst->ip_set_list)[i];
+		if (cur == NULL || !STREQ(cur->name, name))
+			continue;
+		__ip_set_get(cur);
+		found = i;
+		*set = cur;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * If the slot at the given index holds a set, decrement its
+ * reference count by 1; an empty slot is ignored. The caller shall
+ * not assume the index to be valid, after calling this function.
+ *
+ * The array lookup happens inside an RCU read side section.
+ */
+
+static inline void
+__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
+{
+	struct ip_set *cur;
+
+	rcu_read_lock();
+	cur = rcu_dereference(inst->ip_set_list)[index];
+	if (cur)
+		__ip_set_put(cur);
+	rcu_read_unlock();
+}
+
+/* Exported wrapper: drop one reference on the set at @index in the
+ * namespace @net. */
+void
+ip_set_put_byindex(struct net *net, ip_set_id_t index)
+{
+	__ip_set_put_byindex(ip_set_pernet(net), index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ * Both assumptions are enforced with BUG_ON(). The returned string
+ * is the set's own name buffer, not a copy.
+ */
+const char *
+ip_set_name_byindex(struct net *net, ip_set_id_t index)
+{
+	const struct ip_set *referenced = ip_set_rcu_get(net, index);
+
+	BUG_ON(!referenced);
+	BUG_ON(referenced->ref == 0);
+
+	/* Referenced, so it's safe */
+	return referenced->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to call by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ *
+ * Returns @index when a set lives there and a reference was taken,
+ * IPSET_INVALID_ID when @index is out of range or the slot is empty.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
+{
+	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	/* The array has ip_set_max slots, so the valid indices are
+	 * 0 .. ip_set_max - 1; index == ip_set_max would read one
+	 * element past the end. */
+	if (index >= inst->ip_set_max)
+		return IPSET_INVALID_ID;
+
+	nfnl_lock(NFNL_SUBSYS_IPSET);
+	set = ip_set(inst, index);
+	if (set)
+		__ip_set_get(set);
+	else
+		index = IPSET_INVALID_ID;
+	nfnl_unlock(NFNL_SUBSYS_IPSET);
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ * The nfnl mutex is used in the function.
+ *
+ * Nothing is done once the namespace state is flagged as deleted.
+ * NOTE(review): unlike ip_set_nfnl_get_byindex(), @index is not range
+ * checked here; presumably callers only pass an index obtained from
+ * the get function - confirm.
+ */
+void
+ip_set_nfnl_put(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
+ set = ip_set(inst, index);
+ if (set != NULL)
+ __ip_set_put(set);
+ }
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * The commands are serialized by the nfnl mutex.
+ */
+
+/* True when the request lacks IPSET_ATTR_PROTOCOL or announces a
+ * protocol version other than IPSET_PROTOCOL. */
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+	if (!tb[IPSET_ATTR_PROTOCOL])
+		return true;
+
+	return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+}
+
+/* Map the netlink header flags to the ipset "exist" flag: a request
+ * with NLM_F_EXCL yields 0, any other request IPSET_FLAG_EXIST. */
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+	if (nlh->nlmsg_flags & NLM_F_EXCL)
+		return 0;
+
+	return IPSET_FLAG_EXIST;
+}
+
+/* Begin an ipset netlink message in @skb.
+ *
+ * Writes the netlink header for @cmd in the ipset subsystem followed
+ * by an nfgenmsg with family NFPROTO_IPV4, version NFNETLINK_V0 and
+ * res_id 0. Returns the header, or NULL when nlmsg_put() could not
+ * fit it into @skb.
+ */
+static struct nlmsghdr *
+start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
+	  enum ipset_cmd cmd)
+{
+	struct nfgenmsg *hdr;
+	struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq,
+					 cmd | (NFNL_SUBSYS_IPSET << 8),
+					 sizeof(*hdr), flags);
+
+	if (!nlh)
+		return NULL;
+
+	hdr = nlmsg_data(nlh);
+	hdr->nfgen_family = NFPROTO_IPV4;
+	hdr->version = NFNETLINK_V0;
+	hdr->res_id = 0;
+
+	return nlh;
+}
+
+/* Create a set
+ *
+ * Attribute policy for the top level attributes read by
+ * ip_set_create(); presumably wired to the create command in the
+ * nfnetlink callback table, which is outside this chunk. Names are
+ * limited to IPSET_MAXNAMELEN - 1 characters. */
+
+static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1},
+ [IPSET_ATTR_REVISION] = { .type = NLA_U8 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, /* parsed later with the set type's create_policy */
+};
+
+/* Find a set by name; the nfnl mutex must be held.
+ *
+ * Returns the set and stores its index in *id. When no set has that
+ * name, NULL is returned and *id is IPSET_INVALID_ID.
+ */
+static struct ip_set *
+find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
+{
+	struct ip_set *cur;
+	ip_set_id_t i;
+
+	*id = IPSET_INVALID_ID;
+	for (i = 0; i < inst->ip_set_max; i++) {
+		cur = ip_set(inst, i);
+		if (cur == NULL || !STREQ(cur->name, name))
+			continue;
+		*id = i;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Find a set by name when the caller has no use for its index. */
+static inline struct ip_set *
+find_set(struct ip_set_net *inst, const char *name)
+{
+	ip_set_id_t unused;
+
+	return find_set_and_id(inst, name, &unused);
+}
+
+/* Find the first free slot for a new set called @name; the nfnl mutex
+ * must be held.
+ *
+ * Returns 0 with the first free index in *index. Returns -EEXIST and
+ * stores the clashing set in *set when a set with that name exists
+ * (*index may or may not have been set then). Returns
+ * -IPSET_ERR_MAX_SETS when every slot is occupied.
+ */
+static int
+find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
+	     struct ip_set **set)
+{
+	struct ip_set *cur;
+	ip_set_id_t i;
+
+	*index = IPSET_INVALID_ID;
+	for (i = 0; i < inst->ip_set_max; i++) {
+		cur = ip_set(inst, i);
+		if (cur != NULL) {
+			if (STREQ(name, cur->name)) {
+				/* Name clash */
+				*set = cur;
+				return -EEXIST;
+			}
+			continue;
+		}
+		/* Remember only the first hole, keep scanning for clashes */
+		if (*index == IPSET_INVALID_ID)
+			*index = i;
+	}
+
+	/* No free slot remained? */
+	return *index == IPSET_INVALID_ID ? -IPSET_ERR_MAX_SETS : 0;
+}
+
+/* Netlink handler for commands that are not supported: ignores all
+ * arguments and always answers -EOPNOTSUPP. */
+static int
+ip_set_none(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	return -EOPNOTSUPP;
+}
+
+static int
+ip_set_create(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{ /* Netlink handler: build a new set from the request and publish it in the set array */
+ struct net *net = sock_net(ctnl);
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set *set, *clash = NULL;
+ ip_set_id_t index = IPSET_INVALID_ID;
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; /* stays all NULL when no IPSET_ATTR_DATA was sent */
+ const char *name, *typename;
+ u8 family, revision;
+ u32 flags = flag_exist(nlh);
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_REVISION] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA]))))
+ return -IPSET_ERR_PROTOCOL;
+
+ name = nla_data(attr[IPSET_ATTR_SETNAME]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
+ pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
+ name, typename, family_name(family), revision);
+
+ /*
+ * First, and without any locks, allocate and initialize
+ * a normal base set structure.
+ */
+ set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ if (!set)
+ return -ENOMEM;
+ rwlock_init(&set->lock);
+ strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ set->family = family;
+ set->revision = revision;
+
+ /*
+ * Next, check that we know the type, and take
+ * a reference on the type, to make sure it stays available
+ * while constructing our new set.
+ *
+ * After referencing the type, we try to create the type
+ * specific part of the set without holding any locks.
+ */
+ ret = find_set_type_get(typename, family, revision, &(set->type));
+ if (ret)
+ goto out; /* no module reference is held on this path */
+
+ /*
+ * Without holding any locks, create private part.
+ */
+ if (attr[IPSET_ATTR_DATA] &&
+ nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+ set->type->create_policy)) {
+ ret = -IPSET_ERR_PROTOCOL;
+ goto put_out;
+ }
+
+ ret = set->type->create(net, set, tb, flags);
+ if (ret != 0)
+ goto put_out; /* nothing type specific to destroy yet */
+
+ /* BTW, ret==0 here. */
+
+ /*
+ * Here, we have a valid, constructed set and we are protected
+ * by the nfnl mutex. Find the first free index in ip_set_list
+ * and check clashing.
+ */
+ ret = find_free_id(inst, set->name, &index, &clash);
+ if (ret == -EEXIST) {
+ /* If this is the same set and requested, ignore error */
+ if ((flags & IPSET_FLAG_EXIST) &&
+ STREQ(set->type->name, clash->type->name) &&
+ set->type->family == clash->type->family &&
+ set->type->revision_min == clash->type->revision_min &&
+ set->type->revision_max == clash->type->revision_max &&
+ set->variant->same_set(set, clash))
+ ret = 0;
+ goto cleanup; /* the freshly built set is discarded either way */
+ } else if (ret == -IPSET_ERR_MAX_SETS) {
+ struct ip_set **list, **tmp;
+ ip_set_id_t i = inst->ip_set_max + IP_SET_INC;
+
+ if (i < inst->ip_set_max || i == IPSET_INVALID_ID)
+ /* Wraparound */
+ goto cleanup;
+
+ list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+ if (!list)
+ goto cleanup; /* NOTE(review): ret is still -IPSET_ERR_MAX_SETS here, not -ENOMEM */
+ /* nfnl mutex is held, both lists are valid */
+ tmp = ip_set_dereference(inst->ip_set_list);
+ memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
+ rcu_assign_pointer(inst->ip_set_list, list);
+ /* Make sure all current packets have passed through */
+ synchronize_net();
+ /* Use new list */
+ index = inst->ip_set_max; /* first slot of the newly added region */
+ inst->ip_set_max = i;
+ kfree(tmp);
+ ret = 0;
+ } else if (ret)
+ goto cleanup;
+
+ /*
+ * Finally! Add our shiny new set to the list, and be done.
+ */
+ pr_debug("create: '%s' created with index %u!\n", set->name, index);
+ ip_set(inst, index) = set;
+
+ return ret;
+
+cleanup:
+ set->variant->destroy(set); /* undo what the type's create() built */
+put_out:
+ module_put(set->type->me); /* drop the reference taken by find_set_type_get() */
+out:
+ kfree(set);
+ return ret;
+}
+
+/* Destroy sets
+ *
+ * Policy for requests that carry the protocol version and an optional
+ * set name. Besides its presumed use for the destroy and flush
+ * commands it is also the policy dump_init() passes to nla_parse(). */
+
+static const struct nla_policy
+ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+/* Destroy the set at @index: empty its slot first, then let the type
+ * free its private data, drop the module reference of the type and
+ * free the base structure. The slot must hold a set, and the caller
+ * must have made sure the set is not referenced.
+ */
+static void
+ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
+{
+	struct ip_set *victim = ip_set(inst, index);
+
+	pr_debug("set: %s\n", victim->name);
+	ip_set(inst, index) = NULL;
+
+	/* Must call it without holding any lock */
+	victim->variant->destroy(victim);
+	module_put(victim->type->me);
+	kfree(victim);
+}
+
+static int
+ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{ /* Netlink handler: destroy one named set, or every set when no name is given */
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *s;
+ ip_set_id_t i;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ /* Commands are serialized and references are
+ * protected by the ip_set_ref_lock.
+ * External systems (i.e. xt_set) must call
+ * ip_set_put|get_nfnl_* functions, that way we
+ * can safely check references here.
+ *
+ * list:set timer can only decrement the reference
+ * counter, so if it's already zero, we can proceed
+ * without holding the lock.
+ *
+ * The read lock is dropped before anything is destroyed:
+ * the success paths unlock inline and return 0 directly,
+ * only the error paths leave through the "out" label.
+ */
+ read_lock_bh(&ip_set_ref_lock);
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < inst->ip_set_max; i++) { /* pass 1: refuse if any set is referenced */
+ s = ip_set(inst, i);
+ if (s != NULL && s->ref) {
+ ret = -IPSET_ERR_BUSY;
+ goto out;
+ }
+ }
+ read_unlock_bh(&ip_set_ref_lock);
+ for (i = 0; i < inst->ip_set_max; i++) { /* pass 2: destroy all, lock no longer held */
+ s = ip_set(inst, i);
+ if (s != NULL)
+ ip_set_destroy_set(inst, i);
+ }
+ } else {
+ s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+ &i);
+ if (s == NULL) {
+ ret = -ENOENT;
+ goto out;
+ } else if (s->ref) {
+ ret = -IPSET_ERR_BUSY;
+ goto out;
+ }
+ read_unlock_bh(&ip_set_ref_lock);
+
+ ip_set_destroy_set(inst, i);
+ }
+ return 0;
+out:
+ read_unlock_bh(&ip_set_ref_lock);
+ return ret;
+}
+
+/* Flush sets */
+
+/* Remove every element from @set by calling the type's flush()
+ * under the set's write lock. */
+static void ip_set_flush_set(struct ip_set *set)
+{
+	pr_debug("set: %s\n", set->name);
+
+	write_lock_bh(&set->lock);
+	set->variant->flush(set);
+	write_unlock_bh(&set->lock);
+}
+
+/* Netlink handler: flush the named set, or every existing set when
+ * the request carries no set name. Returns -IPSET_ERR_PROTOCOL on a
+ * protocol version mismatch, -ENOENT when the named set does not
+ * exist, 0 otherwise.
+ */
+static int
+ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+	struct ip_set *cur;
+	ip_set_id_t i;
+
+	if (unlikely(protocol_failed(attr)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (attr[IPSET_ATTR_SETNAME]) {
+		cur = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+		if (cur == NULL)
+			return -ENOENT;
+
+		ip_set_flush_set(cur);
+		return 0;
+	}
+
+	for (i = 0; i < inst->ip_set_max; i++) {
+		cur = ip_set(inst, i);
+		if (cur != NULL)
+			ip_set_flush_set(cur);
+	}
+
+	return 0;
+}
+
+/* Rename a set
+ *
+ * Policy for requests that carry two set names; ip_set_rename() and
+ * ip_set_swap() both read IPSET_ATTR_SETNAME and IPSET_ATTR_SETNAME2. */
+
+static const struct nla_policy
+ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+/* Netlink handler: rename the set IPSET_ATTR_SETNAME to
+ * IPSET_ATTR_SETNAME2.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for a malformed request, -ENOENT when
+ * the set does not exist, -IPSET_ERR_REFERENCED when it is referenced
+ * and -IPSET_ERR_EXIST_SETNAME2 when the new name is already taken.
+ * NOTE(review): the name is rewritten while ip_set_ref_lock is held
+ * for reading only; confirm that no concurrent reader of set->name
+ * relies on that lock.
+ */
+static int
+ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+	struct ip_set *set, *other;
+	const char *new_name;
+	ip_set_id_t i;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	read_lock_bh(&ip_set_ref_lock);
+	if (set->ref != 0) {
+		ret = -IPSET_ERR_REFERENCED;
+		goto unlock;
+	}
+
+	new_name = nla_data(attr[IPSET_ATTR_SETNAME2]);
+	for (i = 0; i < inst->ip_set_max; i++) {
+		other = ip_set(inst, i);
+		if (other == NULL || !STREQ(other->name, new_name))
+			continue;
+		ret = -IPSET_ERR_EXIST_SETNAME2;
+		goto unlock;
+	}
+	strncpy(set->name, new_name, IPSET_MAXNAMELEN);
+
+unlock:
+	read_unlock_bh(&ip_set_ref_lock);
+	return ret;
+}
+
+/* Swap two sets so that name/index points to the other.
+ * References and set names are also swapped.
+ *
+ * The commands are serialized by the nfnl mutex and references are
+ * protected by the ip_set_ref_lock. The kernel interfaces
+ * do not hold the mutex but the pointer settings are atomic
+ * so the ip_set_list always contains valid pointers to the sets.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for a malformed request, -ENOENT when
+ * the first set is missing, -IPSET_ERR_EXIST_SETNAME2 when the second
+ * set is missing, -IPSET_ERR_TYPE_MISMATCH when the type features or
+ * the families differ, and 0 on success.
+ */
+
+static int
+ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *from, *to;
+ ip_set_id_t from_id, to_id;
+ char from_name[IPSET_MAXNAMELEN];
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+ &from_id);
+ if (from == NULL)
+ return -ENOENT;
+
+ to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
+ &to_id);
+ if (to == NULL)
+ return -IPSET_ERR_EXIST_SETNAME2;
+
+ /* Features must not change.
+ * Not an artificial restriction anymore, as we must prevent
+ * possible loops created by swapping in setlist type of sets. */
+ if (!(from->type->features == to->type->features &&
+ from->family == to->family))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ strncpy(from_name, from->name, IPSET_MAXNAMELEN); /* names are exchanged before the ref lock is taken */
+ strncpy(from->name, to->name, IPSET_MAXNAMELEN);
+ strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+
+ write_lock_bh(&ip_set_ref_lock);
+ swap(from->ref, to->ref); /* reference counts stay with the index, not with the set */
+ ip_set(inst, from_id) = to;
+ ip_set(inst, to_id) = from;
+ write_unlock_bh(&ip_set_ref_lock);
+
+ return 0;
+}
+
+/* List/save set data
+ *
+ * cb->args[IPSET_CB_DUMP] packs two values: the low 16 bits hold one
+ * of the DUMP_* modes below, the high 16 bits hold the flags taken
+ * from IPSET_ATTR_FLAGS in dump_init(). DUMP_INIT is 0, so a callback
+ * argument that is still zero means dump_init() has not run yet.
+ */
+
+#define DUMP_INIT 0
+#define DUMP_ALL 1
+#define DUMP_ONE 2
+#define DUMP_LAST 3
+
+#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF)
+#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16)
+
+/* Completion callback of a dump. If cb->args[IPSET_CB_ARG0] is still
+ * non-zero, the set at cb->args[IPSET_CB_INDEX] is released with
+ * __ip_set_put_byindex(). Always returns 0.
+ */
+static int
+ip_set_dump_done(struct netlink_callback *cb)
+{
+	struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
+	ip_set_id_t index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+
+	if (cb->args[IPSET_CB_ARG0]) {
+		pr_debug("release set %s\n", ip_set(inst, index)->name);
+		__ip_set_put_byindex(inst, index);
+	}
+	return 0;
+}
+
+/* Debug helper: log type and length of every attribute that follows
+ * the nfgenmsg header in @nlh. */
+static inline void
+dump_attrs(struct nlmsghdr *nlh)
+{
+	const struct nlattr *cur;
+	int remaining;
+
+	pr_debug("dump nlmsg\n");
+	nlmsg_for_each_attr(cur, nlh, sizeof(struct nfgenmsg), remaining) {
+		pr_debug("type: %u, len %u\n", nla_type(cur), cur->nla_len);
+	}
+}
+
+/* Prepare the callback state for a dump.
+ *
+ * Re-parses the request stored in cb->skb and fills cb->args:
+ *   [IPSET_CB_NET]:   the namespace state @inst
+ *   [IPSET_CB_DUMP]:  DUMP_ONE or DUMP_ALL in the low 16 bits, the
+ *                     value of IPSET_ATTR_FLAGS shifted into the bits
+ *                     above
+ *   [IPSET_CB_INDEX]: index of the named set (DUMP_ONE only)
+ *   [IPSET_CB_ARG0]:  type specific, not touched here
+ * Returns 0, or -ENOENT when a set name is given but unknown.
+ */
+static int
+dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
+	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+	struct nlattr *first = (void *)nlh + min_len;
+	struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
+	u32 dump_type = DUMP_ALL;
+	ip_set_id_t index;
+
+	/* Second pass, so parser can't fail */
+	nla_parse(cda, IPSET_ATTR_CMD_MAX,
+		  first, nlh->nlmsg_len - min_len, ip_set_setname_policy);
+
+	if (cda[IPSET_ATTR_SETNAME]) {
+		if (!find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
+				     &index))
+			return -ENOENT;
+
+		dump_type = DUMP_ONE;
+		cb->args[IPSET_CB_INDEX] = index;
+	}
+
+	/* NOTE(review): ip_set_setname_policy has no entry for
+	 * IPSET_ATTR_FLAGS, so the length of this attribute does not
+	 * appear to be validated before 32 bits are read - confirm. */
+	if (cda[IPSET_ATTR_FLAGS])
+		dump_type |= ip_set_get_h32(cda[IPSET_ATTR_FLAGS]) << 16;
+
+	cb->args[IPSET_CB_NET] = (unsigned long)inst;
+	cb->args[IPSET_CB_DUMP] = dump_type;
+
+	return 0;
+}
+
+/* Netlink dump callback: write the next IPSET_CMD_LIST message into @skb.
+ *
+ * The progress of the dump is kept in cb->args[] (see dump_init()); every
+ * path after start_msg() leaves the loop, so at most one message is started
+ * per invocation.
+ *
+ * Returns a negative error code, or skb->len otherwise.
+ */
+static int
+ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ ip_set_id_t index = IPSET_INVALID_ID, max;
+ struct ip_set *set = NULL;
+ struct nlmsghdr *nlh = NULL;
+ unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
+ u32 dump_type, dump_flags;
+ int ret = 0;
+
+ /* State is still DUMP_INIT (0): this is the first invocation */
+ if (!cb->args[IPSET_CB_DUMP]) {
+ ret = dump_init(cb, inst);
+ if (ret < 0) {
+ nlh = nlmsg_hdr(cb->skb);
+ /* We have to create and send the error message
+ * manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(cb->skb, nlh, ret);
+ return ret;
+ }
+ }
+
+ /* Index beyond the last slot: nothing (more) to dump */
+ if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
+ goto out;
+
+ dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]);
+ dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]);
+ max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1
+ : inst->ip_set_max;
+dump_last:
+ pr_debug("dump type, flag: %u %u index: %ld\n",
+ dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
+ for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
+ index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+ set = ip_set(inst, index);
+ if (set == NULL) {
+ if (dump_type == DUMP_ONE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ continue;
+ }
+ /* When dumping all sets, we must dump "sorted"
+ * so that lists (unions of sets) are dumped last.
+ */
+ /* I.e. the DUMP_ALL pass skips sets with IPSET_DUMP_LAST and
+ * the DUMP_LAST pass skips sets without it.
+ */
+ if (dump_type != DUMP_ONE &&
+ ((dump_type == DUMP_ALL) ==
+ !!(set->type->features & IPSET_DUMP_LAST)))
+ continue;
+ pr_debug("List set: %s\n", set->name);
+ if (!cb->args[IPSET_CB_ARG0]) {
+ /* Start listing: make sure set won't be destroyed */
+ pr_debug("reference set\n");
+ __ip_set_get(set);
+ }
+ nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ IPSET_CMD_LIST);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto release_refcount;
+ }
+ if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
+ goto nla_put_failure;
+ /* Only the set names were requested */
+ if (dump_flags & IPSET_FLAG_LIST_SETNAME)
+ goto next_set;
+ switch (cb->args[IPSET_CB_ARG0]) {
+ case 0:
+ /* Core header data */
+ if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
+ set->type->name) ||
+ nla_put_u8(skb, IPSET_ATTR_FAMILY,
+ set->family) ||
+ nla_put_u8(skb, IPSET_ATTR_REVISION,
+ set->revision))
+ goto nla_put_failure;
+ ret = set->variant->head(set, skb);
+ if (ret < 0)
+ goto release_refcount;
+ if (dump_flags & IPSET_FLAG_LIST_HEADER)
+ goto next_set;
+ /* Fall through and add elements */
+ default:
+ /* NOTE(review): ->list() presumably leaves a non-zero
+ * cb->args[IPSET_CB_ARG0] when the set did not fit
+ * into this message - confirm against the set types.
+ */
+ read_lock_bh(&set->lock);
+ ret = set->variant->list(set, skb, cb);
+ read_unlock_bh(&set->lock);
+ if (!cb->args[IPSET_CB_ARG0])
+ /* Set is done, proceed with next one */
+ goto next_set;
+ goto release_refcount;
+ }
+ }
+ /* If we dump all sets, continue with dumping last ones */
+ if (dump_type == DUMP_ALL) {
+ dump_type = DUMP_LAST;
+ cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
+ cb->args[IPSET_CB_INDEX] = 0;
+ goto dump_last;
+ }
+ goto out;
+
+nla_put_failure:
+ ret = -EFAULT;
+next_set:
+ /* An index of IPSET_INVALID_ID ends a single-set dump on the next call */
+ if (dump_type == DUMP_ONE)
+ cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID;
+ else
+ cb->args[IPSET_CB_INDEX]++;
+release_refcount:
+ /* If there was an error or set is done, release set */
+ if (ret || !cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n", ip_set(inst, index)->name);
+ __ip_set_put_byindex(inst, index);
+ cb->args[IPSET_CB_ARG0] = 0;
+ }
+out:
+ if (nlh) {
+ nlmsg_end(skb, nlh);
+ pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
+ dump_attrs(nlh);
+ }
+
+ return ret < 0 ? ret : skb->len;
+}
+
+/* Handler of the LIST and SAVE commands: check the protocol attribute and
+ * start a netlink dump driven by ip_set_dump_start()/ip_set_dump_done().
+ */
+static int
+ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct netlink_dump_control ctrl = {
+		.dump = ip_set_dump_start,
+		.done = ip_set_dump_done,
+	};
+
+	if (unlikely(protocol_failed(attr)))
+		return -IPSET_ERR_PROTOCOL;
+
+	return netlink_dump_start(ctnl, skb, nlh, &ctrl);
+}
+
+/* Add, del and test */
+
+/* Attribute policy of the ADD, DEL and TEST commands: a single element is
+ * carried in the nested IPSET_ATTR_DATA, a batch in the nested
+ * IPSET_ATTR_ADT together with IPSET_ATTR_LINENO.
+ */
+static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ADT] = { .type = NLA_NESTED },
+};
+
+/* Execute one add/del operation (@adt) on @set with the parsed element
+ * attributes in @tb.
+ *
+ * The type's uadt() method runs under the write lock of the set. When it
+ * returns -EAGAIN and the variant has a resize() method, the set is resized
+ * and the operation retried.
+ *
+ * Returns 0 on success, and also when the result is -IPSET_ERR_EXIST while
+ * IPSET_FLAG_EXIST is set in @flags. If uadt() reported a line number and
+ * @use_lineno is true, an NLMSG_ERROR reply carrying that line number is
+ * sent to the requester and -EINTR is returned (or -ENOMEM if the reply
+ * cannot be allocated). Otherwise the error code is returned as is.
+ */
+static int
+call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
+ struct nlattr *tb[], enum ipset_adt adt,
+ u32 flags, bool use_lineno)
+{
+ int ret;
+ u32 lineno = 0;
+ bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
+
+ do {
+ write_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
+ write_unlock_bh(&set->lock);
+ retried = true;
+ } while (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried)) == 0);
+
+ if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
+ return 0;
+ if (lineno && use_lineno) {
+ /* Error in restore/batch mode: send back lineno */
+ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
+ struct sk_buff *skb2;
+ struct nlmsgerr *errmsg;
+ size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cmdattr;
+ u32 *errline;
+
+ skb2 = nlmsg_new(payload, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+ rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ errmsg = nlmsg_data(rep);
+ errmsg->error = ret;
+ /* The error message embeds a copy of the original request */
+ memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ cmdattr = (void *)&errmsg->msg + min_len;
+
+ /* Parse the copy, so that the LINENO attribute found is the
+ * one inside the reply.
+ */
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ cmdattr, nlh->nlmsg_len - min_len,
+ ip_set_adt_policy);
+
+ errline = nla_data(cda[IPSET_ATTR_LINENO]);
+
+ /* Overwrite the line number in place in the reply */
+ *errline = lineno;
+
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ /* Signal netlink not to send its ACK/errmsg. */
+ return -EINTR;
+ }
+
+ return ret;
+}
+
+/* Handler of IPSET_CMD_ADD: add one element (IPSET_ATTR_DATA) or a batch
+ * of elements (IPSET_ATTR_ADT) to the set named in IPSET_ATTR_SETNAME.
+ *
+ * Returns -IPSET_ERR_PROTOCOL on a malformed request, -ENOENT if the set
+ * does not exist, otherwise the result of call_ad(); a batch stops at the
+ * first element for which call_ad() fails.
+ */
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ /* Exactly one of DATA and ADT must be present and nested; ADT
+ * additionally requires LINENO.
+ */
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ /* Start every element from a clean attribute table */
+ memset(tb, 0, sizeof(tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+/* Handler of IPSET_CMD_DEL: delete one element (IPSET_ATTR_DATA) or a batch
+ * of elements (IPSET_ATTR_ADT) from the set named in IPSET_ATTR_SETNAME.
+ *
+ * Returns -IPSET_ERR_PROTOCOL on a malformed request, -ENOENT if the set
+ * does not exist, otherwise the result of call_ad(); a batch stops at the
+ * first element for which call_ad() fails.
+ */
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	const struct nlattr *nla;
+	u32 flags = flag_exist(nlh);
+	bool use_lineno;
+	int ret = 0;
+
+	/* Exactly one of DATA and ADT must be present and nested; ADT
+	 * additionally requires LINENO.
+	 */
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !((attr[IPSET_ATTR_DATA] != NULL) ^
+		       (attr[IPSET_ATTR_ADT] != NULL)) ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
+		     (attr[IPSET_ATTR_ADT] != NULL &&
+		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+		       attr[IPSET_ATTR_LINENO] == NULL))))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	use_lineno = !!attr[IPSET_ATTR_LINENO];
+	if (attr[IPSET_ATTR_DATA]) {
+		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+				     attr[IPSET_ATTR_DATA],
+				     set->type->adt_policy))
+			return -IPSET_ERR_PROTOCOL;
+		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+			      use_lineno);
+	} else {
+		int nla_rem;
+
+		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+			/* Clear the whole attribute table, not just its
+			 * first slot, exactly as ip_set_uadd() does.
+			 */
+			memset(tb, 0, sizeof(tb));
+			if (nla_type(nla) != IPSET_ATTR_DATA ||
+			    !flag_nested(nla) ||
+			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+					     set->type->adt_policy))
+				return -IPSET_ERR_PROTOCOL;
+			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+				      flags, use_lineno);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+/* Handler of IPSET_CMD_TEST: test whether the element described by the
+ * nested IPSET_ATTR_DATA is in the set named in IPSET_ATTR_SETNAME.
+ *
+ * Returns 0 if the element is in the set, -IPSET_ERR_EXIST if it is not,
+ * -IPSET_ERR_PROTOCOL on a malformed request, -ENOENT if the set does not
+ * exist, or the error code reported by the type's uadt() method.
+ */
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_DATA] == NULL ||
+		     !flag_nested(attr[IPSET_ATTR_DATA])))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+			     set->type->adt_policy))
+		return -IPSET_ERR_PROTOCOL;
+
+	read_lock_bh(&set->lock);
+	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
+	read_unlock_bh(&set->lock);
+	/* Userspace can't trigger element to be re-added */
+	if (ret == -EAGAIN)
+		ret = 1;
+
+	/* Do not hide real errors (e.g. a malformed element) behind
+	 * "element is not in the set".
+	 * NOTE(review): -ENOTEMPTY is assumed to be the "matched, but flagged
+	 * nomatch" result of the set types and is therefore still reported
+	 * as not-in-set - confirm against the type implementations.
+	 */
+	if (ret < 0 && ret != -ENOTEMPTY)
+		return ret;
+
+	return ret > 0 ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Get header data of a set */
+
+/* Handler of IPSET_CMD_HEADER: reply to the requester with the name, type
+ * name, family and revision of the set named in IPSET_ATTR_SETNAME.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on a malformed request, -ENOENT
+ * if the set does not exist, -ENOMEM or -EMSGSIZE if the reply cannot be
+ * built, or the negative result of netlink_unicast().
+ */
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+	const struct ip_set *set;
+	struct sk_buff *reply;
+	struct nlmsghdr *reply_nlh;
+	int err;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	reply_nlh = start_msg(reply, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+			      IPSET_CMD_HEADER);
+	if (!reply_nlh)
+		goto free_reply;
+	if (nla_put_u8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+	    nla_put_string(reply, IPSET_ATTR_SETNAME, set->name) ||
+	    nla_put_string(reply, IPSET_ATTR_TYPENAME, set->type->name) ||
+	    nla_put_u8(reply, IPSET_ATTR_FAMILY, set->family) ||
+	    nla_put_u8(reply, IPSET_ATTR_REVISION, set->revision)) {
+		nlmsg_cancel(reply, reply_nlh);
+		goto free_reply;
+	}
+	nlmsg_end(reply, reply_nlh);
+
+	err = netlink_unicast(ctnl, reply, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
+	return err < 0 ? err : 0;
+
+free_reply:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Get type data */
+
+/* Attribute policy of IPSET_CMD_TYPE: a type name and a family */
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+};
+
+/* Handler of IPSET_CMD_TYPE: reply with the minimal and maximal revision
+ * found by find_set_type_minmax() for the given type name and family.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on a malformed request, the
+ * error of find_set_type_minmax(), -ENOMEM or -EMSGSIZE if the reply
+ * cannot be built, or the negative result of netlink_unicast().
+ */
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct sk_buff *reply;
+	struct nlmsghdr *reply_nlh;
+	u8 family, min, max;
+	const char *typename;
+	int err;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_TYPENAME] == NULL ||
+		     attr[IPSET_ATTR_FAMILY] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+	err = find_set_type_minmax(typename, family, &min, &max);
+	if (err)
+		return err;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	reply_nlh = start_msg(reply, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+			      IPSET_CMD_TYPE);
+	if (!reply_nlh)
+		goto free_reply;
+	if (nla_put_u8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+	    nla_put_string(reply, IPSET_ATTR_TYPENAME, typename) ||
+	    nla_put_u8(reply, IPSET_ATTR_FAMILY, family) ||
+	    nla_put_u8(reply, IPSET_ATTR_REVISION, max) ||
+	    nla_put_u8(reply, IPSET_ATTR_REVISION_MIN, min)) {
+		nlmsg_cancel(reply, reply_nlh);
+		goto free_reply;
+	}
+	nlmsg_end(reply, reply_nlh);
+
+	pr_debug("Send TYPE, nlmsg_len: %u\n", reply_nlh->nlmsg_len);
+	err = netlink_unicast(ctnl, reply, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
+	return err < 0 ? err : 0;
+
+free_reply:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+/* Attribute policy of IPSET_CMD_PROTOCOL: only the protocol attribute */
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+};
+
+/* Handler of IPSET_CMD_PROTOCOL: reply with IPSET_PROTOCOL, the protocol
+ * version of this implementation.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL if the request carries no
+ * protocol attribute, -ENOMEM or -EMSGSIZE if the reply cannot be built,
+ * or the negative result of netlink_unicast().
+ */
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh,
+		const struct nlattr * const attr[])
+{
+	struct sk_buff *reply;
+	struct nlmsghdr *reply_nlh;
+	int err;
+
+	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	reply_nlh = start_msg(reply, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+			      IPSET_CMD_PROTOCOL);
+	if (!reply_nlh)
+		goto free_reply;
+	if (nla_put_u8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) {
+		nlmsg_cancel(reply, reply_nlh);
+		goto free_reply;
+	}
+	nlmsg_end(reply, reply_nlh);
+
+	err = netlink_unicast(ctnl, reply, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
+	return err < 0 ? err : 0;
+
+free_reply:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Dispatch table of the ipset netlink commands: handler, number of
+ * attributes and attribute policy per command. LIST and SAVE share the
+ * same handler, ip_set_dump().
+ */
+static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+ [IPSET_CMD_NONE] = {
+ .call = ip_set_none,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ },
+ [IPSET_CMD_CREATE] = {
+ .call = ip_set_create,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_create_policy,
+ },
+ [IPSET_CMD_DESTROY] = {
+ .call = ip_set_destroy,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_FLUSH] = {
+ .call = ip_set_flush,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_RENAME] = {
+ .call = ip_set_rename,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_SWAP] = {
+ .call = ip_set_swap,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_LIST] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_SAVE] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_ADD] = {
+ .call = ip_set_uadd,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_DEL] = {
+ .call = ip_set_udel,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_TEST] = {
+ .call = ip_set_utest,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_HEADER] = {
+ .call = ip_set_header,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_TYPE] = {
+ .call = ip_set_type,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_type_policy,
+ },
+ [IPSET_CMD_PROTOCOL] = {
+ .call = ip_set_protocol,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_protocol_policy,
+ },
+};
+
+/* The nfnetlink subsystem descriptor registered by ip_set_init() */
+static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
+ .name = "ip_set",
+ .subsys_id = NFNL_SUBSYS_IPSET,
+ .cb_count = IPSET_MSG_MAX,
+ .cb = ip_set_netlink_subsys_cb,
+};
+
+/* Interface to iptables/ip6tables */
+
+/* getsockopt(SO_IP_SET) handler: version query and set name <-> index
+ * lookups for the iptables/ip6tables userspace tools.
+ *
+ * The request of *len bytes at @user is copied into the kernel, processed
+ * according to its leading operation code and the (modified) request is
+ * copied back to @user.
+ *
+ * Returns 0 on success, -EPERM without CAP_NET_ADMIN in the user namespace
+ * of the net, -EBADF for a foreign option, -EINVAL for a wrongly sized
+ * request or an out of range index, -ENOMEM, -EFAULT when copying from or
+ * to userspace fails, -EPROTO on a protocol version mismatch and -EBADMSG
+ * for an unknown operation.
+ */
+static int
+ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
+{
+	unsigned int *op;
+	void *data;
+	int copylen = *len, ret = 0;
+	struct net *net = sock_net(sk);
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+	if (optval != SO_IP_SET)
+		return -EBADF;
+	if (*len < sizeof(unsigned int))
+		return -EINVAL;
+
+	data = vmalloc(*len);
+	if (!data)
+		return -ENOMEM;
+	if (copy_from_user(data, user, *len) != 0) {
+		ret = -EFAULT;
+		goto done;
+	}
+	op = (unsigned int *) data;
+
+	if (*op < IP_SET_OP_VERSION) {
+		/* Check the version at the beginning of operations */
+		struct ip_set_req_version *req_version = data;
+
+		/* Only sizeof(unsigned int) bytes are guaranteed so far:
+		 * make sure the version field was really copied in before
+		 * reading it.
+		 */
+		if (*len < sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		if (req_version->version != IPSET_PROTOCOL) {
+			ret = -EPROTO;
+			goto done;
+		}
+	}
+
+	switch (*op) {
+	case IP_SET_OP_VERSION: {
+		struct ip_set_req_version *req_version = data;
+
+		if (*len != sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+
+		req_version->version = IPSET_PROTOCOL;
+		/* copy_to_user() returns the number of bytes left */
+		if (copy_to_user(user, req_version,
+				 sizeof(struct ip_set_req_version)))
+			ret = -EFAULT;
+		goto done;
+	}
+	case IP_SET_OP_GET_BYNAME: {
+		struct ip_set_req_get_set *req_get = data;
+		ip_set_id_t id;
+
+		if (*len != sizeof(struct ip_set_req_get_set)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		/* Never trust userspace to terminate the name */
+		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+		nfnl_lock(NFNL_SUBSYS_IPSET);
+		find_set_and_id(inst, req_get->set.name, &id);
+		req_get->set.index = id;
+		nfnl_unlock(NFNL_SUBSYS_IPSET);
+		goto copy;
+	}
+	case IP_SET_OP_GET_FNAME: {
+		struct ip_set_req_get_set_family *req_get = data;
+		ip_set_id_t id;
+
+		if (*len != sizeof(struct ip_set_req_get_set_family)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		/* Never trust userspace to terminate the name */
+		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+		nfnl_lock(NFNL_SUBSYS_IPSET);
+		find_set_and_id(inst, req_get->set.name, &id);
+		req_get->set.index = id;
+		if (id != IPSET_INVALID_ID)
+			req_get->family = ip_set(inst, id)->family;
+		nfnl_unlock(NFNL_SUBSYS_IPSET);
+		goto copy;
+	}
+	case IP_SET_OP_GET_BYINDEX: {
+		struct ip_set_req_get_set *req_get = data;
+		struct ip_set *set;
+
+		if (*len != sizeof(struct ip_set_req_get_set) ||
+		    req_get->set.index >= inst->ip_set_max) {
+			ret = -EINVAL;
+			goto done;
+		}
+		nfnl_lock(NFNL_SUBSYS_IPSET);
+		set = ip_set(inst, req_get->set.index);
+		/* An unused index is answered with an empty name */
+		strncpy(req_get->set.name, set ? set->name : "",
+			IPSET_MAXNAMELEN);
+		nfnl_unlock(NFNL_SUBSYS_IPSET);
+		goto copy;
+	}
+	default:
+		ret = -EBADMSG;
+		goto done;
+	}	/* end of switch(op) */
+
+copy:
+	/* A partial copy must be reported, not silently turned into success */
+	if (copy_to_user(user, data, copylen))
+		ret = -EFAULT;
+
+done:
+	vfree(data);
+	return ret;
+}
+
+/* Socket option registration: only the "get" side is implemented, for the
+ * single option SO_IP_SET on PF_INET sockets.
+ */
+static struct nf_sockopt_ops so_set __read_mostly = {
+ .pf = PF_INET,
+ .get_optmin = SO_IP_SET,
+ .get_optmax = SO_IP_SET + 1,
+ .get = &ip_set_sockfn_get,
+ .owner = THIS_MODULE,
+};
+
+/* Per-net initialisation: size and allocate the zeroed array of set
+ * pointers of the namespace.
+ *
+ * The number of slots is max_sets if non-zero, CONFIG_IP_SET_MAX otherwise,
+ * and is kept below IPSET_INVALID_ID. Returns 0 or -ENOMEM.
+ */
+static int __net_init
+ip_set_net_init(struct net *net)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set **sets;
+
+	if (max_sets)
+		inst->ip_set_max = max_sets;
+	else
+		inst->ip_set_max = CONFIG_IP_SET_MAX;
+	if (inst->ip_set_max >= IPSET_INVALID_ID)
+		inst->ip_set_max = IPSET_INVALID_ID - 1;
+
+	sets = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
+	if (!sets)
+		return -ENOMEM;
+
+	inst->is_deleted = 0;
+	rcu_assign_pointer(inst->ip_set_list, sets);
+	return 0;
+}
+
+/* Per-net cleanup: mark the instance as deleted, destroy every set that
+ * still exists and free the array of set pointers.
+ */
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	ip_set_id_t id;
+
+	inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+
+	for (id = 0; id < inst->ip_set_max; id++) {
+		if (ip_set(inst, id) == NULL)
+			continue;
+		ip_set_destroy_set(inst, id);
+	}
+	kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+}
+
+/* Per network namespace hooks; .size requests a struct ip_set_net for
+ * every namespace, reachable through ip_set_net_id.
+ */
+static struct pernet_operations ip_set_net_ops = {
+ .init = ip_set_net_init,
+ .exit = ip_set_net_exit,
+ .id = &ip_set_net_id,
+ .size = sizeof(struct ip_set_net)
+};
+
+
+/* Module initialisation: register the nfnetlink subsystem, the socket
+ * option and the per-net operations, in this order. A failure unregisters
+ * what was registered before it, in reverse order, and is returned.
+ */
+static int __init
+ip_set_init(void)
+{
+	int err;
+
+	err = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+	if (err != 0) {
+		pr_err("ip_set: cannot register with nfnetlink.\n");
+		return err;
+	}
+
+	err = nf_register_sockopt(&so_set);
+	if (err != 0) {
+		pr_err("SO_SET registry failed: %d\n", err);
+		goto unregister_nfnl;
+	}
+
+	err = register_pernet_subsys(&ip_set_net_ops);
+	if (err) {
+		pr_err("ip_set: cannot register pernet_subsys.\n");
+		goto unregister_sockopt;
+	}
+
+	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
+	return 0;
+
+unregister_sockopt:
+	nf_unregister_sockopt(&so_set);
+unregister_nfnl:
+	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+	return err;
+}
+
+/* Module exit: unregister everything in the reverse order of
+ * ip_set_init().
+ */
+static void __exit
+ip_set_fini(void)
+{
+ unregister_pernet_subsys(&ip_set_net_ops);
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ pr_debug("these are the famous last words\n");
+}
+
+module_init(ip_set_init);
+module_exit(ip_set_fini);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
new file mode 100644
index 00000000000..29fb01ddff9
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -0,0 +1,174 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Get Layer-4 data from the packets */
+
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/sctp.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/export.h>
+
+/* We must handle non-linear skbs */
+/* Read the layer-4 "port" of @protocol from the header at offset @protooff.
+ *
+ * For TCP, SCTP, UDP and UDP-Lite *port is the source or the destination
+ * port, depending on @src. For ICMP and ICMPv6 it is (type << 8) | code,
+ * converted with htons(). For any other protocol *port is left untouched.
+ * *proto is set to @protocol whenever true is returned.
+ *
+ * Returns false only when the transport header cannot be read from @skb.
+ */
+static bool
+get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
+ bool src, __be16 *port, u8 *proto)
+{
+ switch (protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? th->source : th->dest;
+ break;
+ }
+ case IPPROTO_SCTP: {
+ sctp_sctphdr_t _sh;
+ const sctp_sctphdr_t *sh;
+
+ sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
+ if (sh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? sh->source : sh->dest;
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? uh->source : uh->dest;
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr _ich;
+ const struct icmphdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ /* ICMP has no ports: type and code are packed instead */
+ *port = (__force __be16)htons((ic->type << 8) | ic->code);
+ break;
+ }
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _ich;
+ const struct icmp6hdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ /* Same packing as for ICMP */
+ *port = (__force __be16)
+ htons((ic->icmp6_type << 8) | ic->icmp6_code);
+ break;
+ }
+ default:
+ /* Protocol without port information: only *proto is reported */
+ break;
+ }
+ *proto = protocol;
+
+ return true;
+}
+
+/* Get the layer-4 protocol and port of an IPv4 packet.
+ *
+ * Returns false if the protocol field is zero, if the packet is a fragment
+ * with non-zero offset of a protocol whose port is handled by get_port(),
+ * or if get_port() fails. For such fragments of any other protocol true is
+ * returned with only *proto set; *port is left untouched.
+ */
+bool
+ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned int protooff = ip_hdrlen(skb);
+ int protocol = iph->protocol;
+
+ /* See comments at tcp_match in ip_tables.c */
+ if (protocol <= 0)
+ return false;
+
+ if (ntohs(iph->frag_off) & IP_OFFSET)
+ switch (protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_ICMP:
+ /* Port info not available for fragment offset > 0 */
+ return false;
+ default:
+ /* Other protocols don't have ports,
+ so we can match fragments */
+ *proto = protocol;
+ return true;
+ }
+
+ return get_port(skb, protocol, protooff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+/* Get the layer-4 protocol and port of an IPv6 packet.
+ *
+ * The extension headers are skipped with ipv6_skip_exthdr(); false is
+ * returned if that fails, if the masked frag_off value is non-zero, or if
+ * get_port() fails.
+ * NOTE(review): the frag_off test presumably rejects fragments with a
+ * non-zero offset, like the IPv4 variant - confirm against
+ * ipv6_skip_exthdr().
+ */
+bool
+ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ int protoff;
+ u8 nexthdr;
+ __be16 frag_off = 0;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return false;
+
+ return get_port(skb, nexthdr, protoff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
+#endif
+
+/* Get the TCP or UDP port of a packet of family @pf.
+ *
+ * Returns true, with *port filled in, only for IPv4/IPv6 packets whose
+ * layer-4 protocol is TCP or UDP; false for any other family or protocol
+ * and when the port cannot be extracted.
+ */
+bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+	bool found;
+	u8 proto;
+
+	if (pf == NFPROTO_IPV4)
+		found = ip_set_get_ip4_port(skb, src, port, &proto);
+	else if (pf == NFPROTO_IPV6)
+		found = ip_set_get_ip6_port(skb, src, port, &proto);
+	else
+		return false;
+
+	if (!found)
+		return false;
+
+	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
new file mode 100644
index 00000000000..61c7fb05280
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -0,0 +1,1160 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _IP_SET_HASH_GEN_H
+#define _IP_SET_HASH_GEN_H
+
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#ifndef rcu_dereference_bh
+#define rcu_dereference_bh(p) rcu_dereference(p)
+#endif
+
+#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
+
+/* Hashing which uses arrays to resolve clashing. The hash table is resized
+ * (doubled) when searching becomes too long.
+ * Internally jhash is used with the assumption that the size of the
+ * stored data is a multiple of sizeof(u32). If storage supports timeout,
+ * the timeout field must be the last one in the data structure - that field
+ * is ignored when computing the hash key.
+ *
+ * Readers and resizing
+ *
+ * Resizing can be triggered by userspace command only, and those
+ * are serialized by the nfnl mutex. During resizing the set is
+ * read-locked, so the only possible concurrent operations are
+ * the kernel side readers. Those must be protected by proper RCU locking.
+ */
+
+/* Number of elements to store in an initial array block */
+#define AHASH_INIT_SIZE 4
+/* Max number of elements to store in an array block */
+#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE)
+
+/* Max number of elements can be tuned */
+#ifdef IP_SET_HASH_WITH_MULTI
+#define AHASH_MAX(h) ((h)->ahash_max)
+
+/* Compute the new limit of elements per array block.
+ *
+ * The current limit @curr is kept when @multi is below it; otherwise it is
+ * raised by AHASH_INIT_SIZE, unless that would exceed 64.
+ */
+static inline u8
+tune_ahash_max(u8 curr, u32 multi)
+{
+	u32 raised = curr + AHASH_INIT_SIZE;
+
+	if (multi < curr)
+		return curr;
+
+	/* Currently, at listing one hash bucket must fit into a message.
+	 * Therefore we have a hard limit here.
+	 */
+	if (raised > curr && raised <= 64)
+		return raised;
+
+	return curr;
+}
+#define TUNE_AHASH_MAX(h, multi) \
+ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
+#else
+#define AHASH_MAX(h) AHASH_MAX_SIZE
+#define TUNE_AHASH_MAX(h, multi)
+#endif
+
+/* A hash bucket: an array of elements, grown by hbucket_elem_add() in
+ * steps of AHASH_INIT_SIZE. The bucket is full when pos reaches size.
+ */
+struct hbucket {
+ void *value; /* the array of the values */
+ u8 size; /* size of the array */
+ u8 pos; /* position of the first free entry */
+};
+
+/* The hash table: the table size stored here in order to make resizing easy */
+/* The buckets follow the header directly; htable_size() computes the size
+ * of the whole allocation.
+ */
+struct htable {
+ u8 htable_bits; /* size of hash table == 2^htable_bits */
+ struct hbucket bucket[0]; /* hashtable buckets */
+};
+
+#define hbucket(h, i) (&((h)->bucket[i]))
+
+#ifndef IPSET_NET_COUNT
+#define IPSET_NET_COUNT 1
+#endif
+
+/* Book-keeping of the prefixes added to the set */
+/* Both arrays have IPSET_NET_COUNT slots, which defaults to 1 */
+struct net_prefixes {
+ u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */
+ u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */
+};
+
+/* Compute the number of bytes needed for a hash table of 2^hbits buckets:
+ * the struct htable header plus the buckets.
+ *
+ * Returns 0 when hbits is above 31 or the size does not fit into size_t.
+ */
+static size_t
+htable_size(u8 hbits)
+{
+	size_t nbuckets, max_buckets;
+
+	/* We must fit both into u32 in jhash and size_t */
+	if (hbits > 31)
+		return 0;
+
+	nbuckets = jhash_size(hbits);
+	max_buckets = (((size_t)-1) - sizeof(struct htable))
+		      / sizeof(struct hbucket);
+	if (max_buckets < nbuckets)
+		return 0;
+
+	return nbuckets * sizeof(struct hbucket) + sizeof(struct htable);
+}
+
+/* Compute htable_bits from the user supplied hashsize: the exponent when
+ * hashsize is a power of two, fls(hashsize) otherwise.
+ */
+static u8
+htable_bits(u32 hashsize)
+{
+	/* Assume that hashsize == 2^htable_bits */
+	u8 exact = fls(hashsize - 1);
+
+	if (jhash_size(exact) == hashsize)
+		return exact;
+
+	/* Round up to the first 2^n value */
+	return fls(hashsize);
+}
+
+/* Make sure bucket @n has room for one more element of @dsize bytes.
+ *
+ * A full bucket is grown by AHASH_INIT_SIZE elements with the old content
+ * copied over. Returns 0 if there is room, -EAGAIN if the bucket already
+ * holds @ahash_max elements (the caller is expected to rehash), or -ENOMEM.
+ */
+static int
+hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
+{
+	void *grown;
+
+	if (n->pos < n->size)
+		return 0;
+
+	if (n->size >= ahash_max)
+		/* Trigger rehashing */
+		return -EAGAIN;
+
+	grown = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC);
+	if (!grown)
+		return -ENOMEM;
+
+	if (n->size) {
+		memcpy(grown, n->value, n->size * dsize);
+		kfree(n->value);
+	}
+	n->value = grown;
+	n->size += AHASH_INIT_SIZE;
+
+	return 0;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+#if IPSET_NET_COUNT > 1
+#define __CIDR(cidr, i) (cidr[i])
+#else
+#define __CIDR(cidr, i) (cidr)
+#endif
+#ifdef IP_SET_HASH_WITH_NETS_PACKED
+/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */
+#define CIDR(cidr, i) (__CIDR(cidr, i) + 1)
+#else
+#define CIDR(cidr, i) (__CIDR(cidr, i))
+#endif
+
+#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
+
+#ifdef IP_SET_HASH_WITH_MULTI
+#define NLEN(family) (SET_HOST_MASK(family) + 1)
+#else
+#define NLEN(family) SET_HOST_MASK(family)
+#endif
+
+#else
+#define NLEN(family) 0
+#endif /* IP_SET_HASH_WITH_NETS */
+
+#endif /* _IP_SET_HASH_GEN_H */
+
+/* Family dependent templates */
+
+#undef ahash_data
+#undef mtype_data_equal
+#undef mtype_do_data_match
+#undef mtype_data_set_flags
+#undef mtype_data_reset_flags
+#undef mtype_data_netmask
+#undef mtype_data_list
+#undef mtype_data_next
+#undef mtype_elem
+
+#undef mtype_ahash_destroy
+#undef mtype_ext_cleanup
+#undef mtype_add_cidr
+#undef mtype_del_cidr
+#undef mtype_ahash_memsize
+#undef mtype_flush
+#undef mtype_destroy
+#undef mtype_gc_init
+#undef mtype_same_set
+#undef mtype_kadt
+#undef mtype_uadt
+#undef mtype
+
+#undef mtype_add
+#undef mtype_del
+#undef mtype_test_cidrs
+#undef mtype_test
+#undef mtype_expire
+#undef mtype_resize
+#undef mtype_head
+#undef mtype_list
+#undef mtype_gc
+#undef mtype_gc_init
+#undef mtype_variant
+#undef mtype_data_match
+
+#undef HKEY
+
+#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
+#ifdef IP_SET_HASH_WITH_NETS
+#define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match)
+#else
+#define mtype_do_data_match(d) 1
+#endif
+#define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags)
+#define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem)
+#define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags)
+#define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask)
+#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list)
+#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next)
+#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy)
+#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
+#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr)
+#define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr)
+#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize)
+#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
+#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
+#define mtype MTYPE
+
+#define mtype_add IPSET_TOKEN(MTYPE, _add)
+#define mtype_del IPSET_TOKEN(MTYPE, _del)
+#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
+#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
+#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
+#define mtype_head IPSET_TOKEN(MTYPE, _head)
+#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
+#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
+
+#ifndef HKEY_DATALEN /* default hashed length: the whole element, unless already defined */
+#define HKEY_DATALEN sizeof(struct mtype_elem)
+#endif /* HKEY_DATALEN */
+/* Bucket index of an element: jhash2() over the first HKEY_DATALEN bytes
+ * of data (read as u32 words) seeded with initval, masked with
+ * jhash_mask(htable_bits) */
+#define HKEY(data, initval, htable_bits) \
+(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \
+ & jhash_mask(htable_bits))
+
+#ifndef htype
+#define htype HTYPE
+
+/* The generic hash structure.
+ *
+ * Because of the "#ifndef htype" guard the structure is emitted only the
+ * first time this header is included by a type file; it is therefore laid
+ * out with the struct mtype_elem that is current at that first inclusion.
+ */
+struct htype {
+ struct htable __rcu *table; /* the hash table, replaced as a whole by mtype_resize() */
+ u32 maxelem; /* max elements in the hash */
+ u32 elements; /* number of stored elements; timed out entries are counted until they are removed */
+ u32 initval; /* random jhash init value */
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ u32 markmask; /* markmask value for mark mask to store */
+#endif
+ struct timer_list gc; /* garbage collection when timeout enabled */
+ struct mtype_elem next; /* temporary storage for uadd: filled by mtype_add() when the bucket add returns -EAGAIN */
+#ifdef IP_SET_HASH_WITH_MULTI
+ u8 ahash_max; /* max elements in an array block */
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask; /* netmask value for subnets to store */
+#endif
+#ifdef IP_SET_HASH_WITH_RBTREE
+ struct rb_root rbtree; /* released with rbtree_destroy() in mtype_destroy() */
+#endif
+#ifdef IP_SET_HASH_WITH_NETS
+ struct net_prefixes nets[0]; /* book-keeping of prefixes; trailing array, storage is allocated together with the structure */
+#endif
+};
+#endif /* htype */
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Network cidr size book keeping when the hash stores different
+ * sized networks.
+ *
+ * Account one more element with prefix length cidr in dimension n of
+ * h->nets. Only the leading entries whose nets[n] counter is non-zero
+ * are in use. If cidr is already recorded, its counter is incremented.
+ * Otherwise a new entry with counter 1 is inserted in front of the first
+ * entry holding a smaller cidr (the following entries are shifted one
+ * slot towards the end), or stored right after the last used entry.
+ *
+ * NOTE(review): if all nets_length entries were in use and cidr were not
+ * among them, the stores below would index h->nets[nets_length];
+ * presumably callers can never get there - confirm. */
+static void
+mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+{
+ int i, j; /* i: scan index, j: insertion position or -1 when none found yet */
+
+ /* Add in increasing prefix order, so larger cidr first */
+ for (i = 0, j = -1; i < nets_length && h->nets[i].nets[n]; i++) {
+ if (j != -1)
+ continue; /* position known, only look for the end of the used entries */
+ else if (h->nets[i].cidr[n] < cidr)
+ j = i;
+ else if (h->nets[i].cidr[n] == cidr) {
+ h->nets[i].nets[n]++;
+ return;
+ }
+ }
+ if (j != -1) {
+ for (; i > j; i--) { /* open a hole at j by moving the tail up by one */
+ h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
+ h->nets[i].nets[n] = h->nets[i - 1].nets[n];
+ }
+ }
+ h->nets[i].cidr[n] = cidr;
+ h->nets[i].nets[n] = 1;
+}
+
+/* Drop one reference to prefix length cidr in dimension n of h->nets.
+ * The counter of the first matching entry is simply decremented when it
+ * stays in use, when it is the last slot of the array or when no used
+ * entry follows it; otherwise the entry is removed and the used entries
+ * behind it are moved one slot forward. */
+static void
+mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+{
+	const u8 last = nets_length - 1;
+	u8 idx = 0, pos;
+
+	/* Find the first slot recording this prefix length */
+	while (idx < nets_length && h->nets[idx].cidr[n] != cidr)
+		idx++;
+	if (idx >= nets_length)
+		return;
+
+	if (h->nets[idx].nets[n] > 1 || idx == last ||
+	    h->nets[idx + 1].nets[n] == 0) {
+		h->nets[idx].nets[n]--;
+		return;
+	}
+
+	/* Close the gap: pull the following used entries forward */
+	pos = idx;
+	while (pos < last && h->nets[pos].nets[n]) {
+		h->nets[pos].cidr[n] = h->nets[pos + 1].cidr[n];
+		h->nets[pos].nets[n] = h->nets[pos + 1].nets[n];
+		pos++;
+	}
+	h->nets[pos].nets[n] = 0;
+}
+#endif
+
+/* Sum up the memory used by the set data: the header structure, the
+ * table with its bucket array, the prefix book-keeping (net types only)
+ * and the element array of every bucket. */
+static size_t
+mtype_ahash_memsize(const struct htype *h, const struct htable *t,
+		    u8 nets_length, size_t dsize)
+{
+	size_t total = sizeof(*h) + sizeof(*t);
+	u32 b;
+
+#ifdef IP_SET_HASH_WITH_NETS
+	total += sizeof(struct net_prefixes) * nets_length;
+#endif
+	total += jhash_size(t->htable_bits) * sizeof(struct hbucket);
+
+	b = 0;
+	while (b < jhash_size(t->htable_bits)) {
+		total += t->bucket[b].size * dsize;
+		b++;
+	}
+
+	return total;
+}
+
+/* Get the ith element from the array block n: elements are addressed
+ * in n->value at offset i * dsize and accessed as struct mtype_elem */
+#define ahash_data(n, i, dsize) \
+ ((struct mtype_elem *)((n)->value + ((i) * (dsize))))
+
+/* Call ip_set_ext_destroy() on each of the n->pos elements of bucket n */
+static void
+mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
+{
+	int slot = 0;
+
+	while (slot < n->pos) {
+		ip_set_ext_destroy(set, ahash_data(n, slot, set->dsize));
+		slot++;
+	}
+}
+
+/* Empty the set: free the element array of every allocated bucket
+ * (after destroying the extensions when the set has any to destroy),
+ * clear the prefix book-keeping and reset the element counter.
+ * The table itself and its size are kept. */
+static void
+mtype_flush(struct ip_set *set)
+{
+	struct htype *h = set->data;
+	struct htable *t = rcu_dereference_bh_nfnl(h->table);
+	struct hbucket *bucket;
+	u32 b;
+
+	for (b = 0; b < jhash_size(t->htable_bits); b++) {
+		bucket = hbucket(t, b);
+		if (!bucket->size)
+			continue;	/* nothing was ever allocated here */
+		if (set->extensions & IPSET_EXT_DESTROY)
+			mtype_ext_cleanup(set, bucket);
+		bucket->size = bucket->pos = 0;
+		/* FIXME: use slab cache */
+		kfree(bucket->value);
+	}
+#ifdef IP_SET_HASH_WITH_NETS
+	memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
+#endif
+	h->elements = 0;
+}
+
+/* Free a hash table: the element array of every allocated bucket and
+ * then the table itself. The element extensions are destroyed as well
+ * only when ext_destroy is true and the set has extensions to destroy. */
+static void
+mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
+{
+	struct hbucket *bucket;
+	u32 b;
+
+	for (b = 0; b < jhash_size(t->htable_bits); b++) {
+		bucket = hbucket(t, b);
+		if (!bucket->size)
+			continue;
+		if (ext_destroy && (set->extensions & IPSET_EXT_DESTROY))
+			mtype_ext_cleanup(set, bucket);
+		/* FIXME: use slab cache */
+		kfree(bucket->value);
+	}
+
+	ip_set_free(t);
+}
+
+/* Tear down a hash type of set: stop the gc timer when timeouts are
+ * enabled, free the table together with the element extensions, free
+ * the header structure and clear set->data. */
+static void
+mtype_destroy(struct ip_set *set)
+{
+	struct htype *hash = set->data;
+	struct htable *table;
+
+	if (set->extensions & IPSET_EXT_TIMEOUT)
+		del_timer_sync(&hash->gc);
+
+	table = rcu_dereference_bh_nfnl(hash->table);
+	mtype_ahash_destroy(set, table, true);
+#ifdef IP_SET_HASH_WITH_RBTREE
+	rbtree_destroy(&hash->rbtree);
+#endif
+	kfree(hash);
+
+	set->data = NULL;
+}
+
+/* Set up and start the garbage collector timer of the set: gc is
+ * installed as the timer function with the set as its argument and the
+ * first run is scheduled IPSET_GC_PERIOD(set->timeout) seconds ahead. */
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+	struct htype *h = set->data;
+	struct timer_list *timer = &h->gc;
+
+	init_timer(timer);
+	timer->function = gc;
+	timer->data = (unsigned long) set;
+	timer->expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+	add_timer(timer);
+
+	pr_debug("gc initialized, run in every %u\n",
+		 IPSET_GC_PERIOD(set->timeout));
+}
+
+/* Tell whether two sets were created with the same parameters: maxelem,
+ * timeout, netmask and markmask (where the type has them) and the
+ * enabled extensions. */
+static bool
+mtype_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct htype *x = a->data;
+	const struct htype *y = b->data;
+
+	/* htable_bits is left out on purpose: resizing changes it */
+	if (x->maxelem != y->maxelem)
+		return false;
+	if (a->timeout != b->timeout)
+		return false;
+#ifdef IP_SET_HASH_WITH_NETMASK
+	if (x->netmask != y->netmask)
+		return false;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	if (x->markmask != y->markmask)
+		return false;
+#endif
+	return a->extensions == b->extensions;
+}
+
+/* Delete expired elements from the hashtable.
+ *
+ * Every bucket is scanned; an element whose timeout has expired is
+ * removed from the prefix book-keeping (net types), its extensions are
+ * destroyed and the hole is filled with the last element of the bucket.
+ * A bucket that ends up with more than AHASH_INIT_SIZE unused slots is
+ * reallocated AHASH_INIT_SIZE elements smaller.
+ *
+ * The callers in this file take set->lock for writing around the call,
+ * except mtype_add() which calls it directly. */
+static void
+mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
+{
+ struct htable *t;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ u32 i;
+ int j;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 k;
+#endif
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, dsize);
+ if (ip_set_timeout_expired(ext_timeout(data, set))) {
+ pr_debug("expired %u/%u\n", i, j);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (k = 0; k < IPSET_NET_COUNT; k++)
+ mtype_del_cidr(h, CIDR(data->cidr, k),
+ nets_length, k);
+#endif
+ ip_set_ext_destroy(set, data); /* must precede the memcpy below, which overwrites data */
+ if (j != n->pos - 1)
+ /* Not last one: move the last element into the
+ * hole. j advances afterwards, so the moved
+ * element is not examined in this pass. */
+ memcpy(data,
+ ahash_data(n, n->pos - 1, dsize),
+ dsize);
+ n->pos--;
+ h->elements--;
+ }
+ }
+ if (n->pos + AHASH_INIT_SIZE < n->size) { /* shrink the element array by one step */
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ /* Still try to delete expired elements */
+ continue;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ }
+ rcu_read_unlock_bh();
+}
+
+/* Timer callback of the garbage collector: remove the expired elements
+ * with set->lock held for writing, then schedule the next run
+ * IPSET_GC_PERIOD(set->timeout) seconds ahead. */
+static void
+mtype_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct htype *h = set->data;
+	struct timer_list *timer = &h->gc;
+
+	pr_debug("called\n");
+
+	write_lock_bh(&set->lock);
+	mtype_expire(set, h, NLEN(set->family), set->dsize);
+	write_unlock_bh(&set->lock);
+
+	timer->expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+	add_timer(timer);
+}
+
+/* Resize a hash: create a new hash table with doubling the hashsize
+ * and inserting the elements to it. Repeat until we succeed or
+ * fail due to memory pressures.
+ *
+ * When the set has timeouts and this is not a retry, expired elements
+ * are purged first and 0 is returned without resizing if that removed
+ * anything.
+ *
+ * Returns 0 on success, -IPSET_ERR_HASH_FULL when htable_bits cannot be
+ * increased any further, -ENOMEM when the new table cannot be allocated,
+ * or the negative error of hbucket_elem_add() other than -EAGAIN
+ * (-EAGAIN restarts the attempt with one more bit). */
+static int
+mtype_resize(struct ip_set *set, bool retried)
+{
+ struct htype *h = set->data;
+ struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table);
+ u8 htable_bits = orig->htable_bits;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 flags;
+#endif
+ struct mtype_elem *data;
+ struct mtype_elem *d;
+ struct hbucket *n, *m;
+ u32 i, j;
+ int ret;
+
+ /* Try to cleanup once */
+ if (SET_WITH_TIMEOUT(set) && !retried) {
+ i = h->elements; /* element count before the purge */
+ write_lock_bh(&set->lock);
+ mtype_expire(set, set->data, NLEN(set->family), set->dsize);
+ write_unlock_bh(&set->lock);
+ if (h->elements < i)
+ return 0;
+ }
+
+retry:
+ ret = 0;
+ htable_bits++;
+ pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+ set->name, orig->htable_bits, htable_bits, orig);
+ if (!htable_bits) { /* the u8 counter wrapped around to zero */
+ /* In case we have plenty of memory :-) */
+ pr_warning("Cannot increase the hashsize of set %s further\n",
+ set->name);
+ return -IPSET_ERR_HASH_FULL;
+ }
+ t = ip_set_alloc(sizeof(*t)
+ + jhash_size(htable_bits) * sizeof(struct hbucket));
+ if (!t)
+ return -ENOMEM;
+ t->htable_bits = htable_bits;
+
+ read_lock_bh(&set->lock);
+ for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+ n = hbucket(orig, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ flags = 0;
+ mtype_data_reset_flags(data, &flags);
+#endif
+ m = hbucket(t, HKEY(data, h->initval, htable_bits)); /* destination bucket in the new table */
+ ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize);
+ if (ret < 0) {
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_reset_flags(data, &flags);
+#endif
+ read_unlock_bh(&set->lock);
+ mtype_ahash_destroy(set, t, false); /* drop the partial table; the elements still live in orig */
+ if (ret == -EAGAIN)
+ goto retry;
+ return ret;
+ }
+ d = ahash_data(m, m->pos++, set->dsize);
+ memcpy(d, data, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_reset_flags(d, &flags);
+#endif
+ }
+ }
+
+ rcu_assign_pointer(h->table, t);
+ read_unlock_bh(&set->lock);
+
+ /* Give time to other readers of the set */
+ synchronize_rcu_bh();
+
+ pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
+ orig->htable_bits, orig, t->htable_bits, t);
+ mtype_ahash_destroy(set, orig, false); /* extensions were copied into t and are not destroyed */
+
+ return 0;
+}
+
+/* Add an element to a hash and update the internal counters when succeeded,
+ * otherwise report the proper error code.
+ *
+ * @set:   the set to add to
+ * @value: the element to add (struct mtype_elem)
+ * @ext:   extension values (timeout, counters, comment) for the element
+ * @mext:  unused here
+ * @flags: command flags; IPSET_FLAG_EXIST allows overwriting the
+ *         extensions of an already stored element
+ *
+ * Slot selection:
+ *  - a full set with forceadd replaces the first entry of the target
+ *    bucket when that bucket is not empty;
+ *  - an equal element is overwritten when IPSET_FLAG_EXIST is given or
+ *    when the stored element has timed out;
+ *  - otherwise the first timed out entry of the bucket is reused;
+ *  - otherwise a new slot is taken at the end of the bucket.
+ *
+ * Returns 0 on success, -IPSET_ERR_HASH_FULL when maxelem is reached,
+ * -IPSET_ERR_EXIST when the element is already stored, or the error of
+ * hbucket_elem_add() (on -EAGAIN the element is saved in h->next first).
+ */
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags)
+{
+	struct htype *h = set->data;
+	struct htable *t;
+	const struct mtype_elem *d = value;
+	struct mtype_elem *data;
+	struct hbucket *n;
+	int i, ret = 0;
+	/* AHASH_MAX(h) + 1 can never be a valid index: "no slot to reuse" */
+	int j = AHASH_MAX(h) + 1;
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
+	u32 key, multi = 0;
+
+	if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) {
+		rcu_read_lock_bh();
+		t = rcu_dereference_bh(h->table);
+		key = HKEY(value, h->initval, t->htable_bits);
+		n = hbucket(t, key);
+		if (n->pos) {
+			/* Choosing the first entry in the array to replace */
+			j = 0;
+			/* The RCU read lock stays held, it is dropped at out */
+			goto reuse_slot;
+		}
+		rcu_read_unlock_bh();
+	}
+	if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
+		/* FIXME: when set is full, we slow down here */
+		mtype_expire(set, h, NLEN(set->family), set->dsize);
+
+	if (h->elements >= h->maxelem) {
+		if (net_ratelimit())
+			pr_warning("Set %s is full, maxelem %u reached\n",
+				   set->name, h->maxelem);
+		return -IPSET_ERR_HASH_FULL;
+	}
+
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
+	key = HKEY(value, h->initval, t->htable_bits);
+	n = hbucket(t, key);
+	for (i = 0; i < n->pos; i++) {
+		data = ahash_data(n, i, set->dsize);
+		if (mtype_data_equal(data, d, &multi)) {
+			if (flag_exist ||
+			    (SET_WITH_TIMEOUT(set) &&
+			     ip_set_timeout_expired(ext_timeout(data, set)))) {
+				/* Just the extensions could be overwritten */
+				j = i;
+				goto reuse_slot;
+			} else {
+				ret = -IPSET_ERR_EXIST;
+				goto out;
+			}
+		}
+		/* Reuse first timed out entry: remember it only while no
+		 * candidate has been picked yet. (The test used to be
+		 * "!=", which can never hold for the initial sentinel, so
+		 * timed out slots were never reused.)
+		 */
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(data, set)) &&
+		    j == AHASH_MAX(h) + 1)
+			j = i;
+	}
+reuse_slot:
+	if (j != AHASH_MAX(h) + 1) {
+		/* Fill out reused slot: the old occupant leaves the prefix
+		 * book-keeping and the element counter stays unchanged
+		 */
+		data = ahash_data(n, j, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+		for (i = 0; i < IPSET_NET_COUNT; i++) {
+			mtype_del_cidr(h, CIDR(data->cidr, i),
+				       NLEN(set->family), i);
+			mtype_add_cidr(h, CIDR(d->cidr, i),
+				       NLEN(set->family), i);
+		}
+#endif
+		ip_set_ext_destroy(set, data);
+	} else {
+		/* Use/create a new slot */
+		TUNE_AHASH_MAX(h, multi);
+		ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize);
+		if (ret != 0) {
+			if (ret == -EAGAIN)
+				mtype_data_next(&h->next, d);
+			goto out;
+		}
+		data = ahash_data(n, n->pos++, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+		for (i = 0; i < IPSET_NET_COUNT; i++)
+			mtype_add_cidr(h, CIDR(d->cidr, i), NLEN(set->family),
+				       i);
+#endif
+		h->elements++;
+	}
+	memcpy(data, d, sizeof(struct mtype_elem));
+#ifdef IP_SET_HASH_WITH_NETS
+	mtype_data_set_flags(data, flags);
+#endif
+	if (SET_WITH_TIMEOUT(set))
+		ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
+	if (SET_WITH_COUNTER(set))
+		ip_set_init_counter(ext_counter(data, set), ext);
+	if (SET_WITH_COMMENT(set))
+		ip_set_init_comment(ext_comment(data, set), ext);
+
+out:
+	rcu_read_unlock_bh();
+	return ret;
+}
+
+/* Delete an element from the hash: swap it with the last element
+ * and free up space if possible.
+ *
+ * @set:   the set to delete from
+ * @value: the element to delete (struct mtype_elem)
+ * @ext, @mext, @flags: unused here
+ *
+ * Returns 0 when the element was removed and -IPSET_ERR_EXIST when it is
+ * not stored or has already timed out. Failing to shrink the bucket
+ * array is not an error.
+ */
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags)
+{
+	struct htype *h = set->data;
+	struct htable *t;
+	const struct mtype_elem *d = value;
+	struct mtype_elem *data;
+	struct hbucket *n;
+	int i, ret = -IPSET_ERR_EXIST;
+#ifdef IP_SET_HASH_WITH_NETS
+	u8 j;
+#endif
+	u32 key, multi = 0;
+
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
+	key = HKEY(value, h->initval, t->htable_bits);
+	n = hbucket(t, key);
+	for (i = 0; i < n->pos; i++) {
+		data = ahash_data(n, i, set->dsize);
+		if (!mtype_data_equal(data, d, &multi))
+			continue;
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(data, set)))
+			goto out;
+
+		/* Destroy the extensions of the element being deleted while
+		 * the slot still holds it, as mtype_expire() does. Doing it
+		 * after the copy below would hit the element moved into the
+		 * slot instead.
+		 */
+		ip_set_ext_destroy(set, data);
+		if (i != n->pos - 1)
+			/* Not last one */
+			memcpy(data, ahash_data(n, n->pos - 1, set->dsize),
+			       set->dsize);
+
+		n->pos--;
+		h->elements--;
+#ifdef IP_SET_HASH_WITH_NETS
+		for (j = 0; j < IPSET_NET_COUNT; j++)
+			mtype_del_cidr(h, CIDR(d->cidr, j), NLEN(set->family),
+				       j);
+#endif
+		/* The element is gone whether or not the shrink succeeds */
+		ret = 0;
+		if (n->pos + AHASH_INIT_SIZE < n->size) {
+			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+					    * set->dsize,
+					    GFP_ATOMIC);
+			if (!tmp)
+				goto out;
+			n->size -= AHASH_INIT_SIZE;
+			memcpy(tmp, n->value, n->size * set->dsize);
+			kfree(n->value);
+			n->value = tmp;
+		}
+		goto out;
+	}
+
+out:
+	rcu_read_unlock_bh();
+	return ret;
+}
+
+/* Finish a successful lookup: update the counters of the stored element
+ * when the set has the counter extension and return the result of the
+ * type specific match hook. */
+static inline int
+mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
+		 struct ip_set_ext *mext, struct ip_set *set, u32 flags)
+{
+	if (!SET_WITH_COUNTER(set))
+		return mtype_do_data_match(data);
+
+	ip_set_update_counter(ext_counter(data, set), ext, mext, flags);
+	return mtype_do_data_match(data);
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Special test function which takes into account the different network
+ * sizes added to the set.
+ *
+ * For every prefix length recorded in h->nets (every pair of prefix
+ * lengths when IPSET_NET_COUNT is 2) the element d is masked to that
+ * size and looked up. The first equal element that has not timed out
+ * ends the search and the result of mtype_data_match() is returned;
+ * 0 is returned when nothing matches. Note that d is modified.
+ *
+ * Called from mtype_test(), which holds the RCU read lock. */
+static int
+mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
+ const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t = rcu_dereference_bh(h->table);
+ struct hbucket *n;
+ struct mtype_elem *data;
+#if IPSET_NET_COUNT == 2
+ struct mtype_elem orig = *d; /* unmasked copy, restored before each outer step */
+ int i, j = 0, k;
+#else
+ int i, j = 0;
+#endif
+ u32 key, multi = 0;
+ u8 nets_length = NLEN(set->family);
+
+ pr_debug("test by nets\n");
+ for (; j < nets_length && h->nets[j].nets[0] && !multi; j++) {
+#if IPSET_NET_COUNT == 2
+ mtype_data_reset_elem(d, &orig);
+ mtype_data_netmask(d, h->nets[j].cidr[0], false);
+ for (k = 0; k < nets_length && h->nets[k].nets[1] && !multi;
+ k++) {
+ mtype_data_netmask(d, h->nets[k].cidr[1], true);
+#else
+ mtype_data_netmask(d, h->nets[j].cidr[0]);
+#endif
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set)) {
+ if (!ip_set_timeout_expired(
+ ext_timeout(data, set)))
+ return mtype_data_match(data, ext,
+ mext, set,
+ flags);
+#ifdef IP_SET_HASH_WITH_MULTI
+ multi = 0; /* the equal element has timed out: keep searching */
+#endif
+ } else
+ return mtype_data_match(data, ext,
+ mext, set, flags);
+ }
+#if IPSET_NET_COUNT == 2
+ }
+#endif
+ }
+ return 0;
+}
+#endif
+
+/* Look the element up in the set. Returns the result of
+ * mtype_data_match() for the first equal element that has not timed
+ * out, 0 when there is none. For net types a plain address (every
+ * prefix length equals the host mask) is tested against all the stored
+ * network sizes instead. */
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	   struct ip_set_ext *mext, u32 flags)
+{
+	struct htype *h = set->data;
+	struct mtype_elem *d = value;
+	struct mtype_elem *data;
+	struct htable *t;
+	struct hbucket *n;
+	u32 key, multi = 0;
+	int i, ret = 0;
+
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
+#ifdef IP_SET_HASH_WITH_NETS
+	/* If we test an IP address and not a network address,
+	 * try all possible network sizes */
+	i = 0;
+	while (i < IPSET_NET_COUNT &&
+	       CIDR(d->cidr, i) == SET_HOST_MASK(set->family))
+		i++;
+	if (i == IPSET_NET_COUNT) {
+		ret = mtype_test_cidrs(set, d, ext, mext, flags);
+		goto out;
+	}
+#endif
+
+	key = HKEY(d, h->initval, t->htable_bits);
+	n = hbucket(t, key);
+	for (i = 0; i < n->pos; i++) {
+		data = ahash_data(n, i, set->dsize);
+		if (!mtype_data_equal(data, d, &multi))
+			continue;
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(data, set)))
+			continue;
+		ret = mtype_data_match(data, ext, mext, set, flags);
+		goto out;
+	}
+out:
+	rcu_read_unlock_bh();
+	return ret;
+}
+
+/* Reply a HEADER request: put the set parameters (hash size, maxelem,
+ * netmask/markmask where the type has them, references, memory size and
+ * flags) into a nested IPSET_ATTR_DATA attribute.
+ * Returns 0, or -EMSGSIZE when an attribute does not fit. */
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct htype *h = set->data;
+	const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+	size_t memsize = mtype_ahash_memsize(h, t, NLEN(set->family),
+					     set->dsize);
+	struct nlattr *nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+
+	if (!nested)
+		return -EMSGSIZE;
+	if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
+			  htonl(jhash_size(t->htable_bits))))
+		return -EMSGSIZE;
+	if (nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
+		return -EMSGSIZE;
+#ifdef IP_SET_HASH_WITH_NETMASK
+	if (h->netmask != HOST_MASK &&
+	    nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+		return -EMSGSIZE;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
+		return -EMSGSIZE;
+#endif
+	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)))
+		return -EMSGSIZE;
+	if (nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+		return -EMSGSIZE;
+	if (unlikely(ip_set_put_flags(skb, set)))
+		return -EMSGSIZE;
+
+	ipset_nest_end(skb, nested);
+	return 0;
+}
+
+/* Reply a LIST/SAVE request: dump the elements of the specified set.
+ *
+ * cb->args[IPSET_CB_ARG0] holds the index of the bucket to continue
+ * with; it is reset to 0 when the whole set has been listed. Timed out
+ * elements are skipped. When the message fills up in the middle of a
+ * bucket, the partially written bucket is trimmed off and 0 is returned
+ * with the index left at that bucket. -EMSGSIZE is returned when
+ * not even the first bucket of this call fits. */
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct htype *h = set->data;
+ const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+ struct nlattr *atd, *nested;
+ const struct hbucket *n;
+ const struct mtype_elem *e;
+ u32 first = cb->args[IPSET_CB_ARG0]; /* bucket index this call started with */
+ /* We assume that one hash bucket fills into one page */
+ void *incomplete; /* skb tail recorded before the current bucket was started */
+ int i;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ pr_debug("list hash set %s\n", set->name);
+ for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
+ cb->args[IPSET_CB_ARG0]++) {
+ incomplete = skb_tail_pointer(skb);
+ n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ pr_debug("cb->arg bucket: %lu, t %p n %p\n",
+ cb->args[IPSET_CB_ARG0], t, n);
+ for (i = 0; i < n->pos; i++) {
+ e = ahash_data(n, i, set->dsize);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ pr_debug("list hash %lu hbucket %p i %u, data %p\n",
+ cb->args[IPSET_CB_ARG0], n, i, e);
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (cb->args[IPSET_CB_ARG0] == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_data_list(skb, e))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, e, true))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, incomplete); /* drop what was written for the unfinished bucket */
+ if (unlikely(first == cb->args[IPSET_CB_ARG0])) {
+ pr_warning("Can't list set %s: one bucket does not fit into "
+ "a message. Please report it!\n", set->name);
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, atd);
+ return 0;
+}
+
+static int
+IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt);
+
+static int
+IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+/* Method table of the current MTYPE variant: the generic functions of
+ * this header plus the kadt/uadt handlers declared above, which the
+ * including type file has to define */
+static const struct ip_set_type_variant mtype_variant = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .resize = mtype_resize,
+ .same_set = mtype_same_set,
+};
+
+#ifdef IP_SET_EMIT_CREATE
+/* Create a set of the HTYPE type.
+ *
+ * Parses the optional hashsize, maxelem, netmask, markmask and timeout
+ * attributes from tb, allocates the header structure and the initial
+ * hash table, selects the IPv4 or IPv6 variant according to set->family
+ * and starts the garbage collector when a timeout is given.
+ *
+ * Returns 0 on success or a negative error: -IPSET_ERR_INVALID_FAMILY,
+ * -IPSET_ERR_PROTOCOL, -IPSET_ERR_INVALID_NETMASK,
+ * -IPSET_ERR_INVALID_MARKMASK or -ENOMEM.
+ */
+static int
+IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
+			    struct nlattr *tb[], u32 flags)
+{
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	u32 markmask;
+#endif
+	u8 hbits;
+#ifdef IP_SET_HASH_WITH_NETMASK
+	u8 netmask;
+#endif
+	size_t hsize;
+	struct HTYPE *h;
+	struct htable *t;
+
+	if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	markmask = 0xffffffff;
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+	netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+	pr_debug("Create set %s with family %s\n",
+		 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+#endif
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+#ifdef IP_SET_HASH_WITH_MARKMASK
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
+#endif
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+#ifdef IP_SET_HASH_WITH_NETMASK
+	if (tb[IPSET_ATTR_NETMASK]) {
+		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+		if ((set->family == NFPROTO_IPV4 && netmask > 32) ||
+		    (set->family == NFPROTO_IPV6 && netmask > 128) ||
+		    netmask == 0)
+			return -IPSET_ERR_INVALID_NETMASK;
+	}
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	if (tb[IPSET_ATTR_MARKMASK]) {
+		markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+
+		/* A u32 cannot exceed 0xffffffff, so zero is the only
+		 * invalid value
+		 */
+		if (markmask == 0)
+			return -IPSET_ERR_INVALID_MARKMASK;
+	}
+#endif
+
+	/* The header structure, followed by the prefix book-keeping
+	 * array for the net types
+	 */
+	hsize = sizeof(*h);
+#ifdef IP_SET_HASH_WITH_NETS
+	hsize += sizeof(struct net_prefixes) *
+		(set->family == NFPROTO_IPV4 ? 32 : 128);
+#endif
+	h = kzalloc(hsize, GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+#ifdef IP_SET_HASH_WITH_NETMASK
+	h->netmask = netmask;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+	h->markmask = markmask;
+#endif
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	set->timeout = IPSET_NO_TIMEOUT;
+
+	hbits = htable_bits(hashsize);
+	hsize = htable_size(hbits);
+	if (hsize == 0) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	t = ip_set_alloc(hsize);
+	if (!t) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	t->htable_bits = hbits;
+	rcu_assign_pointer(h->table, t);
+
+	set->data = h;
+	if (set->family == NFPROTO_IPV4) {
+		set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
+		set->dsize = ip_set_elem_len(set, tb,
+				sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
+	} else {
+		set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
+		set->dsize = ip_set_elem_len(set, tb,
+				sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)));
+	}
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		if (set->family == NFPROTO_IPV4)
+			IPSET_TOKEN(HTYPE, 4_gc_init)(set,
+				IPSET_TOKEN(HTYPE, 4_gc));
+		else
+			IPSET_TOKEN(HTYPE, 6_gc_init)(set,
+				IPSET_TOKEN(HTYPE, 6_gc));
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(t->htable_bits),
+		 t->htable_bits, h->maxelem, set->data, t);
+
+	return 0;
+}
+#endif /* IP_SET_EMIT_CREATE */
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
new file mode 100644
index 00000000000..dd40607f878
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -0,0 +1,315 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counters support */
+/* 2 Comments support */
+#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip");
+
+/* Type specific function prefix */
+#define HTYPE hash_ip
+#define IP_SET_HASH_WITH_NETMASK
+
+/* IPv4 variant */
+
+/* Member elements: a single IPv4 address, stored as __be32 */
+struct hash_ip4_elem {
+ /* Zero valued IP addresses cannot be stored: hash_ip4_kadt() and
+ * hash_ip4_uadt() reject an all-zero address */
+ __be32 ip;
+};
+
+/* Common functions */
+
+/* Two hash:ip IPv4 elements are equal when their addresses are equal;
+ * multi is not used by this type */
+static inline bool
+hash_ip4_data_equal(const struct hash_ip4_elem *e1,
+		    const struct hash_ip4_elem *e2,
+		    u32 *multi)
+{
+	if (e1->ip != e2->ip)
+		return false;
+	return true;
+}
+
+/* Put the address of the element into skb as IPSET_ATTR_IP.
+ * Returns 0 on success and 1 when the attribute could not be added. */
+static inline bool
+hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e)
+{
+	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip) ? 1 : 0;
+}
+/* Remember the address of e in next; hash_ip4_uadt() restarts a retried
+ * range from the address saved this way */
+static inline void
+hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
+{
+ next->ip = e->ip;
+}
+
+#define MTYPE hash_ip4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+/* Kernel side add/del/test for hash:ip, IPv4: take the source or
+ * destination address of the packet (IPSET_DIM_ONE_SRC selects the
+ * source), apply the netmask of the set and hand the element to the
+ * requested operation. Returns -EINVAL for an all-zero masked address. */
+static int
+hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
+	      enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_ip *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ip4_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+	e.ip &= ip_set_netmask(h->netmask);
+	if (e.ip == 0)
+		return -EINVAL;
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/* Userspace add/del/test for hash:ip, IPv4.
+ *
+ * @set:     the set to operate on
+ * @tb:      parsed netlink attributes; IPSET_ATTR_IP is mandatory, a
+ *           range may be given with IPSET_ATTR_IP_TO or IPSET_ATTR_CIDR
+ * @adt:     the operation to perform
+ * @lineno:  set from IPSET_ATTR_LINENO when that attribute is present
+ * @flags:   command flags passed on to the operation
+ * @retried: when true the range is continued from the address saved in
+ *           h->next
+ *
+ * A test looks up the single masked address. Add and delete walk the
+ * range in steps of the number of hosts covered by the netmask of the
+ * set and stop at the first error that ip_set_eexist() does not
+ * tolerate.
+ *
+ * Returns 0 on success or a negative error code. The errors of the
+ * attribute parsers are returned unchanged.
+ */
+static int
+hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ip *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ip4_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip = 0, ip_to = 0, hosts;
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Do not combine the two calls with "||": that would reduce the
+	 * error code to the boolean value 1
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	ip &= ip_set_hostmask(h->netmask);
+
+	if (adt == IPSET_TEST) {
+		e.ip = htonl(ip);
+		if (e.ip == 0)
+			return -IPSET_ERR_HASH_ELEM;
+		return adtfn(set, &e, &ext, &ext, flags);
+	}
+
+	ip_to = ip;
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(ip, ip_to, cidr);
+	}
+
+	/* Number of addresses covered by one stored element */
+	hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
+
+	if (retried)
+		ip = ntohl(h->next.ip);
+	for (; !before(ip_to, ip); ip += hosts) {
+		e.ip = htonl(ip);
+		if (e.ip == 0)
+			return -IPSET_ERR_HASH_ELEM;
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* IPv6 variant */
+
+/* Member elements: a single address kept in a union nf_inet_addr; the
+ * functions of this file access it through the in6 member */
+struct hash_ip6_elem {
+ union nf_inet_addr ip;
+};
+
+/* Common functions */
+
+/* Two hash:ip IPv6 elements are equal when ipv6_addr_equal() says so
+ * for their addresses; multi is not used by this type */
+static inline bool
+hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
+		    const struct hash_ip6_elem *ip2,
+		    u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+		return false;
+	return true;
+}
+/* Thin wrapper: passes ip and prefix unchanged to ip6_netmask() */
+static inline void
+hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip6_netmask(ip, prefix);
+}
+
+/* Put the address of the element into skb as IPSET_ATTR_IP.
+ * Returns 0 on success and 1 when the attribute could not be added. */
+static bool
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
+{
+	return nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ? 1 : 0;
+}
+/* Intentionally empty: nothing is saved for the IPv6 variant.
+ * NOTE(review): next is declared as struct hash_ip4_elem; presumably
+ * because the shared struct hash_ip embeds the element type of the first
+ * (IPv4) inclusion of the generic header - confirm. */
+static inline void
+hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ip6
+#define PF 6
+#define HOST_MASK 128
+
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+/* Kernel side add/del/test for hash:ip, IPv6: take the source or
+ * destination address of the packet (IPSET_DIM_ONE_SRC selects the
+ * source), apply the netmask of the set and hand the element to the
+ * requested operation. Returns -EINVAL for an all-zero masked address. */
+static int
+hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
+	      enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_ip *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ip6_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	ip6_netmask(&e.ip, h->netmask);
+
+	return ipv6_addr_any(&e.ip.in6) ?
+		-EINVAL :
+		adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/* Userspace add/del/test for hash:ip, IPv6.
+ *
+ * @set:     the set to operate on
+ * @tb:      parsed netlink attributes; IPSET_ATTR_IP is mandatory,
+ *           IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR are rejected
+ * @adt:     the operation to perform
+ * @lineno:  set from IPSET_ATTR_LINENO when that attribute is present
+ * @flags:   command flags passed on to the operation
+ * @retried: unused here
+ *
+ * Returns 0 on success (also when ip_set_eexist() tolerates the result
+ * of the operation) or a negative error code. The errors of the
+ * attribute parsers are returned unchanged.
+ */
+static int
+hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ip *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ip6_elem e = {};
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Do not combine the two calls with "||": that would reduce the
+	 * error code to the boolean value 1
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	hash_ip6_netmask(&e.ip, h->netmask);
+	if (ipv6_addr_any(&e.ip.in6))
+		return -IPSET_ERR_HASH_ELEM;
+
+	ret = adtfn(set, &e, &ext, &ext, flags);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+/* Descriptor of the hash:ip set type, registered by hash_ip_init().
+ * create_policy and adt_policy give the netlink attribute type of each
+ * attribute listed for set creation and for add/del/test requests. */
+static struct ip_set_type hash_ip_type __read_mostly = {
+ .name = "hash:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+/* Module init: register the hash:ip type and return the result of
+ * ip_set_type_register() */
+static int __init
+hash_ip_init(void)
+{
+ return ip_set_type_register(&hash_ip_type);
+}
+/* Module exit: unregister the hash:ip type */
+static void __exit
+hash_ip_fini(void)
+{
+ ip_set_type_unregister(&hash_ip_type);
+}
+
+module_init(hash_ip_init);
+module_exit(hash_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
new file mode 100644
index 00000000000..4eff0a29725
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -0,0 +1,321 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mark type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0 /* oldest supported revision (feeds .revision_min below) */
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
+IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mark");
+
+/* Type specific function prefix: matches struct hash_ipmark and hash_ipmark_create used below */
+#define HTYPE hash_ipmark
+#define IP_SET_HASH_WITH_MARKMASK /* presumably makes ip_set_hash_gen.h provide h->markmask, used by kadt/uadt -- TODO confirm */
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmark4_elem { /* one IPv4 hash:ip,mark element; both fields take part in equality */
+ __be32 ip; /* IPv4 address, network byte order */
+ __u32 mark; /* mark in host order: ntohl()'d on input, htonl()'d when listed */
+};
+
+/* Common functions */
+
+/* Element comparison: equal only when address and mark both match.
+ * @multi is not touched by this set type.
+ */
+static inline bool
+hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
+			const struct hash_ipmark4_elem *ip2,
+			u32 *multi)
+{
+	if (ip1->ip != ip2->ip)
+		return false;
+
+	return ip1->mark == ip2->mark;
+}
+
+/* Emit @data into @skb as IPSET_ATTR_IP plus a net-order IPSET_ATTR_MARK.
+ * Returns false (0) when both attributes were added and true (1) as soon
+ * as one of the nla_put helpers reports failure.
+ */
+static bool
+hash_ipmark4_data_list(struct sk_buff *skb,
+		       const struct hash_ipmark4_elem *data)
+{
+	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+	       nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark));
+}
+
+/* Record the address of @d in @next. hash_ipmark4_uadt() restarts a
+ * retried range at h->next.ip (presumably @next is that h->next).
+ */
+static inline void
+hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
+		       const struct hash_ipmark4_elem *d)
+{
+	next->ip = d->ip;
+}
+
+#define MTYPE hash_ipmark4 /* prefix shared by the IPv4 helpers defined above */
+#define PF 4
+#define HOST_MASK 32 /* full IPv4 prefix length */
+#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem)
+#include "ip_set_hash_gen.h" /* presumably expands the generic hash code for the macros above -- confirm in the header */
+
+/* Packet-path add/del/test for IPv4: the key is the skb mark masked
+ * with the set's markmask plus the address ip4addrptr() extracts
+ * according to the IPSET_DIM_ONE_SRC flag.
+ */
+static int
+hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_ipmark *map = set->data;
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	struct hash_ipmark4_elem e = { };
+
+	e.mark = skb->mark & map->markmask;
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+
+	return set->variant->adt[adt](set, &e, &ext, &opt->ext,
+				      opt->cmdflags);
+}
+
+/* Netlink (userspace) add/del/test handler for IPv4 hash:ip,mark.
+ *
+ * IPSET_ATTR_IP and IPSET_ATTR_MARK are required; the mark is ANDed with
+ * the set's markmask. For add/del, IPSET_ATTR_IP_TO or IPSET_ATTR_CIDR
+ * turns the request into an address range that is processed one address
+ * at a time; when @retried is set the walk restarts at h->next.ip.
+ *
+ * Returns 0 on success or when ip_set_eexist() says the result may be
+ * ignored, otherwise the error code of the step that failed.
+ */
+static int
+hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipmark *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipmark4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip, ip_to = 0;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Check each helper separately: "a() || b()" would replace the
+	 * helper's own error code with the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+	e.mark &= h->markmask;
+
+	/* Single element: a test, or no range attribute given */
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip_to = ip = ntohl(e.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(ip, ip_to, cidr);
+	}
+
+	if (retried)
+		ip = ntohl(h->next.ip);
+	for (; !before(ip_to, ip); ip++) {
+		e.ip = htonl(ip);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipmark6_elem { /* one IPv6 hash:ip,mark element; both fields take part in equality */
+ union nf_inet_addr ip; /* address; the IPv6 code accesses it through .in6 */
+ __u32 mark; /* mark in host order: ntohl()'d on input, htonl()'d when listed */
+};
+
+/* Common functions */
+
+/* Element comparison: equal only when IPv6 address and mark both match.
+ * @multi is not touched by this set type.
+ */
+static inline bool
+hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
+			const struct hash_ipmark6_elem *ip2,
+			u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+		return false;
+
+	return ip1->mark == ip2->mark;
+}
+
+/* Emit @data into @skb as IPSET_ATTR_IP plus a net-order IPSET_ATTR_MARK.
+ * Returns false (0) when both attributes were added and true (1) as soon
+ * as one of the nla_put helpers reports failure.
+ */
+static bool
+hash_ipmark6_data_list(struct sk_buff *skb,
+		       const struct hash_ipmark6_elem *data)
+{
+	return nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+	       nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark));
+}
+
+/* Nothing to remember for IPv6: hash_ipmark6_uadt() never walks a range.
+ * NOTE(review): @next has the IPv4 element type; presumably the storage
+ * is shared between both variants -- confirm in ip_set_hash_gen.h.
+ */
+static inline void
+hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+		       const struct hash_ipmark6_elem *d)
+{
+	/* intentionally empty */
+}
+
+#undef MTYPE /* drop the IPv4 parameters before the header is included again for IPv6 */
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipmark6 /* prefix shared by the IPv6 helpers defined above */
+#define PF 6
+#define HOST_MASK 128 /* full IPv6 prefix length */
+#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem)
+#define IP_SET_EMIT_CREATE /* presumably makes this inclusion emit hash_ipmark_create() -- TODO confirm in the header */
+#include "ip_set_hash_gen.h"
+
+
+/* Packet-path add/del/test for IPv6: the key is the skb mark masked
+ * with the set's markmask plus the address ip6addrptr() extracts
+ * according to the IPSET_DIM_ONE_SRC flag.
+ */
+static int
+hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_ipmark *map = set->data;
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	struct hash_ipmark6_elem e = { };
+
+	e.mark = skb->mark & map->markmask;
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+
+	return set->variant->adt[adt](set, &e, &ext, &opt->ext,
+				      opt->cmdflags);
+}
+
+/* Netlink (userspace) add/del/test handler for IPv6 hash:ip,mark.
+ *
+ * IPSET_ATTR_IP and IPSET_ATTR_MARK are required; IPSET_ATTR_IP_TO and
+ * IPSET_ATTR_CIDR are rejected, so exactly one element is processed per
+ * call. The mark is ANDed with the set's markmask.
+ *
+ * Returns 0 on success or when ip_set_eexist() says the result may be
+ * ignored, otherwise the error code of the step that failed.
+ */
+static int
+hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipmark *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipmark6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Check each helper separately: "a() || b()" would replace the
+	 * helper's own error code with the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+	e.mark &= h->markmask;
+
+	/* The test path and the add/del path used to be spelled out
+	 * separately but computed the same result, so one call serves
+	 * all three operations.
+	 */
+	ret = adtfn(set, &e, &ext, &ext, flags);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_ipmark_type __read_mostly = { /* "hash:ip,mark" type descriptor; (un)registered by hash_ipmark_init()/hash_ipmark_fini() */
+ .name = "hash:ip,mark",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MARK,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC, /* this file provides both IPv4 and IPv6 handlers */
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipmark_create, /* not defined in the visible text; presumably emitted by ip_set_hash_gen.h under IP_SET_EMIT_CREATE */
+ .create_policy = { /* expected netlink attribute types at set creation (presumably enforced by the ipset core) */
+ [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* expected attribute types for element requests handled by the uadt functions */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_MARK] = { .type = NLA_U32 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module entry point: hand hash_ipmark_type to ip_set_type_register()
+ * and pass its result back to the module loader.
+ */
+static int __init hash_ipmark_init(void)
+{
+	return ip_set_type_register(&hash_ipmark_type);
+}
+
+/* Module exit point: undo the registration done in hash_ipmark_init(). */
+static void __exit hash_ipmark_fini(void)
+{
+	ip_set_type_unregister(&hash_ipmark_type);
+}
+
+module_init(hash_ipmark_init);
+module_exit(hash_ipmark_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
new file mode 100644
index 00000000000..7597b82a8b0
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -0,0 +1,390 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0 /* oldest supported revision; the list below says what each later one added */
+/* 1 SCTP and UDPLITE support added */
+/* 2 Counters support added */
+/* 3 Comments support added */
+#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port");
+
+/* Type specific function prefix: matches struct hash_ipport and hash_ipport_create used below */
+#define HTYPE hash_ipport
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipport4_elem { /* one IPv4 hash:ip,port element */
+ __be32 ip; /* IPv4 address, network byte order */
+ __be16 port; /* port, network byte order */
+ u8 proto; /* protocol number taken from IPSET_ATTR_PROTO or the packet */
+ u8 padding; /* explicit pad byte; not compared by hash_ipport4_data_equal() */
+};
+
+/* Common functions */
+
+/* Element comparison: address, port and protocol must all match.
+ * @multi is not touched by this set type.
+ */
+static inline bool
+hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
+			const struct hash_ipport4_elem *ip2,
+			u32 *multi)
+{
+	if (ip1->ip != ip2->ip || ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* Emit @data into @skb as IPSET_ATTR_IP, IPSET_ATTR_PORT and
+ * IPSET_ATTR_PROTO. Returns false (0) when all attributes were added
+ * and true (1) as soon as one of the nla_put helpers reports failure.
+ */
+static bool
+hash_ipport4_data_list(struct sk_buff *skb,
+		       const struct hash_ipport4_elem *data)
+{
+	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+	       nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+	       nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto);
+}
+
+/* Record address and port of @d in @next. hash_ipport4_uadt() restarts
+ * a retried range at h->next.ip / h->next.port (presumably @next is
+ * that h->next).
+ */
+static inline void
+hash_ipport4_data_next(struct hash_ipport4_elem *next,
+		       const struct hash_ipport4_elem *d)
+{
+	next->port = d->port;
+	next->ip = d->ip;
+}
+
+#define MTYPE hash_ipport4 /* prefix shared by the IPv4 helpers defined above */
+#define PF 4
+#define HOST_MASK 32 /* full IPv4 prefix length */
+#define HKEY_DATALEN sizeof(struct hash_ipport4_elem)
+#include "ip_set_hash_gen.h" /* presumably expands the generic hash code for the macros above -- confirm in the header */
+
+/* Packet-path add/del/test for IPv4: port and protocol come from
+ * ip_set_get_ip4_port() (direction chosen by IPSET_DIM_TWO_SRC), the
+ * address from ip4addrptr() (direction chosen by IPSET_DIM_ONE_SRC).
+ * Returns -EINVAL when no port/protocol could be obtained.
+ */
+static int
+hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	struct hash_ipport4_elem e = { };
+
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+
+	return set->variant->adt[adt](set, &e, &ext, &opt->ext,
+				      opt->cmdflags);
+}
+
+/* Netlink (userspace) add/del/test handler for IPv4 hash:ip,port.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_PORT and a non-zero IPSET_ATTR_PROTO are
+ * required. The port is forced to 0 for protocols that neither carry
+ * ports nor are ICMP. For add/del, IP_TO/CIDR and PORT_TO expand the
+ * request to an address x port range that is walked element by element;
+ * when @retried is set the walk restarts at h->next.ip / h->next.port.
+ *
+ * Returns 0 on success or when ip_set_eexist() says the result may be
+ * ignored, otherwise the error code of the step that failed.
+ */
+static int
+hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipport *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipport4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip, ip_to = 0, p = 0, port, port_to;
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Check each helper separately: "a() || b()" would replace the
+	 * helper's own error code with the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || e.proto == IPPROTO_ICMP))
+		e.port = 0;
+
+	/* Single element: a test, or no range attribute given */
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+	      tb[IPSET_ATTR_PORT_TO])) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip_to = ip = ntohl(e.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(ip, ip_to, cidr);
+	}
+
+	port_to = port = ntohs(e.port);
+	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	if (retried)
+		ip = ntohl(h->next.ip);
+	for (; !before(ip_to, ip); ip++) {
+		/* Only the first address of a retry resumes mid-range */
+		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+						       : port;
+		for (; p <= port_to; p++) {
+			e.ip = htonl(ip);
+			e.port = htons(p);
+			ret = adtfn(set, &e, &ext, &ext, flags);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			else
+				ret = 0;
+		}
+	}
+	return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipport6_elem { /* one IPv6 hash:ip,port element */
+ union nf_inet_addr ip; /* address; the IPv6 code accesses it through .in6 */
+ __be16 port; /* port, network byte order */
+ u8 proto; /* protocol number taken from IPSET_ATTR_PROTO or the packet */
+ u8 padding; /* explicit pad byte; not compared by hash_ipport6_data_equal() */
+};
+
+/* Common functions */
+
+/* Element comparison: IPv6 address, port and protocol must all match.
+ * @multi is not touched by this set type.
+ */
+static inline bool
+hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
+			const struct hash_ipport6_elem *ip2,
+			u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* Emit @data into @skb as IPSET_ATTR_IP, IPSET_ATTR_PORT and
+ * IPSET_ATTR_PROTO. Returns false (0) when all attributes were added
+ * and true (1) as soon as one of the nla_put helpers reports failure.
+ */
+static bool
+hash_ipport6_data_list(struct sk_buff *skb,
+		       const struct hash_ipport6_elem *data)
+{
+	return nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+	       nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+	       nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto);
+}
+
+/* Record the port of @d in @next; hash_ipport6_uadt() restarts a retried
+ * port range at h->next.port.
+ * NOTE(review): @next has the IPv4 element type; presumably the storage
+ * is shared between both variants -- confirm in ip_set_hash_gen.h.
+ */
+static inline void
+hash_ipport6_data_next(struct hash_ipport4_elem *next,
+		       const struct hash_ipport6_elem *d)
+{
+	next->port = d->port;
+}
+
+#undef MTYPE /* drop the IPv4 parameters before the header is included again for IPv6 */
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipport6 /* prefix shared by the IPv6 helpers defined above */
+#define PF 6
+#define HOST_MASK 128 /* full IPv6 prefix length */
+#define HKEY_DATALEN sizeof(struct hash_ipport6_elem)
+#define IP_SET_EMIT_CREATE /* presumably makes this inclusion emit hash_ipport_create() -- TODO confirm in the header */
+#include "ip_set_hash_gen.h"
+
+/* Packet-path add/del/test for IPv6: port and protocol come from
+ * ip_set_get_ip6_port() (direction chosen by IPSET_DIM_TWO_SRC), the
+ * address from ip6addrptr() (direction chosen by IPSET_DIM_ONE_SRC).
+ * Returns -EINVAL when no port/protocol could be obtained.
+ */
+static int
+hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	struct hash_ipport6_elem e = { };
+
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+
+	return set->variant->adt[adt](set, &e, &ext, &opt->ext,
+				      opt->cmdflags);
+}
+
+/* Netlink (userspace) add/del/test handler for IPv6 hash:ip,port.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_PORT and a non-zero IPSET_ATTR_PROTO are
+ * required; IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR are rejected. The port
+ * is forced to 0 for protocols that neither carry ports nor are ICMPv6.
+ * For add/del on a protocol with ports, IPSET_ATTR_PORT_TO expands the
+ * request to a port range; when @retried is set the walk restarts at
+ * h->next.port.
+ *
+ * Returns 0 on success or when ip_set_eexist() says the result may be
+ * ignored, otherwise the error code of the step that failed.
+ */
+static int
+hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipport *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipport6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 port, port_to;
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Check each helper separately: "a() || b()" would replace the
+	 * helper's own error code with the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+		e.port = 0;
+
+	/* Single element: a test, a portless protocol, or no port range */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(e.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	if (retried)
+		port = ntohs(h->next.port);
+	for (; port <= port_to; port++) {
+		e.port = htons(port);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+static struct ip_set_type hash_ipport_type __read_mostly = { /* "hash:ip,port" type descriptor; (un)registered by hash_ipport_init()/hash_ipport_fini() */
+ .name = "hash:ip,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC, /* this file provides both IPv4 and IPv6 handlers */
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipport_create, /* not defined in the visible text; presumably emitted by ip_set_hash_gen.h under IP_SET_EMIT_CREATE */
+ .create_policy = { /* expected netlink attribute types at set creation (presumably enforced by the ipset core) */
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* expected attribute types for element requests handled by the uadt functions */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module entry point: hand hash_ipport_type to ip_set_type_register()
+ * and pass its result back to the module loader.
+ */
+static int __init hash_ipport_init(void)
+{
+	return ip_set_type_register(&hash_ipport_type);
+}
+
+/* Module exit point: undo the registration done in hash_ipport_init(). */
+static void __exit hash_ipport_fini(void)
+{
+	ip_set_type_unregister(&hash_ipport_type);
+}
+
+module_init(hash_ipport_init);
+module_exit(hash_ipport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
new file mode 100644
index 00000000000..672655ffd57
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -0,0 +1,402 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0 /* oldest supported revision; the list below says what each later one added */
+/* 1 SCTP and UDPLITE support added */
+/* 2 Counters support added */
+/* 3 Comments support added */
+#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port,ip");
+
+/* Type specific function prefix: matches struct hash_ipportip and hash_ipportip_create used below */
+#define HTYPE hash_ipportip
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipportip4_elem { /* one IPv4 hash:ip,port,ip element */
+ __be32 ip; /* first address (IPSET_ATTR_IP), network byte order */
+ __be32 ip2; /* second address (IPSET_ATTR_IP2), network byte order */
+ __be16 port; /* port, network byte order */
+ u8 proto; /* protocol number taken from IPSET_ATTR_PROTO or the packet */
+ u8 padding; /* explicit pad byte; not compared by hash_ipportip4_data_equal() */
+};
+
+/* Element comparison: both addresses, the port and the protocol must
+ * all match. @multi is not touched by this set type.
+ */
+static inline bool
+hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
+			  const struct hash_ipportip4_elem *ip2,
+			  u32 *multi)
+{
+	if (ip1->ip != ip2->ip || ip1->ip2 != ip2->ip2)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* Emit @data into @skb as IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT
+ * and IPSET_ATTR_PROTO. Returns false (0) when all attributes were added
+ * and true (1) as soon as one of the nla_put helpers reports failure.
+ */
+static bool
+hash_ipportip4_data_list(struct sk_buff *skb,
+			 const struct hash_ipportip4_elem *data)
+{
+	return nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+	       nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
+	       nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+	       nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto);
+}
+
+/* Record first address and port of @d in @next. hash_ipportip4_uadt()
+ * restarts a retried range at h->next.ip / h->next.port (presumably
+ * @next is that h->next).
+ */
+static inline void
+hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
+			 const struct hash_ipportip4_elem *d)
+{
+	next->port = d->port;
+	next->ip = d->ip;
+}
+
+/* Common functions */
+#define MTYPE hash_ipportip4 /* prefix shared by the IPv4 helpers defined above */
+#define PF 4
+#define HOST_MASK 32 /* full IPv4 prefix length */
+#include "ip_set_hash_gen.h" /* presumably expands the generic hash code for the macros above -- confirm in the header */
+
+/* Packet-path add/del/test for IPv4: port and protocol come from
+ * ip_set_get_ip4_port() (direction chosen by IPSET_DIM_TWO_SRC); the
+ * two addresses come from ip4addrptr(), with directions chosen by
+ * IPSET_DIM_ONE_SRC and IPSET_DIM_THREE_SRC respectively.
+ * Returns -EINVAL when no port/protocol could be obtained.
+ */
+static int
+hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
+		    enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	struct hash_ipportip4_elem e = { };
+
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+
+	return set->variant->adt[adt](set, &e, &ext, &opt->ext,
+				      opt->cmdflags);
+}
+
+/* Netlink (userspace) add/del/test handler for IPv4 hash:ip,port,ip.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT and a non-zero
+ * IPSET_ATTR_PROTO are required. The port is forced to 0 for protocols
+ * that neither carry ports nor are ICMP. For add/del, IP_TO/CIDR and
+ * PORT_TO expand the first address and the port to a range that is
+ * walked element by element (the second address stays fixed); when
+ * @retried is set the walk restarts at h->next.ip / h->next.port.
+ *
+ * Returns 0 on success or when ip_set_eexist() says the result may be
+ * ignored, otherwise the error code of the step that failed.
+ */
+static int
+hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipportip *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipportip4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip, ip_to = 0, p = 0, port, port_to;
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Check each helper separately: "a() || b()" would replace the
+	 * helper's own error code with the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || e.proto == IPPROTO_ICMP))
+		e.port = 0;
+
+	/* Single element: a test, or no range attribute given */
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+	      tb[IPSET_ATTR_PORT_TO])) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip_to = ip = ntohl(e.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip_set_mask_from_to(ip, ip_to, cidr);
+	}
+
+	port_to = port = ntohs(e.port);
+	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	if (retried)
+		ip = ntohl(h->next.ip);
+	for (; !before(ip_to, ip); ip++) {
+		/* Only the first address of a retry resumes mid-range */
+		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+						       : port;
+		for (; p <= port_to; p++) {
+			e.ip = htonl(ip);
+			e.port = htons(p);
+			ret = adtfn(set, &e, &ext, &ext, flags);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			else
+				ret = 0;
+		}
+	}
+	return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipportip6_elem { /* one IPv6 hash:ip,port,ip element */
+ union nf_inet_addr ip; /* first address (IPSET_ATTR_IP); accessed through .in6 */
+ union nf_inet_addr ip2; /* second address (IPSET_ATTR_IP2); accessed through .in6 */
+ __be16 port; /* port, network byte order */
+ u8 proto; /* protocol number taken from IPSET_ATTR_PROTO or the packet */
+ u8 padding; /* explicit pad byte; not compared by hash_ipportip6_data_equal() */
+};
+
+/* Common functions */
+
+/* Element comparison: both IPv6 addresses, the port and the protocol
+ * must all match. @multi is not touched by this set type.
+ */
+static inline bool
+hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
+			  const struct hash_ipportip6_elem *ip2,
+			  u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+		return false;
+	if (!ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6))
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* Emit @data into @skb as IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT
+ * and IPSET_ATTR_PROTO. Returns false (0) when all attributes were added
+ * and true (1) as soon as one of the nla_put helpers reports failure.
+ */
+static bool
+hash_ipportip6_data_list(struct sk_buff *skb,
+			 const struct hash_ipportip6_elem *data)
+{
+	return nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+	       nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
+	       nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+	       nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto);
+}
+
+/* Record the port of @d in @next; hash_ipportip6_uadt() restarts a
+ * retried port range at h->next.port.
+ * NOTE(review): @next has the IPv4 element type; presumably the storage
+ * is shared between both variants -- confirm in ip_set_hash_gen.h.
+ */
+static inline void
+hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
+			 const struct hash_ipportip6_elem *d)
+{
+	next->port = d->port;
+}
+
+#undef MTYPE /* drop the IPv4 parameters before the header is included again for IPv6 */
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_ipportip6 /* prefix shared by the IPv6 helpers defined above */
+#define PF 6
+#define HOST_MASK 128 /* full IPv6 prefix length */
+#define IP_SET_EMIT_CREATE /* presumably makes this inclusion emit hash_ipportip_create() -- TODO confirm in the header */
+#include "ip_set_hash_gen.h"
+
+/* Packet-path add/del/test for IPv6: port and protocol come from
+ * ip_set_get_ip6_port() (direction chosen by IPSET_DIM_TWO_SRC); the
+ * two addresses come from ip6addrptr(), with directions chosen by
+ * IPSET_DIM_ONE_SRC and IPSET_DIM_THREE_SRC respectively.
+ * Returns -EINVAL when no port/protocol could be obtained.
+ */
+static int
+hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
+		    enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	struct hash_ipportip6_elem e = { };
+
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+
+	return set->variant->adt[adt](set, &e, &ext, &opt->ext,
+				      opt->cmdflags);
+}
+
+/* Netlink (userspace) add/del/test handler for IPv6 hash:ip,port,ip.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT and a non-zero
+ * IPSET_ATTR_PROTO are required; IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR
+ * are rejected. The port is forced to 0 for protocols that neither
+ * carry ports nor are ICMPv6. For add/del on a protocol with ports,
+ * IPSET_ATTR_PORT_TO expands the request to a port range; when @retried
+ * is set the walk restarts at h->next.port.
+ *
+ * Returns 0 on success or when ip_set_eexist() says the result may be
+ * ignored, otherwise the error code of the step that failed.
+ */
+static int
+hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_ipportip *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipportip6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 port, port_to;
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/* Check each helper separately: "a() || b()" would replace the
+	 * helper's own error code with the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+		e.port = 0;
+
+	/* Single element: a test, a portless protocol, or no port range */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(e.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	if (retried)
+		port = ntohs(h->next.port);
+	for (; port <= port_to; port++) {
+		e.port = htons(port);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+static struct ip_set_type hash_ipportip_type __read_mostly = { /* "hash:ip,port,ip" type descriptor; (un)registered by hash_ipportip_init()/hash_ipportip_fini() */
+ .name = "hash:ip,port,ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC, /* this file provides both IPv4 and IPv6 handlers */
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipportip_create, /* not defined in the visible text; presumably emitted by ip_set_hash_gen.h under IP_SET_EMIT_CREATE */
+ .create_policy = { /* expected netlink attribute types at set creation (presumably enforced by the ipset core) */
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* expected attribute types for element requests handled by the uadt functions */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module entry point: hand hash_ipportip_type to ip_set_type_register()
+ * and pass its result back to the module loader.
+ */
+static int __init hash_ipportip_init(void)
+{
+	return ip_set_type_register(&hash_ipportip_type);
+}
+
+/* Module exit point: undo the registration done in hash_ipportip_init(). */
+static void __exit hash_ipportip_fini(void)
+{
+	ip_set_type_unregister(&hash_ipportip_type);
+}
+
+module_init(hash_ipportip_init);
+module_exit(hash_ipportip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
new file mode 100644
index 00000000000..7308d84f927
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -0,0 +1,561 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Range as input support for IPv4 added */
+/* 3 nomatch flag support added */
+/* 4 Counters support added */
+/* 5 Comments support added */
+#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipportnet
+
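+ /* We squeeze the "nomatch" flag into the cidr byte, which leaves seven
+ * bits for the prefix length, so cidr == 0 is not supported.
+ * The prefix length is therefore stored internally as cidr - 1 and is
+ * converted back and forth wherever it enters or leaves an element.
+ */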
+#define IP_SET_HASH_WITH_NETS_PACKED
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variant */
+
+ /* Member element of the IPv4 hash:ip,port,net set */
+ struct hash_ipportnet4_elem {
+ __be32 ip; /* first dimension: host address, network byte order */
+ __be32 ip2; /* third dimension: network address, masked to the prefix */
+ __be16 port; /* second dimension: port, network byte order */
+ u8 cidr:7; /* prefix length of ip2 minus one */
+ u8 nomatch:1; /* set when the element carries IPSET_FLAG_NOMATCH */
+ u8 proto; /* protocol number from the packet or IPSET_ATTR_PROTO */
+ };
+
+/* Common functions */
+
+ /* Compare the key fields of two IPv4 elements. The nomatch flag is not
+  * compared and @multi is not used by this type.
+  */
+ static inline bool
+ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
+ 			   const struct hash_ipportnet4_elem *ip2,
+ 			   u32 *multi)
+ {
+ 	if (ip1->ip != ip2->ip || ip1->ip2 != ip2->ip2)
+ 		return false;
+ 	if (ip1->cidr != ip2->cidr || ip1->port != ip2->port)
+ 		return false;
+ 	return ip1->proto == ip2->proto;
+ }
+
+ /* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+ static inline int
+ hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem)
+ {
+ 	if (elem->nomatch)
+ 		return -ENOTEMPTY;
+ 	return 1;
+ }
+
+ /* Store the nomatch bit, which callers pass in the upper 16 bits of @flags. */
+ static inline void
+ hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags)
+ {
+ 	u32 cadt = flags >> 16;
+
+ 	elem->nomatch = (cadt & IPSET_FLAG_NOMATCH) ? 1 : 0;
+ }
+
+ /* Exchange *@flags with the element's nomatch bit. */
+ static inline void
+ hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags)
+ {
+ 	u8 saved = *flags;
+
+ 	*flags = elem->nomatch;
+ 	elem->nomatch = saved;
+ }
+
+ /* Apply prefix length @cidr to the element: mask ip2 and record the
+  * length in its stored (cidr - 1) form.
+  */
+ static inline void
+ hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
+ {
+ 	elem->cidr = cidr - 1;
+ 	elem->ip2 = elem->ip2 & ip_set_netmask(cidr);
+ }
+
+ /* Dump one element into @skb as netlink attributes.
+  * Returns true if any attribute could not be added, false on success.
+  */
+ static bool
+ hash_ipportnet4_data_list(struct sk_buff *skb,
+ 			  const struct hash_ipportnet4_elem *data)
+ {
+ 	u32 flags = 0;
+
+ 	if (data->nomatch)
+ 		flags = IPSET_FLAG_NOMATCH;
+
+ 	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip))
+ 		return true;
+ 	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2))
+ 		return true;
+ 	if (nla_put_net16(skb, IPSET_ATTR_PORT, data->port))
+ 		return true;
+ 	/* The prefix length is stored minus one; emit the real value. */
+ 	if (nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1))
+ 		return true;
+ 	if (nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ 		return true;
+ 	/* The flags attribute is only emitted when a flag is set. */
+ 	if (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+ 		return true;
+
+ 	return false;
+ }
+
+ /* Copy the position fields (ip, port, ip2) of @d into @next. */
+ static inline void
+ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
+ 			  const struct hash_ipportnet4_elem *d)
+ {
+ 	next->ip2 = d->ip2;
+ 	next->port = d->port;
+ 	next->ip = d->ip;
+ }
+
+#define MTYPE hash_ipportnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+ /* Packet-path add/del/test for the IPv4 variant: build an element from
+  * @skb and pass it to the handler selected by @adt.
+  * Returns -EINVAL when port and protocol cannot be read from the packet.
+  */
+ static int
+ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ 		     const struct xt_action_param *par,
+ 		     enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ {
+ 	const struct hash_ipportnet *h = set->data;
+ 	ipset_adtfn handler = set->variant->adt[adt];
+ 	struct hash_ipportnet4_elem elem = {
+ 		/* A test always uses the full host mask. */
+ 		.cidr = (adt == IPSET_TEST ? HOST_MASK :
+ 			 IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK)) - 1,
+ 	};
+ 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ 	bool port_src = opt->flags & IPSET_DIM_TWO_SRC;
+
+ 	if (!ip_set_get_ip4_port(skb, port_src, &elem.port, &elem.proto))
+ 		return -EINVAL;
+
+ 	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &elem.ip);
+ 	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &elem.ip2);
+ 	elem.ip2 = elem.ip2 & ip_set_netmask(elem.cidr + 1);
+
+ 	return handler(set, &elem, &ext, &opt->ext, opt->cmdflags);
+ }
+
+ /* Userspace (netlink) add/del/test handler for the IPv4 variant.
+  *
+  * Parses the attributes in @tb. A test, or a request without any range
+  * attribute, results in a single call of the handler selected by @adt.
+  * Otherwise the IP range, the port range and the second IP range are
+  * expanded and the handler is called for every combination, with the
+  * second range split into CIDR blocks by ip_set_range_to_cidr().
+  * When @retried is set, the expansion resumes from the position stored
+  * in h->next.
+  *
+  * @lineno receives IPSET_ATTR_LINENO when that attribute is present.
+  * Errors reported by the address and extension helpers are returned
+  * unchanged.
+  */
+ static int
+ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ 		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ {
+ 	const struct hash_ipportnet *h = set->data;
+ 	ipset_adtfn adtfn = set->variant->adt[adt];
+ 	struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
+ 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ 	u32 ip = 0, ip_to = 0, p = 0, port, port_to;
+ 	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
+ 	bool with_ports = false;
+ 	u8 cidr;
+ 	int ret;
+
+ 	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ 		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ 		return -IPSET_ERR_PROTOCOL;
+
+ 	if (tb[IPSET_ATTR_LINENO])
+ 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ 	/* Check each helper separately: "a() || b()" would collapse any
+ 	 * failure to the value 1 and lose the real error code.
+ 	 */
+ 	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ 	if (ret)
+ 		return ret;
+
+ 	ret = ip_set_get_extensions(set, tb, &ext);
+ 	if (ret)
+ 		return ret;
+
+ 	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ 	if (ret)
+ 		return ret;
+
+ 	if (tb[IPSET_ATTR_CIDR2]) {
+ 		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ 		if (!cidr || cidr > HOST_MASK)
+ 			return -IPSET_ERR_INVALID_CIDR;
+ 		/* Stored form of the prefix length is cidr - 1. */
+ 		e.cidr = cidr - 1;
+ 	}
+
+ 	if (tb[IPSET_ATTR_PORT])
+ 		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ 	else
+ 		return -IPSET_ERR_PROTOCOL;
+
+ 	if (tb[IPSET_ATTR_PROTO]) {
+ 		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ 		with_ports = ip_set_proto_with_ports(e.proto);
+
+ 		if (e.proto == 0)
+ 			return -IPSET_ERR_INVALID_PROTO;
+ 	} else
+ 		return -IPSET_ERR_MISSING_PROTO;
+
+ 	/* The port field is kept only for port-based protocols and ICMP. */
+ 	if (!(with_ports || e.proto == IPPROTO_ICMP))
+ 		e.port = 0;
+
+ 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ 		if (cadt_flags & IPSET_FLAG_NOMATCH)
+ 			flags |= (IPSET_FLAG_NOMATCH << 16);
+ 	}
+
+ 	/* Single-element case: a test, or no range attribute at all. */
+ 	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+ 	if (adt == IPSET_TEST ||
+ 	    !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports ||
+ 	      tb[IPSET_ATTR_IP2_TO])) {
+ 		e.ip = htonl(ip);
+ 		e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1));
+ 		ret = adtfn(set, &e, &ext, &ext, flags);
+ 		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ 		       ip_set_eexist(ret, flags) ? 0 : ret;
+ 	}
+
+ 	ip_to = ip;
+ 	if (tb[IPSET_ATTR_IP_TO]) {
+ 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ 		if (ret)
+ 			return ret;
+ 		if (ip > ip_to)
+ 			swap(ip, ip_to);
+ 	} else if (tb[IPSET_ATTR_CIDR]) {
+ 		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ 		if (!cidr || cidr > 32)
+ 			return -IPSET_ERR_INVALID_CIDR;
+ 		ip_set_mask_from_to(ip, ip_to, cidr);
+ 	}
+
+ 	port_to = port = ntohs(e.port);
+ 	if (tb[IPSET_ATTR_PORT_TO]) {
+ 		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ 		if (port > port_to)
+ 			swap(port, port_to);
+ 	}
+
+ 	ip2_to = ip2_from;
+ 	if (tb[IPSET_ATTR_IP2_TO]) {
+ 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ 		if (ret)
+ 			return ret;
+ 		if (ip2_from > ip2_to)
+ 			swap(ip2_from, ip2_to);
+ 		/* Reject a range that covers the whole address space. */
+ 		if (ip2_from + UINT_MAX == ip2_to)
+ 			return -IPSET_ERR_HASH_RANGE;
+ 	} else
+ 		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
+
+ 	/* On a retry, each loop level restarts from the saved position. */
+ 	if (retried)
+ 		ip = ntohl(h->next.ip);
+ 	for (; !before(ip_to, ip); ip++) {
+ 		e.ip = htonl(ip);
+ 		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ 						       : port;
+ 		for (; p <= port_to; p++) {
+ 			e.port = htons(p);
+ 			ip2 = retried &&
+ 			      ip == ntohl(h->next.ip) &&
+ 			      p == ntohs(h->next.port)
+ 				? ntohl(h->next.ip2) : ip2_from;
+ 			while (!after(ip2, ip2_to)) {
+ 				e.ip2 = htonl(ip2);
+ 				ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+ 								&cidr);
+ 				e.cidr = cidr - 1;
+ 				ret = adtfn(set, &e, &ext, &ext, flags);
+
+ 				if (ret && !ip_set_eexist(ret, flags))
+ 					return ret;
+ 				else
+ 					ret = 0;
+ 				ip2 = ip2_last + 1;
+ 			}
+ 		}
+ 	}
+ 	return ret;
+ }
+
+/* IPv6 variant */
+
+ struct hash_ipportnet6_elem { /* member element of the IPv6 hash:ip,port,net set */
+ union nf_inet_addr ip; /* first dimension: host address */
+ union nf_inet_addr ip2; /* third dimension: network address, masked to the prefix */
+ __be16 port; /* second dimension: port, network byte order */
+ u8 cidr:7; /* prefix length of ip2 minus one */
+ u8 nomatch:1; /* set when the element carries IPSET_FLAG_NOMATCH */
+ u8 proto; /* protocol number from the packet or IPSET_ATTR_PROTO */
+ };
+
+/* Common functions */
+
+ /* Compare the key fields of two IPv6 elements. The nomatch flag is not
+  * compared and @multi is not used by this type.
+  */
+ static inline bool
+ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
+ 			   const struct hash_ipportnet6_elem *ip2,
+ 			   u32 *multi)
+ {
+ 	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+ 		return false;
+ 	if (!ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6))
+ 		return false;
+ 	if (ip1->cidr != ip2->cidr || ip1->port != ip2->port)
+ 		return false;
+ 	return ip1->proto == ip2->proto;
+ }
+
+ /* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+ static inline int
+ hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem)
+ {
+ 	if (!elem->nomatch)
+ 		return 1;
+ 	return -ENOTEMPTY;
+ }
+
+ /* Store the nomatch bit, which callers pass in the upper 16 bits of @flags. */
+ static inline void
+ hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags)
+ {
+ 	u32 cadt = flags >> 16;
+
+ 	elem->nomatch = (cadt & IPSET_FLAG_NOMATCH) ? 1 : 0;
+ }
+
+ /* Exchange *@flags with the element's nomatch bit. */
+ static inline void
+ hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags)
+ {
+ 	u8 saved = *flags;
+
+ 	*flags = elem->nomatch;
+ 	elem->nomatch = saved;
+ }
+
+ /* Apply prefix length @cidr to the element: mask ip2 and record the
+  * length in its stored (cidr - 1) form.
+  */
+ static inline void
+ hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
+ {
+ 	elem->cidr = cidr - 1;
+ 	ip6_netmask(&elem->ip2, cidr);
+ }
+
+ /* Dump one element into @skb as netlink attributes.
+  * Returns true if any attribute could not be added, false on success.
+  */
+ static bool
+ hash_ipportnet6_data_list(struct sk_buff *skb,
+ 			  const struct hash_ipportnet6_elem *data)
+ {
+ 	u32 flags = 0;
+
+ 	if (data->nomatch)
+ 		flags = IPSET_FLAG_NOMATCH;
+
+ 	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6))
+ 		return true;
+ 	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6))
+ 		return true;
+ 	if (nla_put_net16(skb, IPSET_ATTR_PORT, data->port))
+ 		return true;
+ 	/* The prefix length is stored minus one; emit the real value. */
+ 	if (nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1))
+ 		return true;
+ 	if (nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ 		return true;
+ 	/* The flags attribute is only emitted when a flag is set. */
+ 	if (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+ 		return true;
+
+ 	return false;
+ }
+
+ /* Copy the port of @d into @next; it is the only field the IPv6 retry
+  * loop reads back. @next has the IPv4 element type.
+  * NOTE(review): presumably because the shared retry cursor is declared
+  * with that type in ip_set_hash_gen.h - confirm.
+  */
+ static inline void hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
+ 					     const struct hash_ipportnet6_elem *d)
+ {
+ 	__be16 port = d->port;
+
+ 	next->port = port;
+ }
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_ipportnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+ /* Packet-path add/del/test for the IPv6 variant: build an element from
+  * @skb and pass it to the handler selected by @adt.
+  * Returns -EINVAL when port and protocol cannot be read from the packet.
+  */
+ static int
+ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ 		     const struct xt_action_param *par,
+ 		     enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ {
+ 	const struct hash_ipportnet *h = set->data;
+ 	ipset_adtfn handler = set->variant->adt[adt];
+ 	struct hash_ipportnet6_elem elem = {
+ 		/* A test always uses the full host mask. */
+ 		.cidr = (adt == IPSET_TEST ? HOST_MASK :
+ 			 IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK)) - 1,
+ 	};
+ 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ 	bool port_src = opt->flags & IPSET_DIM_TWO_SRC;
+
+ 	if (!ip_set_get_ip6_port(skb, port_src, &elem.port, &elem.proto))
+ 		return -EINVAL;
+
+ 	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &elem.ip.in6);
+ 	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &elem.ip2.in6);
+ 	ip6_netmask(&elem.ip2, elem.cidr + 1);
+
+ 	return handler(set, &elem, &ext, &opt->ext, opt->cmdflags);
+ }
+
+ /* Userspace (netlink) add/del/test handler for the IPv6 variant.
+  *
+  * Parses the attributes in @tb. IP ranges are not supported here: an
+  * IPSET_ATTR_IP_TO attribute yields -IPSET_ERR_HASH_RANGE_UNSUPPORTED
+  * and an IPSET_ATTR_CIDR attribute yields -IPSET_ERR_PROTOCOL.
+  * A test, or a request without a usable port range, results in a single
+  * call of the handler selected by @adt; otherwise the handler is called
+  * once per port in the range. When @retried is set, the port loop
+  * resumes from h->next.port.
+  *
+  * @lineno receives IPSET_ATTR_LINENO when that attribute is present.
+  * Errors reported by the address and extension helpers are returned
+  * unchanged.
+  */
+ static int
+ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ 		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ {
+ 	const struct hash_ipportnet *h = set->data;
+ 	ipset_adtfn adtfn = set->variant->adt[adt];
+ 	struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
+ 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ 	u32 port, port_to;
+ 	bool with_ports = false;
+ 	u8 cidr;
+ 	int ret;
+
+ 	/* IPSET_ATTR_IP_TO is deliberately not part of this check, so
+ 	 * that the more specific error just below can be reached.
+ 	 */
+ 	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ 		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ 		     tb[IPSET_ATTR_CIDR]))
+ 		return -IPSET_ERR_PROTOCOL;
+ 	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ 		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ 	if (tb[IPSET_ATTR_LINENO])
+ 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ 	/* Check each helper separately: "a() || b()" would collapse any
+ 	 * failure to the value 1 and lose the real error code.
+ 	 */
+ 	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ 	if (ret)
+ 		return ret;
+
+ 	ret = ip_set_get_extensions(set, tb, &ext);
+ 	if (ret)
+ 		return ret;
+
+ 	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
+ 	if (ret)
+ 		return ret;
+
+ 	if (tb[IPSET_ATTR_CIDR2]) {
+ 		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ 		if (!cidr || cidr > HOST_MASK)
+ 			return -IPSET_ERR_INVALID_CIDR;
+ 		/* Stored form of the prefix length is cidr - 1. */
+ 		e.cidr = cidr - 1;
+ 	}
+
+ 	ip6_netmask(&e.ip2, e.cidr + 1);
+
+ 	if (tb[IPSET_ATTR_PORT])
+ 		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ 	else
+ 		return -IPSET_ERR_PROTOCOL;
+
+ 	if (tb[IPSET_ATTR_PROTO]) {
+ 		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ 		with_ports = ip_set_proto_with_ports(e.proto);
+
+ 		if (e.proto == 0)
+ 			return -IPSET_ERR_INVALID_PROTO;
+ 	} else
+ 		return -IPSET_ERR_MISSING_PROTO;
+
+ 	/* The port field is kept only for port-based protocols and ICMPv6. */
+ 	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ 		e.port = 0;
+
+ 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ 		if (cadt_flags & IPSET_FLAG_NOMATCH)
+ 			flags |= (IPSET_FLAG_NOMATCH << 16);
+ 	}
+
+ 	/* Single-element case: a test, or no usable port range. */
+ 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ 		ret = adtfn(set, &e, &ext, &ext, flags);
+ 		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ 		       ip_set_eexist(ret, flags) ? 0 : ret;
+ 	}
+
+ 	port = ntohs(e.port);
+ 	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ 	if (port > port_to)
+ 		swap(port, port_to);
+
+ 	if (retried)
+ 		port = ntohs(h->next.port);
+ 	for (; port <= port_to; port++) {
+ 		e.port = htons(port);
+ 		ret = adtfn(set, &e, &ext, &ext, flags);
+
+ 		if (ret && !ip_set_eexist(ret, flags))
+ 			return ret;
+ 		else
+ 			ret = 0;
+ 	}
+ 	return ret;
+ }
+
+ static struct ip_set_type hash_ipportnet_type __read_mostly = { /* descriptor passed to ip_set_type_register() by hash_ipportnet_init() */
+ .name = "hash:ip,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipportnet_create,
+ .create_policy = { /* netlink attribute types for the set-creation options */
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* netlink attribute types for element (adt) requests */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+ };
+
+ /* Module load hook: register the hash:ip,port,net set type. */
+ static int __init hash_ipportnet_init(void)
+ {
+ 	int err = ip_set_type_register(&hash_ipportnet_type);
+
+ 	return err;
+ }
+
+ /* Module unload hook: unregister the hash:ip,port,net set type. */
+ static void __exit hash_ipportnet_fini(void)
+ {
+ 	struct ip_set_type *type = &hash_ipportnet_type;
+
+ 	ip_set_type_unregister(type);
+ }
+
+module_init(hash_ipportnet_init);
+module_exit(hash_ipportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
new file mode 100644
index 00000000000..4c7d495783a
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -0,0 +1,397 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Range as input support for IPv4 added */
+/* 2 nomatch flag support added */
+/* 3 Counters support added */
+/* 4 Comments support added */
+#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net");
+
+/* Type specific function prefix */
+#define HTYPE hash_net
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variant */
+
+ /* Member element of the IPv4 hash:net set */
+ struct hash_net4_elem {
+ __be32 ip; /* network address, network byte order, masked to cidr */
+ u16 padding0; /* explicit padding; not referenced by the functions in this file */
+ u8 nomatch; /* non-zero when the element carries IPSET_FLAG_NOMATCH */
+ u8 cidr; /* prefix length (stored as is, unlike hash:ip,port,net) */
+ };
+
+/* Common functions */
+
+ /* Two IPv4 networks are the same entry when address and prefix length
+  * match. The nomatch flag is not compared and @multi is not used.
+  */
+ static inline bool hash_net4_data_equal(const struct hash_net4_elem *ip1,
+ 					const struct hash_net4_elem *ip2,
+ 					u32 *multi)
+ {
+ 	if (ip1->ip != ip2->ip)
+ 		return false;
+ 	return ip1->cidr == ip2->cidr;
+ }
+
+ /* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+ static inline int hash_net4_do_data_match(const struct hash_net4_elem *elem)
+ {
+ 	if (elem->nomatch)
+ 		return -ENOTEMPTY;
+ 	return 1;
+ }
+
+ /* Store the nomatch flag, which callers pass in the upper 16 bits of @flags. */
+ static inline void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags)
+ {
+ 	u32 cadt = flags >> 16;
+
+ 	elem->nomatch = cadt & IPSET_FLAG_NOMATCH;
+ }
+
+ /* Exchange *@flags with the element's nomatch field. */
+ static inline void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags)
+ {
+ 	u8 saved = *flags;
+
+ 	*flags = elem->nomatch;
+ 	elem->nomatch = saved;
+ }
+
+ /* Apply prefix length @cidr to the element: record it and mask the address. */
+ static inline void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
+ {
+ 	elem->cidr = cidr;
+ 	elem->ip = elem->ip & ip_set_netmask(cidr);
+ }
+
+ /* Dump one element into @skb as netlink attributes.
+  * Returns true if any attribute could not be added, false on success.
+  */
+ static bool
+ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
+ {
+ 	u32 flags = 0;
+
+ 	if (data->nomatch)
+ 		flags = IPSET_FLAG_NOMATCH;
+
+ 	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip))
+ 		return true;
+ 	if (nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr))
+ 		return true;
+ 	/* The flags attribute is only emitted when a flag is set. */
+ 	if (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+ 		return true;
+
+ 	return false;
+ }
+
+ /* Copy the address of @d into @next; the retry path of the uadt handler
+  * reads it back.
+  */
+ static inline void hash_net4_data_next(struct hash_net4_elem *next,
+ 				       const struct hash_net4_elem *d)
+ {
+ 	__be32 addr = d->ip;
+
+ 	next->ip = addr;
+ }
+
+#define MTYPE hash_net4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+ /* Packet-path add/del/test for the IPv4 variant: build an element from
+  * the packet address and pass it to the handler selected by @adt.
+  * Returns -EINVAL when the initial prefix length evaluates to zero.
+  */
+ static int
+ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ 	       const struct xt_action_param *par,
+ 	       enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ {
+ 	const struct hash_net *h = set->data;
+ 	ipset_adtfn handler = set->variant->adt[adt];
+ 	struct hash_net4_elem elem = {
+ 		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ 	};
+ 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ 	if (!elem.cidr)
+ 		return -EINVAL;
+
+ 	/* A test always uses the full host mask. */
+ 	if (adt == IPSET_TEST)
+ 		elem.cidr = HOST_MASK;
+
+ 	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &elem.ip);
+ 	elem.ip = elem.ip & ip_set_netmask(elem.cidr);
+
+ 	return handler(set, &elem, &ext, &opt->ext, opt->cmdflags);
+ }
+
+ /* Userspace (netlink) add/del/test handler for the IPv4 variant.
+  *
+  * Parses the attributes in @tb. A test, or a request without
+  * IPSET_ATTR_IP_TO, results in a single call of the handler selected by
+  * @adt. Otherwise the IP range is split into CIDR blocks by
+  * ip_set_range_to_cidr() and the handler is called once per block.
+  * When @retried is set, the range walk resumes from h->next.ip.
+  *
+  * @lineno receives IPSET_ATTR_LINENO when that attribute is present.
+  * Errors reported by the address and extension helpers are returned
+  * unchanged.
+  */
+ static int
+ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
+ 	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ {
+ 	const struct hash_net *h = set->data;
+ 	ipset_adtfn adtfn = set->variant->adt[adt];
+ 	struct hash_net4_elem e = { .cidr = HOST_MASK };
+ 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ 	u32 ip = 0, ip_to = 0, last;
+ 	int ret;
+
+ 	if (unlikely(!tb[IPSET_ATTR_IP] ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ 		return -IPSET_ERR_PROTOCOL;
+
+ 	if (tb[IPSET_ATTR_LINENO])
+ 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ 	/* Check each helper separately: "a() || b()" would collapse any
+ 	 * failure to the value 1 and lose the real error code.
+ 	 */
+ 	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ 	if (ret)
+ 		return ret;
+
+ 	ret = ip_set_get_extensions(set, tb, &ext);
+ 	if (ret)
+ 		return ret;
+
+ 	if (tb[IPSET_ATTR_CIDR]) {
+ 		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ 		if (!e.cidr || e.cidr > HOST_MASK)
+ 			return -IPSET_ERR_INVALID_CIDR;
+ 	}
+
+ 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ 		if (cadt_flags & IPSET_FLAG_NOMATCH)
+ 			flags |= (IPSET_FLAG_NOMATCH << 16);
+ 	}
+
+ 	/* Single-element case: a test, or no range end given. */
+ 	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+ 		e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ 		ret = adtfn(set, &e, &ext, &ext, flags);
+ 		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ 		       ip_set_eexist(ret, flags) ? 0 : ret;
+ 	}
+
+ 	ip_to = ip;
+ 	if (tb[IPSET_ATTR_IP_TO]) {
+ 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ 		if (ret)
+ 			return ret;
+ 		if (ip_to < ip)
+ 			swap(ip, ip_to);
+ 		/* Reject a range that covers the whole address space. */
+ 		if (ip + UINT_MAX == ip_to)
+ 			return -IPSET_ERR_HASH_RANGE;
+ 	}
+ 	if (retried)
+ 		ip = ntohl(h->next.ip);
+ 	while (!after(ip, ip_to)) {
+ 		e.ip = htonl(ip);
+ 		last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ 		ret = adtfn(set, &e, &ext, &ext, flags);
+ 		if (ret && !ip_set_eexist(ret, flags))
+ 			return ret;
+ 		else
+ 			ret = 0;
+ 		ip = last + 1;
+ 	}
+ 	return ret;
+ }
+
+/* IPv6 variant */
+
+ struct hash_net6_elem { /* member element of the IPv6 hash:net set */
+ union nf_inet_addr ip; /* network address, masked to cidr */
+ u16 padding0; /* explicit padding; not referenced by the functions in this file */
+ u8 nomatch; /* non-zero when the element carries IPSET_FLAG_NOMATCH */
+ u8 cidr; /* prefix length (stored as is) */
+ };
+
+/* Common functions */
+
+ /* Two IPv6 networks are the same entry when address and prefix length
+  * match. The nomatch flag is not compared and @multi is not used.
+  */
+ static inline bool hash_net6_data_equal(const struct hash_net6_elem *ip1,
+ 					const struct hash_net6_elem *ip2,
+ 					u32 *multi)
+ {
+ 	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+ 		return false;
+ 	return ip1->cidr == ip2->cidr;
+ }
+
+ /* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+ static inline int hash_net6_do_data_match(const struct hash_net6_elem *elem)
+ {
+ 	if (!elem->nomatch)
+ 		return 1;
+ 	return -ENOTEMPTY;
+ }
+
+ /* Store the nomatch flag, which callers pass in the upper 16 bits of @flags. */
+ static inline void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags)
+ {
+ 	u32 cadt = flags >> 16;
+
+ 	elem->nomatch = cadt & IPSET_FLAG_NOMATCH;
+ }
+
+ /* Exchange *@flags with the element's nomatch field. */
+ static inline void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags)
+ {
+ 	u8 saved = *flags;
+
+ 	*flags = elem->nomatch;
+ 	elem->nomatch = saved;
+ }
+
+ /* Apply prefix length @cidr to the element: record it and mask the address. */
+ static inline void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
+ {
+ 	elem->cidr = cidr;
+ 	ip6_netmask(&elem->ip, cidr);
+ }
+
+ /* Dump one element into @skb as netlink attributes.
+  * Returns true if any attribute could not be added, false on success.
+  */
+ static bool
+ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
+ {
+ 	u32 flags = 0;
+
+ 	if (data->nomatch)
+ 		flags = IPSET_FLAG_NOMATCH;
+
+ 	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6))
+ 		return true;
+ 	if (nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr))
+ 		return true;
+ 	/* The flags attribute is only emitted when a flag is set. */
+ 	if (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+ 		return true;
+
+ 	return false;
+ }
+
+ /* Intentionally a no-op: the IPv6 uadt handler in this file never reads
+  * a saved position.
+  */
+ static inline void hash_net6_data_next(struct hash_net4_elem *next,
+ 				       const struct hash_net6_elem *d)
+ {
+ 	/* nothing to record */
+ }
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_net6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+ /* Packet-path add/del/test for the IPv6 variant: build an element from
+  * the packet address and pass it to the handler selected by @adt.
+  * Returns -EINVAL when the initial prefix length evaluates to zero.
+  */
+ static int
+ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ 	       const struct xt_action_param *par,
+ 	       enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ {
+ 	const struct hash_net *h = set->data;
+ 	ipset_adtfn handler = set->variant->adt[adt];
+ 	struct hash_net6_elem elem = {
+ 		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ 	};
+ 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ 	if (!elem.cidr)
+ 		return -EINVAL;
+
+ 	/* A test always uses the full host mask. */
+ 	if (adt == IPSET_TEST)
+ 		elem.cidr = HOST_MASK;
+
+ 	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &elem.ip.in6);
+ 	ip6_netmask(&elem.ip, elem.cidr);
+
+ 	return handler(set, &elem, &ext, &opt->ext, opt->cmdflags);
+ }
+
+ /* Userspace (netlink) add/del/test handler for the IPv6 variant.
+  *
+  * Parses the attributes in @tb and calls the handler selected by @adt
+  * exactly once. IP ranges are not supported: IPSET_ATTR_IP_TO yields
+  * -IPSET_ERR_HASH_RANGE_UNSUPPORTED.
+  *
+  * @lineno receives IPSET_ATTR_LINENO when that attribute is present.
+  * Errors reported by the address and extension helpers are returned
+  * unchanged. @retried is not used.
+  */
+ static int
+ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
+ 	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ {
+ 	ipset_adtfn adtfn = set->variant->adt[adt];
+ 	struct hash_net6_elem e = { .cidr = HOST_MASK };
+ 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ 	int ret;
+
+ 	if (unlikely(!tb[IPSET_ATTR_IP] ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ 		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ 		return -IPSET_ERR_PROTOCOL;
+ 	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ 		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ 	if (tb[IPSET_ATTR_LINENO])
+ 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ 	/* Check each helper separately: "a() || b()" would collapse any
+ 	 * failure to the value 1 and lose the real error code.
+ 	 */
+ 	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ 	if (ret)
+ 		return ret;
+
+ 	ret = ip_set_get_extensions(set, tb, &ext);
+ 	if (ret)
+ 		return ret;
+
+ 	if (tb[IPSET_ATTR_CIDR])
+ 		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ 	if (!e.cidr || e.cidr > HOST_MASK)
+ 		return -IPSET_ERR_INVALID_CIDR;
+
+ 	ip6_netmask(&e.ip, e.cidr);
+
+ 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ 		if (cadt_flags & IPSET_FLAG_NOMATCH)
+ 			flags |= (IPSET_FLAG_NOMATCH << 16);
+ 	}
+
+ 	ret = adtfn(set, &e, &ext, &ext, flags);
+
+ 	return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ 	       ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ static struct ip_set_type hash_net_type __read_mostly = { /* descriptor passed to ip_set_type_register() by hash_net_init() */
+ .name = "hash:net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_net_create,
+ .create_policy = { /* netlink attribute types for the set-creation options */
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* netlink attribute types for element (adt) requests; NOTE(review): no IPSET_ATTR_LINENO entry although the uadt handlers read it - confirm intended */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+ };
+
+ /* Module load hook: register the hash:net set type. */
+ static int __init hash_net_init(void)
+ {
+ 	int err = ip_set_type_register(&hash_net_type);
+
+ 	return err;
+ }
+
+ /* Module unload hook: unregister the hash:net set type. */
+ static void __exit hash_net_fini(void)
+ {
+ 	struct ip_set_type *type = &hash_net_type;
+
+ 	ip_set_type_unregister(type);
+ }
+
+module_init(hash_net_init);
+module_exit(hash_net_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
new file mode 100644
index 00000000000..db2606805b3
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -0,0 +1,610 @@
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,iface type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 nomatch flag support added */
+/* 2 /0 support added */
+/* 3 Counters support added */
+/* 4 Comments support added */
+#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,iface");
+
+/* Interface name rbtree */
+
+ struct iface_node { /* one interface name stored in the per-set rbtree */
+ struct rb_node node; /* tree linkage; nodes are ordered by strcmp() on iface */
+ char iface[IFNAMSIZ]; /* NUL-terminated interface name */
+ };
+
+#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface)
+
+ /* Free every interface-name node of @root and reset it to an empty tree. */
+ static void rbtree_destroy(struct rb_root *root)
+ {
+ 	struct iface_node *cur, *tmp;
+
+ 	rbtree_postorder_for_each_entry_safe(cur, tmp, root, node)
+ 		kfree(cur);
+
+ 	*root = RB_ROOT;
+ }
+
+ /* Look up the name *@iface in the tree.
+  * Returns 1 and redirects *@iface to the stored copy when found,
+  * otherwise returns 0 and leaves *@iface untouched.
+  */
+ static int iface_test(struct rb_root *root, const char **iface)
+ {
+ 	struct rb_node *cur;
+
+ 	for (cur = root->rb_node; cur; ) {
+ 		const char *name = iface_data(cur);
+ 		int order = strcmp(*iface, name);
+
+ 		if (!order) {
+ 			*iface = name;
+ 			return 1;
+ 		}
+ 		cur = order < 0 ? cur->rb_left : cur->rb_right;
+ 	}
+
+ 	return 0;
+ }
+
+ /* Insert the name *@iface into the tree unless it is already there.
+  *
+  * On success *@iface points at the copy stored in the tree (existing or
+  * newly allocated) and 0 is returned. Returns -ENOMEM when a new node
+  * cannot be allocated.
+  */
+ static int
+ iface_add(struct rb_root *root, const char **iface)
+ {
+ 	struct rb_node **n = &(root->rb_node), *p = NULL;
+ 	struct iface_node *d;
+
+ 	while (*n) {
+ 		char *ifname = iface_data(*n);
+ 		int res = strcmp(*iface, ifname);
+
+ 		p = *n;
+ 		if (res < 0)
+ 			n = &((*n)->rb_left);
+ 		else if (res > 0)
+ 			n = &((*n)->rb_right);
+ 		else {
+ 			*iface = ifname;
+ 			return 0;
+ 		}
+ 	}
+
+ 	/* GFP_ATOMIC as in the original; this is also called from kadt. */
+ 	d = kzalloc(sizeof(*d), GFP_ATOMIC);
+ 	if (!d)
+ 		return -ENOMEM;
+ 	/* Bounded copy: never write past the fixed-size name buffer. */
+ 	strlcpy(d->iface, *iface, sizeof(d->iface));
+
+ 	rb_link_node(&d->node, p, n);
+ 	rb_insert_color(&d->node, root);
+
+ 	*iface = d->iface;
+ 	return 0;
+ }
+
+/* Type specific function prefix */
+#define HTYPE hash_netiface
+#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_RBTREE
+#define IP_SET_HASH_WITH_MULTI
+
+#define STREQ(a, b) (strcmp(a, b) == 0)
+
+/* IPv4 variant */
+
+ struct hash_netiface4_elem_hashed { /* leading fields of hash_netiface4_elem without the iface pointer; its size is HKEY_DATALEN */
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ };
+
+ /* Member element of the IPv4 hash:net,iface set */
+ struct hash_netiface4_elem {
+ __be32 ip; /* network address, network byte order, masked to cidr */
+ u8 physdev; /* 1 when iface names a bridge physical device (IPSET_FLAG_PHYSDEV) */
+ u8 cidr; /* prefix length */
+ u8 nomatch; /* non-zero when the element carries IPSET_FLAG_NOMATCH */
+ u8 elem; /* set to 1 by the kadt/uadt handlers; NOTE(review): purpose not visible here */
+ const char *iface; /* interface name; points into the rbtree once looked up or added */
+ };
+
+/* Common functions */
+
+ /* Compare two elements. *@multi is incremented whenever address and
+  * prefix length match, before physdev and the interface pointer are
+  * compared. Interface names are compared by pointer, not by content.
+  */
+ static inline bool
+ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
+ 			  const struct hash_netiface4_elem *ip2,
+ 			  u32 *multi)
+ {
+ 	if (ip1->ip != ip2->ip || ip1->cidr != ip2->cidr)
+ 		return false;
+
+ 	/* Same network: count it, as the original short-circuit chain did. */
+ 	if (!(++*multi))
+ 		return false;
+
+ 	if (ip1->physdev != ip2->physdev)
+ 		return false;
+ 	return ip1->iface == ip2->iface;
+ }
+
+ /* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+ static inline int
+ hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem)
+ {
+ 	if (elem->nomatch)
+ 		return -ENOTEMPTY;
+ 	return 1;
+ }
+
+ /* Store the nomatch flag, which callers pass in the upper 16 bits of @flags. */
+ static inline void
+ hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags)
+ {
+ 	u32 cadt = flags >> 16;
+
+ 	elem->nomatch = cadt & IPSET_FLAG_NOMATCH;
+ }
+
+ /* Exchange *@flags with the element's nomatch field. */
+ static inline void
+ hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags)
+ {
+ 	u8 saved = *flags;
+
+ 	*flags = elem->nomatch;
+ 	elem->nomatch = saved;
+ }
+
+ /* Apply prefix length @cidr to the element: record it and mask the address. */
+ static inline void
+ hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
+ {
+ 	elem->cidr = cidr;
+ 	elem->ip = elem->ip & ip_set_netmask(cidr);
+ }
+
+ /* Dump one element into @skb as netlink attributes.
+  * Returns true if any attribute could not be added, false on success.
+  */
+ static bool
+ hash_netiface4_data_list(struct sk_buff *skb,
+ 			 const struct hash_netiface4_elem *data)
+ {
+ 	u32 flags = 0;
+
+ 	if (data->physdev)
+ 		flags |= IPSET_FLAG_PHYSDEV;
+ 	if (data->nomatch)
+ 		flags |= IPSET_FLAG_NOMATCH;
+
+ 	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip))
+ 		return true;
+ 	if (nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr))
+ 		return true;
+ 	if (nla_put_string(skb, IPSET_ATTR_IFACE, data->iface))
+ 		return true;
+ 	/* The flags attribute is only emitted when a flag is set. */
+ 	if (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+ 		return true;
+
+ 	return false;
+ }
+
+ /* Copy the address of @d into @next. */
+ static inline void hash_netiface4_data_next(struct hash_netiface4_elem *next,
+ 					    const struct hash_netiface4_elem *d)
+ {
+ 	__be32 addr = d->ip;
+
+ 	next->ip = addr;
+ }
+
+#define MTYPE hash_netiface4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
+#include "ip_set_hash_gen.h"
+
+ static int
+ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ { /* packet-path add/del/test: builds the element from the packet address and an interface name, then calls the handler for adt */
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .elem = 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret;
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST) /* a test always uses the full host mask */
+ e.cidr = HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
+
+ #define IFACE(dir) (par->dir ? par->dir->name : NULL) /* name of par->in / par->out, or NULL */
+ #define PHYSDEV(dir) (nf_bridge->dir ? nf_bridge->dir->name : NULL) /* name of the bridge physin/physout device, or NULL */
+ #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) /* non-zero: second dimension is matched on the source side */
+
+ if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+ #ifdef CONFIG_BRIDGE_NETFILTER
+ const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+ if (!nf_bridge)
+ return -EINVAL;
+ e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
+ e.physdev = 1;
+ #else
+ e.iface = NULL; /* without bridge netfilter support this ends in -EINVAL below */
+ #endif
+ } else
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+ if (!e.iface)
+ return -EINVAL;
+ ret = iface_test(&h->rbtree, &e.iface); /* on a hit, e.iface now points at the stored name */
+ if (adt == IPSET_ADD) {
+ if (!ret) { /* unknown name: store it so the element can reference it */
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret) /* del/test with a name that was never stored: return 0 without calling the handler */
+ return ret;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+ }
+
+/*
+ * Userspace add/del/test for IPv4, driven by the netlink attributes in @tb.
+ *
+ * IPSET_ATTR_IP and IPSET_ATTR_IFACE are mandatory. A test, or a request
+ * without IPSET_ATTR_IP_TO, handles one network. Otherwise the range
+ * ip..ip_to is walked in CIDR sized steps and the generic handler is called
+ * for every step; with @retried the walk resumes at the address kept in
+ * h->next.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for missing/invalid attributes,
+ * -IPSET_ERR_INVALID_CIDR or -IPSET_ERR_HASH_RANGE for bad prefix lengths
+ * and ranges, the unmodified error of a failing helper, or the result of
+ * the generic handler (filtered through ip_set_enomatch()/ip_set_eexist()).
+ */
+static int
+hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	struct hash_netiface *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip = 0, ip_to = 0;
+	char iface[IFNAMSIZ];
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !tb[IPSET_ATTR_IFACE] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check the helpers one by one: chaining them with || would turn
+	 * any error code into the value 1.
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR]) {
+		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+		if (e.cidr > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+	}
+
+	/* Bounded copy: cannot write past iface[] whatever the attribute holds */
+	nla_strlcpy(iface, tb[IPSET_ATTR_IFACE], sizeof(iface));
+	e.iface = iface;
+	ret = iface_test(&h->rbtree, &e.iface);
+	if (adt == IPSET_ADD) {
+		if (!ret) {
+			ret = iface_add(&h->rbtree, &e.iface);
+			if (ret)
+				return ret;
+		}
+	} else if (!ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_PHYSDEV)
+			e.physdev = 1;
+		/* nomatch travels in the upper 16 bits of flags */
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+		e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+		       ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip_to < ip)
+			swap(ip, ip_to);
+		if (ip + UINT_MAX == ip_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else
+		ip_set_mask_from_to(ip, ip_to, e.cidr);
+
+	if (retried)
+		ip = ntohl(h->next.ip);
+	/*
+	 * Compare as plain unsigned values: after() is a signed wrap-around
+	 * comparison and skipped the whole loop for ranges wider than 2^31
+	 * addresses. Testing before the increment also keeps ip from
+	 * wrapping when the range ends at UINT_MAX.
+	 */
+	do {
+		e.ip = htonl(ip);
+		/* ip becomes the last address covered by this step */
+		ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		ret = 0;
+	} while (ip++ < ip_to);
+	return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netiface6_elem_hashed { /* its size is used as HKEY_DATALEN for the IPv6 variant */
+ union nf_inet_addr ip; /* IPv6 address, masked to cidr bits by the handlers */
+ u8 physdev; /* set to 1 when IPSET_FLAG_PHYSDEV is requested */
+ u8 cidr; /* prefix length, rejected by uadt when above HOST_MASK */
+ u8 nomatch; /* non-zero makes do_data_match return -ENOTEMPTY */
+ u8 elem; /* initialised to 1 by the kadt/uadt handlers */
+};
+
+struct hash_netiface6_elem { /* full IPv6 element: same leading fields as the _hashed struct plus iface */
+ union nf_inet_addr ip; /* IPv6 address, masked to cidr bits by the handlers */
+ u8 physdev; /* set to 1 when IPSET_FLAG_PHYSDEV is requested */
+ u8 cidr; /* prefix length, rejected by uadt when above HOST_MASK */
+ u8 nomatch; /* non-zero makes do_data_match return -ENOTEMPTY */
+ u8 elem; /* initialised to 1 by the kadt/uadt handlers */
+ const char *iface; /* interface name; data_equal compares the pointer, not the string */
+};
+
+/* Common functions */
+
+/*
+ * Compare two IPv6 elements.
+ *
+ * *multi is incremented once address and prefix length are found equal,
+ * before physdev and the iface pointer are compared.
+ */
+static inline bool
+hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
+			  const struct hash_netiface6_elem *ip2,
+			  u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+		return false;
+	if (ip1->cidr != ip2->cidr)
+		return false;
+	/* A counter that wraps to 0 ends the comparison as "not equal" */
+	if (!(++*multi))
+		return false;
+	if (ip1->physdev != ip2->physdev)
+		return false;
+
+	return ip1->iface == ip2->iface;
+}
+
+/* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+static inline int
+hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/* Store the nomatch bit found in the upper 16 bits of @flags. */
+static inline void
+hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags)
+{
+	u32 upper = flags >> 16;
+
+	elem->nomatch = upper & IPSET_FLAG_NOMATCH;
+}
+
+/* Exchange *flags with the nomatch field of @elem. */
+static inline void
+hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/* Record @cidr in the element and apply ip6_netmask() to its address. */
+static inline void
+hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;
+	ip6_netmask(&elem->ip, cidr);
+}
+
+/*
+ * Dump one IPv6 element into the netlink message @skb.
+ *
+ * Address, prefix length and interface name are always emitted; the CADT
+ * flags attribute is added only when physdev or nomatch is set.
+ *
+ * Returns false on success, true when one of the nla_put_*() calls
+ * reported a failure.
+ */
+static bool
+hash_netiface6_data_list(struct sk_buff *skb,
+			 const struct hash_netiface6_elem *data)
+{
+	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+	if (data->nomatch)
+		flags |= IPSET_FLAG_NOMATCH;
+	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+	    nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
+	    (flags &&
+	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+		goto nla_put_failure;
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Intentionally empty: nothing is saved in @next for IPv6 elements. */
+static inline void
+hash_netiface6_data_next(struct hash_netiface4_elem *next,
+			 const struct hash_netiface6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_netiface6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for IPv6.
+ *
+ * The element is built from the packet address (ip6addrptr(), masked with
+ * ip6_netmask()) and from the name of the in/out device, or of the bridge
+ * physin/physout device when IPSET_FLAG_PHYSDEV is set. The SRCDIR,
+ * IFACE() and PHYSDEV() macros come from hash_netiface4_kadt().
+ *
+ * Returns -EINVAL when the initial prefix length is 0 or when no bridge
+ * info / interface name is available, the iface_add() error if that call
+ * fails, 0 when iface_test() returns 0 for a non-add request, otherwise
+ * the result of the generic handler selected by @adt.
+ */
+static int
+hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    const struct xt_action_param *par,
+		    enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct hash_netiface *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface6_elem e = {
+		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+		.elem = 1,
+	};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	int ret;
+
+	if (e.cidr == 0)
+		return -EINVAL;
+	/* A test always uses the full host prefix length */
+	if (adt == IPSET_TEST)
+		e.cidr = HOST_MASK;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	ip6_netmask(&e.ip, e.cidr);
+
+	if (!(opt->cmdflags & IPSET_FLAG_PHYSDEV)) {
+		e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+	} else {
+#ifdef CONFIG_BRIDGE_NETFILTER
+		const struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+		if (!nf_bridge)
+			return -EINVAL;
+		e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
+		e.physdev = 1;
+#else
+		/* Without bridge netfilter there is no physdev to use */
+		e.iface = NULL;
+#endif
+	}
+
+	if (!e.iface)
+		return -EINVAL;
+
+	if (!iface_test(&h->rbtree, &e.iface)) {
+		/* iface_test() returned 0: only an add goes on to iface_add() */
+		if (adt != IPSET_ADD)
+			return 0;
+		ret = iface_add(&h->rbtree, &e.iface);
+		if (ret)
+			return ret;
+	}
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for IPv6, driven by the netlink attributes in @tb.
+ *
+ * IPSET_ATTR_IP and IPSET_ATTR_IFACE are mandatory; address ranges
+ * (IPSET_ATTR_IP_TO) are rejected.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for missing/invalid attributes,
+ * -IPSET_ERR_HASH_RANGE_UNSUPPORTED for a range, -IPSET_ERR_INVALID_CIDR
+ * for a prefix length above HOST_MASK, the unmodified error of a failing
+ * helper, or the result of the generic handler (filtered through
+ * ip_set_enomatch()/ip_set_eexist()).
+ */
+static int
+hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	struct hash_netiface *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	char iface[IFNAMSIZ];
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !tb[IPSET_ATTR_IFACE] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check the helpers one by one: chaining them with || would turn
+	 * any error code into the value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+	if (e.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+	ip6_netmask(&e.ip, e.cidr);
+
+	/* Bounded copy: cannot write past iface[] whatever the attribute holds */
+	nla_strlcpy(iface, tb[IPSET_ATTR_IFACE], sizeof(iface));
+	e.iface = iface;
+	ret = iface_test(&h->rbtree, &e.iface);
+	if (adt == IPSET_ADD) {
+		if (!ret) {
+			ret = iface_add(&h->rbtree, &e.iface);
+			if (ret)
+				return ret;
+		}
+	} else if (!ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_PHYSDEV)
+			e.physdev = 1;
+		/* nomatch travels in the upper 16 bits of flags */
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	ret = adtfn(set, &e, &ext, &ext, flags);
+
+	return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+	       ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_netiface_type __read_mostly = { /* descriptor registered by hash_netiface_init() */
+ .name = "hash:net,iface",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC, /* NOTE(review): presumably because IPv4 and IPv6 variants share this descriptor - confirm */
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netiface_create,
+ .create_policy = { /* attribute types for the create request */
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* attribute types for add/del/test requests */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1 }, /* fits the char iface[IFNAMSIZ] buffers in the uadt handlers */
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the hash:net,iface type and pass on the result. */
+static int __init
+hash_netiface_init(void)
+{
+	return ip_set_type_register(&hash_netiface_type);
+}
+
+/* Module exit: unregister the hash:net,iface type. */
+static void __exit
+hash_netiface_fini(void)
+{
+	ip_set_type_unregister(&hash_netiface_type);
+}
+
+module_init(hash_netiface_init);
+module_exit(hash_netiface_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
new file mode 100644
index 00000000000..3e99987e4bf
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -0,0 +1,481 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netnet
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variants */
+
+/* Member elements */
+struct hash_netnet4_elem { /* one IPv4 hash:net,net element */
+ union {
+ __be32 ip[2]; /* [0]: first network, [1]: second network */
+ __be64 ipcmp; /* both addresses as one value, used by data_equal and data_next */
+ };
+ u8 nomatch; /* non-zero makes do_data_match return -ENOTEMPTY */
+ union {
+ u8 cidr[2]; /* prefix lengths belonging to ip[0] and ip[1] */
+ u16 ccmp; /* both prefix lengths as one value, used by data_equal */
+ };
+};
+
+/* Common functions */
+
+/*
+ * Two elements are equal when both addresses (ipcmp) and both prefix
+ * lengths (ccmp) agree; @multi is not used.
+ */
+static inline bool
+hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
+			const struct hash_netnet4_elem *ip2,
+			u32 *multi)
+{
+	if (ip1->ipcmp != ip2->ipcmp)
+		return false;
+
+	return ip1->ccmp == ip2->ccmp;
+}
+
+/* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+static inline int
+hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/* Store the nomatch bit found in the upper 16 bits of @flags. */
+static inline void
+hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags)
+{
+	u32 upper = flags >> 16;
+
+	elem->nomatch = upper & IPSET_FLAG_NOMATCH;
+}
+
+/* Exchange *flags with the nomatch field of @elem. */
+static inline void
+hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/* Restore the second address of @elem from @orig; ip[0] is left alone. */
+static inline void
+hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
+			     struct hash_netnet4_elem *orig)
+{
+	elem->ip[1] = orig->ip[1];
+}
+
+/*
+ * Mask one of the two addresses with ip_set_netmask(@cidr) and record the
+ * prefix length: index 1 when @inner is true, index 0 otherwise.
+ */
+static inline void
+hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
+{
+	unsigned int idx = inner ? 1 : 0;
+
+	elem->ip[idx] &= ip_set_netmask(cidr);
+	elem->cidr[idx] = cidr;
+}
+
+/*
+ * Dump one IPv4 element into the netlink message @skb: both addresses,
+ * both prefix lengths and, only when nomatch is set, the CADT flags.
+ *
+ * Returns false on success, true when one of the nla_put_*() calls
+ * reported a failure.
+ */
+static bool
+hash_netnet4_data_list(struct sk_buff *skb,
+		       const struct hash_netnet4_elem *data)
+{
+	u32 flags = 0;
+
+	if (data->nomatch)
+		flags = IPSET_FLAG_NOMATCH;
+
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) ||
+	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+	    (flags &&
+	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+		return true;
+
+	return false;
+}
+
+/* Save both addresses of @d in @next with a single ipcmp copy. */
+static inline void
+hash_netnet4_data_next(struct hash_netnet4_elem *next,
+		       const struct hash_netnet4_elem *d)
+{
+	next->ipcmp = d->ipcmp;
+}
+
+#define MTYPE hash_netnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for IPv4: take both addresses from the packet
+ * in @skb (source or destination per dimension flag), mask them and call
+ * the generic handler selected by @adt. @par is not used.
+ */
+static int
+hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_netnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netnet4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	/* A test sets both prefix lengths to HOST_MASK through ccmp */
+	if (adt == IPSET_TEST)
+		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
+
+	/* First dimension */
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
+	e.ip[0] &= ip_set_netmask(e.cidr[0]);
+
+	/* Second dimension */
+	ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
+	e.ip[1] &= ip_set_netmask(e.cidr[1]);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for IPv4, driven by the netlink attributes in @tb.
+ *
+ * IPSET_ATTR_IP and IPSET_ATTR_IP2 are mandatory. A test, or a request
+ * with neither IPSET_ATTR_IP_TO nor IPSET_ATTR_IP2_TO, handles one pair of
+ * networks. Otherwise each dimension is either the given range or the
+ * network described by its address and prefix length, and the generic
+ * handler is called for every combination of CIDR sized steps. With
+ * @retried the walk resumes at the addresses kept in h->next.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for missing/invalid attributes,
+ * -IPSET_ERR_INVALID_CIDR or -IPSET_ERR_HASH_RANGE for bad prefix lengths
+ * and ranges, the unmodified error of a failing helper, or the result of
+ * the generic handler (filtered through ip_set_enomatch()/ip_set_eexist()).
+ */
+static int
+hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_netnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netnet4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip = 0, ip_to = 0;
+	u32 ip2 = 0, ip2_from = 0, ip2_to = 0;
+	u8 cidr, cidr2;
+	int ret;
+
+	e.cidr[0] = e.cidr[1] = HOST_MASK;
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check the helpers one by one: chaining them with || would turn
+	 * any error code into the value 1.
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR]) {
+		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+		if (!cidr || cidr > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+		e.cidr[0] = cidr;
+	}
+
+	if (tb[IPSET_ATTR_CIDR2]) {
+		cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+		if (!cidr2 || cidr2 > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+		e.cidr[1] = cidr2;
+	}
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		/* nomatch travels in the upper 16 bits of flags */
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	/*
+	 * Single element only when no range is given at all: a range on
+	 * just one dimension must not be silently dropped.
+	 */
+	if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
+				   tb[IPSET_ATTR_IP2_TO])) {
+		e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
+		e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+		       ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip_to = ip;
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip_to < ip)
+			swap(ip, ip_to);
+		if (ip + UINT_MAX == ip_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		/* No range here: cover the network given by address/cidr */
+		ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+	}
+
+	ip2_to = ip2_from;
+	if (tb[IPSET_ATTR_IP2_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+		if (ret)
+			return ret;
+		if (ip2_to < ip2_from)
+			swap(ip2_from, ip2_to);
+		if (ip2_from + UINT_MAX == ip2_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		/* No range here: cover the network given by address/cidr */
+		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+	}
+
+	if (retried) {
+		ip = ntohl(h->next.ip[0]);
+		ip2 = ntohl(h->next.ip[1]);
+	} else {
+		ip2 = ip2_from;
+	}
+
+	/*
+	 * Compare as plain unsigned values: after() is a signed wrap-around
+	 * comparison and skipped the loops for ranges wider than 2^31
+	 * addresses. Testing before the increment also keeps the counters
+	 * from wrapping when a range ends at UINT_MAX.
+	 */
+	do {
+		e.ip[0] = htonl(ip);
+		/* ip becomes the last address covered by this step */
+		ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
+		do {
+			e.ip[1] = htonl(ip2);
+			ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
+			ret = adtfn(set, &e, &ext, &ext, flags);
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			ret = 0;
+		} while (ip2++ < ip2_to);
+		/* Later outer steps walk the whole second range again */
+		ip2 = ip2_from;
+	} while (ip++ < ip_to);
+	return ret;
+}
+
+/* IPv6 variants */
+
+struct hash_netnet6_elem { /* one IPv6 hash:net,net element */
+ union nf_inet_addr ip[2]; /* [0]: first network, [1]: second network */
+ u8 nomatch; /* non-zero makes do_data_match return -ENOTEMPTY */
+ union {
+ u8 cidr[2]; /* prefix lengths belonging to ip[0] and ip[1] */
+ u16 ccmp; /* both prefix lengths as one value, used by data_equal */
+ };
+};
+
+/* Common functions */
+
+/*
+ * Two elements are equal when both IPv6 addresses and both prefix lengths
+ * (ccmp) agree; @multi is not used.
+ */
+static inline bool
+hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
+			const struct hash_netnet6_elem *ip2,
+			u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6))
+		return false;
+	if (!ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6))
+		return false;
+
+	return ip1->ccmp == ip2->ccmp;
+}
+
+/* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+static inline int
+hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/* Store the nomatch bit found in the upper 16 bits of @flags. */
+static inline void
+hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags)
+{
+	u32 upper = flags >> 16;
+
+	elem->nomatch = upper & IPSET_FLAG_NOMATCH;
+}
+
+/* Exchange *flags with the nomatch field of @elem. */
+static inline void
+hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/* Restore the second address of @elem from @orig; ip[0] is left alone. */
+static inline void
+hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
+			     struct hash_netnet6_elem *orig)
+{
+	elem->ip[1] = orig->ip[1];
+}
+
+/*
+ * Apply ip6_netmask(@cidr) to one of the two addresses and record the
+ * prefix length: index 1 when @inner is true, index 0 otherwise.
+ */
+static inline void
+hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
+{
+	unsigned int idx = inner ? 1 : 0;
+
+	ip6_netmask(&elem->ip[idx], cidr);
+	elem->cidr[idx] = cidr;
+}
+
+/*
+ * Dump one IPv6 element into the netlink message @skb: both addresses,
+ * both prefix lengths and, only when nomatch is set, the CADT flags.
+ *
+ * Returns false on success, true when one of the nla_put_*() calls
+ * reported a failure.
+ */
+static bool
+hash_netnet6_data_list(struct sk_buff *skb,
+		       const struct hash_netnet6_elem *data)
+{
+	u32 flags = 0;
+
+	if (data->nomatch)
+		flags = IPSET_FLAG_NOMATCH;
+
+	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) ||
+	    nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+	    (flags &&
+	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+		return true;
+
+	return false;
+}
+
+/* Intentionally empty: nothing is saved in @next for IPv6 elements. */
+static inline void
+hash_netnet6_data_next(struct hash_netnet4_elem *next,
+		       const struct hash_netnet6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for IPv6: take both addresses from the packet
+ * in @skb (source or destination per dimension flag), mask them and call
+ * the generic handler selected by @adt. @par is not used.
+ */
+static int
+hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_netnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netnet6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	/* A test sets both prefix lengths to HOST_MASK through ccmp */
+	if (adt == IPSET_TEST)
+		e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
+
+	/* First dimension */
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
+	ip6_netmask(&e.ip[0], e.cidr[0]);
+
+	/* Second dimension */
+	ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
+	ip6_netmask(&e.ip[1], e.cidr[1]);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for IPv6, driven by the netlink attributes in @tb.
+ *
+ * IPSET_ATTR_IP and IPSET_ATTR_IP2 are mandatory; address ranges on either
+ * dimension are rejected. @retried is not used.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for missing/invalid attributes,
+ * -IPSET_ERR_HASH_RANGE_UNSUPPORTED for a range, -IPSET_ERR_INVALID_CIDR
+ * for a prefix length of 0 or above HOST_MASK, the unmodified error of a
+ * failing helper, or the result of the generic handler (filtered through
+ * ip_set_enomatch()/ip_set_eexist()).
+ */
+static int
+hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netnet6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	int ret;
+
+	e.cidr[0] = e.cidr[1] = HOST_MASK;
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check the helpers one by one: chaining them with || would turn
+	 * any error code into the value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+	if (tb[IPSET_ATTR_CIDR2])
+		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+	if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
+	    e.cidr[1] > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+
+	ip6_netmask(&e.ip[0], e.cidr[0]);
+	ip6_netmask(&e.ip[1], e.cidr[1]);
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		/* nomatch travels in the upper 16 bits of flags */
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	ret = adtfn(set, &e, &ext, &ext, flags);
+
+	return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+	       ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_netnet_type __read_mostly = { /* descriptor registered by hash_netnet_init() */
+ .name = "hash:net,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC, /* NOTE(review): presumably because IPv4 and IPv6 variants share this descriptor - confirm */
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netnet_create,
+ .create_policy = { /* attribute types for the create request */
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* attribute types for add/del/test requests */
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the hash:net,net type and pass on the result. */
+static int __init
+hash_netnet_init(void)
+{
+	return ip_set_type_register(&hash_netnet_type);
+}
+
+/* Module exit: unregister the hash:net,net type. */
+static void __exit
+hash_netnet_fini(void)
+{
+	ip_set_type_unregister(&hash_netnet_type);
+}
+
+module_init(hash_netnet_init);
+module_exit(hash_netnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
new file mode 100644
index 00000000000..1c645fbd09c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -0,0 +1,509 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Range as input support for IPv4 added */
+/* 3 nomatch flag support added */
+/* 4 Counters support added */
+/* 5 Comments support added */
+#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port");
+
+/* Type specific function prefix */
+#define HTYPE hash_netport
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* We squeeze the "nomatch" flag into cidr, since cidr == 0 is not
+ * supported. The price is that cidr - 1 is stored internally and has
+ * to be converted back and forth.
+ */
+#define IP_SET_HASH_WITH_NETS_PACKED
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_netport4_elem { /* one IPv4 hash:net,port element */
+ __be32 ip; /* network address, masked to the prefix length */
+ __be16 port; /* port in network byte order */
+ u8 proto; /* protocol number; uadt rejects 0 */
+ u8 cidr:7; /* stored as prefix length - 1 */
+ u8 nomatch:1; /* set makes do_data_match return -ENOTEMPTY */
+};
+
+/* Common functions */
+
+/*
+ * Two elements are equal when address, port, protocol and stored prefix
+ * length all agree; @multi is not used.
+ */
+static inline bool
+hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
+			 const struct hash_netport4_elem *ip2,
+			 u32 *multi)
+{
+	if (ip1->ip != ip2->ip)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+	if (ip1->proto != ip2->proto)
+		return false;
+
+	return ip1->cidr == ip2->cidr;
+}
+
+/* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+static inline int
+hash_netport4_do_data_match(const struct hash_netport4_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/*
+ * Set the one-bit nomatch field when IPSET_FLAG_NOMATCH is present in the
+ * upper 16 bits of @flags, clear it otherwise.
+ */
+static inline void
+hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags)
+{
+	u32 upper = flags >> 16;
+
+	elem->nomatch = (upper & IPSET_FLAG_NOMATCH) ? 1 : 0;
+}
+
+/* Exchange *flags with the one-bit nomatch field of @elem. */
+static inline void
+hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/*
+ * Mask the address with ip_set_netmask(@cidr); the prefix length is
+ * stored minus one.
+ */
+static inline void
+hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr - 1;
+	elem->ip &= ip_set_netmask(cidr);
+}
+
+/*
+ * Dump one IPv4 element into the netlink message @skb.
+ *
+ * The prefix length is emitted as stored value + 1; the CADT flags
+ * attribute is added only when nomatch is set.
+ *
+ * Returns false on success, true when one of the nla_put_*() calls
+ * reported a failure.
+ */
+static bool
+hash_netport4_data_list(struct sk_buff *skb,
+			const struct hash_netport4_elem *data)
+{
+	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
+	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+	    (flags &&
+	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+		goto nla_put_failure;
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Save address and port of @d in @next. */
+static inline void
+hash_netport4_data_next(struct hash_netport4_elem *next,
+			const struct hash_netport4_elem *d)
+{
+	next->port = d->port;
+	next->ip = d->ip;
+}
+
+#define MTYPE hash_netport4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for IPv4: take port/protocol and the address
+ * from the packet in @skb and call the generic handler selected by @adt.
+ * @par is not used.
+ *
+ * Returns -EINVAL when ip_set_get_ip4_port() yields no port, otherwise
+ * the result of the generic handler.
+ */
+static int
+hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		   const struct xt_action_param *par,
+		   enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_netport *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	/* cidr is kept as prefix length - 1 */
+	struct hash_netport4_elem e = {
+		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+	};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	bool have_port;
+
+	/* A test always uses the full host prefix length */
+	if (adt == IPSET_TEST)
+		e.cidr = HOST_MASK - 1;
+
+	have_port = ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+					&e.port, &e.proto);
+	if (!have_port)
+		return -EINVAL;
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+	e.ip &= ip_set_netmask(e.cidr + 1);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for IPv4, driven by the netlink attributes in @tb.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are mandatory. A
+ * test, or a request with neither a usable port range nor
+ * IPSET_ATTR_IP_TO, handles one element. Otherwise the address range (or
+ * the network given by address/cidr) is walked in CIDR sized steps and the
+ * generic handler is called for every port of every step. With @retried
+ * the walk resumes at the address and port kept in h->next.
+ *
+ * Returns -IPSET_ERR_PROTOCOL for missing/invalid attributes,
+ * -IPSET_ERR_INVALID_CIDR, -IPSET_ERR_INVALID_PROTO,
+ * -IPSET_ERR_MISSING_PROTO or -IPSET_ERR_HASH_RANGE for bad input, the
+ * unmodified error of a failing helper, or the result of the generic
+ * handler (filtered through ip_set_enomatch()/ip_set_eexist()).
+ */
+static int
+hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
+		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_netport *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 port, port_to, p = 0, ip = 0, ip_to = 0;
+	bool with_ports = false;
+	u8 cidr;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check the helpers one by one: chaining them with || would turn
+	 * any error code into the value 1.
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR]) {
+		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+		if (!cidr || cidr > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+		/* stored as prefix length - 1 */
+		e.cidr = cidr - 1;
+	}
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	/* The port value is kept only for port protocols and ICMP */
+	if (!(with_ports || e.proto == IPPROTO_ICMP))
+		e.port = 0;
+
+	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		/* nomatch travels in the upper 16 bits of flags */
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) {
+		e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1));
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+		       ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = port_to = ntohs(e.port);
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port_to < port)
+			swap(port, port_to);
+	}
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip_to < ip)
+			swap(ip, ip_to);
+		if (ip + UINT_MAX == ip_to)
+			return -IPSET_ERR_HASH_RANGE;
+	} else
+		ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
+
+	if (retried) {
+		ip = ntohl(h->next.ip);
+		p = ntohs(h->next.port);
+	} else {
+		p = port;
+	}
+
+	/*
+	 * Compare as plain unsigned values: after() is a signed wrap-around
+	 * comparison and skipped the loop for ranges wider than 2^31
+	 * addresses. Testing before the increment also keeps ip from
+	 * wrapping when the range ends at UINT_MAX.
+	 */
+	do {
+		e.ip = htonl(ip);
+		/* ip becomes the last address covered by this step */
+		ip = ip_set_range_to_cidr(ip, ip_to, &cidr);
+		e.cidr = cidr - 1;
+		for (; p <= port_to; p++) {
+			e.port = htons(p);
+			ret = adtfn(set, &e, &ext, &ext, flags);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			ret = 0;
+		}
+		/* Later steps walk the whole port range again */
+		p = port;
+	} while (ip++ < ip_to);
+	return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netport6_elem { /* one IPv6 hash:net,port element */
+ union nf_inet_addr ip; /* network address, masked to the prefix length */
+ __be16 port; /* port in network byte order */
+ u8 proto; /* protocol number */
+ u8 cidr:7; /* stored as prefix length - 1 */
+ u8 nomatch:1; /* set makes do_data_match return -ENOTEMPTY */
+};
+
+/* Common functions */
+
+/*
+ * Two elements are equal when address, port, protocol and stored prefix
+ * length all agree; @multi is not used.
+ */
+static inline bool
+hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
+			 const struct hash_netport6_elem *ip2,
+			 u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6))
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+	if (ip1->proto != ip2->proto)
+		return false;
+
+	return ip1->cidr == ip2->cidr;
+}
+
+/* Result for a found element: -ENOTEMPTY if it is flagged nomatch, else 1. */
+static inline int
+hash_netport6_do_data_match(const struct hash_netport6_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/*
+ * Set the one-bit nomatch field when IPSET_FLAG_NOMATCH is present in the
+ * upper 16 bits of @flags, clear it otherwise.
+ */
+static inline void
+hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags)
+{
+	u32 upper = flags >> 16;
+
+	elem->nomatch = (upper & IPSET_FLAG_NOMATCH) ? 1 : 0;
+}
+
+/* Exchange *flags with the one-bit nomatch field of @elem. */
+static inline void
+hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/*
+ * Apply ip6_netmask(@cidr) to the address; the prefix length is stored
+ * minus one.
+ */
+static inline void
+hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr - 1;
+	ip6_netmask(&elem->ip, cidr);
+}
+
+/*
+ * Dump one IPv6 element into the netlink message @skb.
+ *
+ * The prefix length is emitted as stored value + 1; the CADT flags
+ * attribute is added only when nomatch is set.
+ *
+ * Returns false on success, true when one of the nla_put_*() calls
+ * reported a failure.
+ */
+static bool
+hash_netport6_data_list(struct sk_buff *skb,
+			const struct hash_netport6_elem *data)
+{
+	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+	    nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+	    nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
+	    nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+	    (flags &&
+	     nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+		goto nla_put_failure;
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Save only the port of @d in @next; the IPv6 address is not kept. */
+static inline void
+hash_netport6_data_next(struct hash_netport4_elem *next,
+			const struct hash_netport6_elem *d)
+{
+	next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netport6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for IPv6: take port/protocol and the address
+ * from the packet in @skb and call the generic handler selected by @adt.
+ * @par is not used.
+ *
+ * Returns -EINVAL when ip_set_get_ip6_port() yields no port, otherwise
+ * the result of the generic handler.
+ */
+static int
+hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		   const struct xt_action_param *par,
+		   enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_netport *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	/* cidr is kept as prefix length - 1 */
+	struct hash_netport6_elem e = {
+		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+	};
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+	bool have_port;
+
+	/* A test always uses the full host prefix length */
+	if (adt == IPSET_TEST)
+		e.cidr = HOST_MASK - 1;
+
+	have_port = ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+					&e.port, &e.proto);
+	if (!have_port)
+		return -EINVAL;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	ip6_netmask(&e.ip, e.cidr + 1);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for the IPv6 variant.
+ *
+ * Validates the netlink attributes in @tb, builds the element and runs the
+ * variant handler for @adt. When a port range is given (and the protocol
+ * has ports, and this is not a test) the handler is called once per port;
+ * with @retried set, the range resumes at h->next.port.
+ *
+ * IPv6 address ranges are rejected with -IPSET_ERR_HASH_RANGE_UNSUPPORTED.
+ * Returns the handler's result or a negative error code.
+ */
+static int
+hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
+		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_netport *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 port, port_to;
+	bool with_ports = false;
+	u8 cidr;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check each helper on its own: chaining them with || would collapse
+	 * the returned error code into the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+	if (ret)
+		return ret;
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR]) {
+		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+		if (!cidr || cidr > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+		/* The element stores the prefix minus one */
+		e.cidr = cidr - 1;
+	}
+	ip6_netmask(&e.ip, e.cidr + 1);
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	/* The port is only kept for port-based protocols and ICMPv6 */
+	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+		e.port = 0;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	/* Single element: test, portless protocol or no port range */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+		       ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(e.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	if (retried)
+		port = ntohs(h->next.port);
+	for (; port <= port_to; port++) {
+		e.port = htons(port);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		/* An ignorable "exists" result counts as success */
+		ret = 0;
+	}
+	return ret;
+}
+
+/*
+ * Type descriptor for "hash:net,port". It is registered by
+ * hash_netport_init() and unregistered by hash_netport_fini().
+ */
+static struct ip_set_type hash_netport_type __read_mostly = {
+ .name = "hash:net,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the hash:net,port type and return the result. */
+static int __init
+hash_netport_init(void)
+{
+	int err = ip_set_type_register(&hash_netport_type);
+
+	return err;
+}
+
+/* Module exit: unregister the type registered by hash_netport_init(). */
+static void __exit
+hash_netport_fini(void)
+{
+ ip_set_type_unregister(&hash_netport_type);
+}
+
+module_init(hash_netport_init);
+module_exit(hash_netport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
new file mode 100644
index 00000000000..c0d2ba73f8b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -0,0 +1,587 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 0 Comments support added */
+#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netportnet
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variant */
+
+/* Member elements */
+/*
+ * IPv4 element: two networks plus port and protocol. The unions let both
+ * addresses (ipcmp) and both prefix lengths (ccmp) be compared and copied
+ * as single words.
+ */
+struct hash_netportnet4_elem {
+ union {
+ __be32 ip[2]; /* [0] is listed as IPSET_ATTR_IP, [1] as IPSET_ATTR_IP2 */
+ __be64 ipcmp; /* both addresses as one value */
+ };
+ __be16 port;
+ union {
+ u8 cidr[2]; /* prefix lengths of ip[0] and ip[1] */
+ u16 ccmp; /* both prefix lengths as one value */
+ };
+ u8 nomatch:1; /* set from IPSET_FLAG_NOMATCH */
+ u8 proto;
+};
+
+/* Common functions */
+
+/*
+ * Elements are equal when both addresses, both prefix lengths, the port and
+ * the protocol match. @multi is not used by this type.
+ */
+static inline bool
+hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
+			    const struct hash_netportnet4_elem *ip2,
+			    u32 *multi)
+{
+	if (ip1->ipcmp != ip2->ipcmp)
+		return false;
+	if (ip1->ccmp != ip2->ccmp)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* Result for a found element: -ENOTEMPTY for a "nomatch" entry, else 1. */
+static inline int
+hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/*
+ * Set the element's nomatch bit from @flags, where IPSET_FLAG_NOMATCH is
+ * carried in the upper 16 bits.
+ */
+static inline void
+hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags)
+{
+	const u32 cadt_flags = flags >> 16;
+
+	elem->nomatch = (cadt_flags & IPSET_FLAG_NOMATCH) ? 1 : 0;
+}
+
+/* Exchange *@flags with the element's nomatch bit. */
+static inline void
+hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/* Restore the second address of @elem from @orig; ip[0] is left untouched. */
+static inline void
+hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
+ struct hash_netportnet4_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+/*
+ * Mask one of the element's two addresses with a @cidr-bit netmask and
+ * record the prefix length: the second network when @inner is true, the
+ * first one otherwise.
+ */
+static inline void
+hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
+			      u8 cidr, bool inner)
+{
+	const unsigned int idx = inner ? 1 : 0;
+
+	elem->ip[idx] &= ip_set_netmask(cidr);
+	elem->cidr[idx] = cidr;
+}
+
+/*
+ * Put the element's attributes into @skb: both addresses, port, both
+ * prefix lengths, protocol and, if set, the nomatch flag.
+ *
+ * Returns false on success and true if any attribute could not be added.
+ */
+static bool
+hash_netportnet4_data_list(struct sk_buff *skb,
+			   const struct hash_netportnet4_elem *data)
+{
+	u32 flags = 0;
+
+	if (data->nomatch)
+		flags = IPSET_FLAG_NOMATCH;
+
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]))
+		return true;
+	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]))
+		return true;
+	if (nla_put_net16(skb, IPSET_ATTR_PORT, data->port))
+		return true;
+	if (nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]))
+		return true;
+	if (nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]))
+		return true;
+	if (nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+		return true;
+	if (flags &&
+	    nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+		return true;
+
+	return false;
+}
+
+/*
+ * Copy both addresses and the port of @d into @next.
+ * hash_netportnet4_uadt() reads h->next.ip[] and h->next.port to resume a
+ * range when it is called with retried set.
+ */
+static inline void
+hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
+ const struct hash_netportnet4_elem *d)
+{
+ next->ipcmp = d->ipcmp;
+ next->port = d->port;
+}
+
+#define MTYPE hash_netportnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for the IPv4 variant: build the element from the
+ * packet in @skb and pass it to the variant handler selected by @adt.
+ *
+ * Returns -EINVAL if port and protocol cannot be read from the packet,
+ * otherwise the handler's result.
+ */
+static int
+hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		      const struct xt_action_param *par,
+		      enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_netportnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netportnet4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	if (adt == IPSET_TEST) {
+		/* A test always uses full host masks for both networks */
+		e.cidr[0] = HOST_MASK;
+		e.cidr[1] = HOST_MASK;
+	} else {
+		e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+		e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	}
+
+	if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
+	e.ip[0] &= ip_set_netmask(e.cidr[0]);
+
+	ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]);
+	e.ip[1] &= ip_set_netmask(e.cidr[1]);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for the IPv4 variant.
+ *
+ * Validates the netlink attributes in @tb and builds the element. A test,
+ * or a request without any address or port range, results in one handler
+ * call. Otherwise the handler is called for every combination of
+ * first-network block, port and second-network block covering the
+ * requested ranges. With @retried set, iteration resumes from h->next.
+ *
+ * Returns the handler's result or a negative error code.
+ */
+static int
+hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+		      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_netportnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netportnet4_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
+	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
+	bool with_ports = false;
+	u8 cidr, cidr2;
+	int ret;
+
+	e.cidr[0] = e.cidr[1] = HOST_MASK;
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check each helper on its own: chaining them with || would collapse
+	 * the returned error code into the boolean value 1.
+	 */
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+	if (ret)
+		return ret;
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR]) {
+		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+		if (!cidr || cidr > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+		e.cidr[0] = cidr;
+	}
+
+	if (tb[IPSET_ATTR_CIDR2]) {
+		cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+		if (!cidr || cidr > HOST_MASK)
+			return -IPSET_ERR_INVALID_CIDR;
+		e.cidr[1] = cidr;
+	}
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	/* The port is only kept for port-based protocols and ICMP */
+	if (!(with_ports || e.proto == IPPROTO_ICMP))
+		e.port = 0;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	/* Single element: test, or no address/port range requested */
+	with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) {
+		e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
+		e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+		       ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip_to = ip;
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+		if (unlikely(ip + UINT_MAX == ip_to))
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		/*
+		 * No explicit range: iterate over the network given by the
+		 * requested prefix. Without this the loop below would
+		 * derive a /32 from ip == ip_to and drop the prefix.
+		 */
+		ip &= ip_set_hostmask(e.cidr[0]);
+		ip_to = ip | ~ip_set_hostmask(e.cidr[0]);
+	}
+
+	port_to = port = ntohs(e.port);
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	ip2_to = ip2_from;
+	if (tb[IPSET_ATTR_IP2_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+		if (ret)
+			return ret;
+		if (ip2_from > ip2_to)
+			swap(ip2_from, ip2_to);
+		if (unlikely(ip2_from + UINT_MAX == ip2_to))
+			return -IPSET_ERR_HASH_RANGE;
+	} else {
+		/* Same as above, for the second network */
+		ip2_from &= ip_set_hostmask(e.cidr[1]);
+		ip2_to = ip2_from | ~ip_set_hostmask(e.cidr[1]);
+	}
+
+	if (retried)
+		ip = ntohl(h->next.ip[0]);
+
+	while (!after(ip, ip_to)) {
+		e.ip[0] = htonl(ip);
+		ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+		e.cidr[0] = cidr;
+		/* On retry, resume the port range within the saved network */
+		p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
+							  : port;
+		for (; p <= port_to; p++) {
+			e.port = htons(p);
+			ip2 = (retried && ip == ntohl(h->next.ip[0]) &&
+			       p == ntohs(h->next.port)) ? ntohl(h->next.ip[1])
+							 : ip2_from;
+			while (!after(ip2, ip2_to)) {
+				e.ip[1] = htonl(ip2);
+				ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+								&cidr2);
+				e.cidr[1] = cidr2;
+				ret = adtfn(set, &e, &ext, &ext, flags);
+				if (ret && !ip_set_eexist(ret, flags))
+					return ret;
+				/* An ignorable "exists" counts as success */
+				ret = 0;
+				ip2 = ip2_last + 1;
+			}
+		}
+		ip = ip_last + 1;
+	}
+	return ret;
+}
+
+/* IPv6 variant */
+
+/*
+ * IPv6 element: two networks plus port and protocol. The cidr/ccmp union
+ * lets both prefix lengths be compared and set as one value.
+ */
+struct hash_netportnet6_elem {
+ union nf_inet_addr ip[2]; /* [0] is listed as IPSET_ATTR_IP, [1] as IPSET_ATTR_IP2 */
+ __be16 port;
+ union {
+ u8 cidr[2]; /* prefix lengths of ip[0] and ip[1] */
+ u16 ccmp; /* both prefix lengths as one value */
+ };
+ u8 nomatch:1; /* set from IPSET_FLAG_NOMATCH */
+ u8 proto;
+};
+
+/* Common functions */
+
+/*
+ * Elements are equal when both addresses, both prefix lengths, the port and
+ * the protocol match. @multi is not used by this type.
+ */
+static inline bool
+hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
+			    const struct hash_netportnet6_elem *ip2,
+			    u32 *multi)
+{
+	if (!ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6))
+		return false;
+	if (!ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6))
+		return false;
+	if (ip1->ccmp != ip2->ccmp)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* Result for a found element: -ENOTEMPTY for a "nomatch" entry, else 1. */
+static inline int
+hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem)
+{
+	if (elem->nomatch)
+		return -ENOTEMPTY;
+
+	return 1;
+}
+
+/*
+ * Set the element's nomatch bit from @flags, where IPSET_FLAG_NOMATCH is
+ * carried in the upper 16 bits.
+ */
+static inline void
+hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags)
+{
+	const u32 cadt_flags = flags >> 16;
+
+	elem->nomatch = (cadt_flags & IPSET_FLAG_NOMATCH) ? 1 : 0;
+}
+
+/* Exchange *@flags with the element's nomatch bit. */
+static inline void
+hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
+{
+	u8 saved = *flags;
+
+	*flags = elem->nomatch;
+	elem->nomatch = saved;
+}
+
+/* Restore the second address of @elem from @orig; ip[0] is left untouched. */
+static inline void
+hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
+ struct hash_netportnet6_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+/*
+ * Mask one of the element's two addresses to a @cidr-bit prefix and record
+ * the prefix length: the second network when @inner is true, the first one
+ * otherwise.
+ */
+static inline void
+hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
+			      u8 cidr, bool inner)
+{
+	const unsigned int idx = inner ? 1 : 0;
+
+	ip6_netmask(&elem->ip[idx], cidr);
+	elem->cidr[idx] = cidr;
+}
+
+/*
+ * Put the element's attributes into @skb: both addresses, port, both
+ * prefix lengths, protocol and, if set, the nomatch flag.
+ *
+ * Returns false on success and true if any attribute could not be added.
+ */
+static bool
+hash_netportnet6_data_list(struct sk_buff *skb,
+			   const struct hash_netportnet6_elem *data)
+{
+	u32 flags = 0;
+
+	if (data->nomatch)
+		flags = IPSET_FLAG_NOMATCH;
+
+	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6))
+		return true;
+	if (nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6))
+		return true;
+	if (nla_put_net16(skb, IPSET_ATTR_PORT, data->port))
+		return true;
+	if (nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]))
+		return true;
+	if (nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]))
+		return true;
+	if (nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+		return true;
+	if (flags &&
+	    nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))
+		return true;
+
+	return false;
+}
+
+/*
+ * Copy the port of @d into @next. hash_netportnet6_uadt() reads
+ * h->next.port to resume a port range when it is called with retried set.
+ * NOTE(review): @next is typed as the IPv4 element; presumably h->next is
+ * shared by both families -- confirm against ip_set_hash_gen.h.
+ */
+static inline void
+hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
+ const struct hash_netportnet6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netportnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+/*
+ * Kernel-side add/del/test for the IPv6 variant: build the element from the
+ * packet in @skb and pass it to the variant handler selected by @adt.
+ *
+ * Returns -EINVAL if port and protocol cannot be read from the packet,
+ * otherwise the handler's result.
+ */
+static int
+hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		      const struct xt_action_param *par,
+		      enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	const struct hash_netportnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netportnet6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	if (adt == IPSET_TEST) {
+		/* A test always uses full host masks for both networks */
+		e.cidr[0] = HOST_MASK;
+		e.cidr[1] = HOST_MASK;
+	} else {
+		e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+		e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	}
+
+	if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
+	ip6_netmask(&e.ip[0], e.cidr[0]);
+
+	ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6);
+	ip6_netmask(&e.ip[1], e.cidr[1]);
+
+	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+/*
+ * Userspace add/del/test for the IPv6 variant.
+ *
+ * Validates the netlink attributes in @tb, builds the element and runs the
+ * variant handler for @adt. When a port range is given (and the protocol
+ * has ports, and this is not a test) the handler is called once per port;
+ * with @retried set, the range resumes at h->next.port.
+ *
+ * IPv6 address ranges are rejected with -IPSET_ERR_HASH_RANGE_UNSUPPORTED.
+ * Returns the handler's result or a negative error code.
+ */
+static int
+hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+		      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+	const struct hash_netportnet *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netportnet6_elem e = { };
+	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+	u32 port, port_to;
+	bool with_ports = false;
+	int ret;
+
+	e.cidr[0] = e.cidr[1] = HOST_MASK;
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
+		return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	/*
+	 * Check each helper on its own: chaining them with || would collapse
+	 * the returned error code into the boolean value 1.
+	 */
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
+	if (ret)
+		return ret;
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
+	if (ret)
+		return ret;
+	ret = ip_set_get_extensions(set, tb, &ext);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+	if (tb[IPSET_ATTR_CIDR2])
+		e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+	if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
+		     e.cidr[1] > HOST_MASK))
+		return -IPSET_ERR_INVALID_CIDR;
+
+	ip6_netmask(&e.ip[0], e.cidr[0]);
+	ip6_netmask(&e.ip[1], e.cidr[1]);
+
+	if (tb[IPSET_ATTR_PORT])
+		e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(e.proto);
+
+		if (e.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	/* The port is only kept for port-based protocols and ICMPv6 */
+	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+		e.port = 0;
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		if (cadt_flags & IPSET_FLAG_NOMATCH)
+			flags |= (IPSET_FLAG_NOMATCH << 16);
+	}
+
+	/* Single element: test, portless protocol or no port range */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &e, &ext, &ext, flags);
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+		       ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(e.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	if (retried)
+		port = ntohs(h->next.port);
+	for (; port <= port_to; port++) {
+		e.port = htons(port);
+		ret = adtfn(set, &e, &ext, &ext, flags);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		/* An ignorable "exists" result counts as success */
+		ret = 0;
+	}
+	return ret;
+}
+
+/*
+ * Type descriptor for "hash:net,port,net". It is registered by
+ * hash_netportnet_init() and unregistered by hash_netportnet_fini().
+ */
+static struct ip_set_type hash_netportnet_type __read_mostly = {
+ .name = "hash:net,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module init: register the hash:net,port,net type and return the result. */
+static int __init
+hash_netportnet_init(void)
+{
+	int err = ip_set_type_register(&hash_netportnet_type);
+
+	return err;
+}
+
+/* Module exit: unregister the type registered by hash_netportnet_init(). */
+static void __exit
+hash_netportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_netportnet_type);
+}
+
+module_init(hash_netportnet_init);
+module_exit(hash_netportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
new file mode 100644
index 00000000000..3e2317f3cf6
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -0,0 +1,685 @@
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the list:set type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_list.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counters support added */
+#define IPSET_TYPE_REV_MAX 2 /* Comments support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_list:set");
+
+/* Member elements */
+/*
+ * One slot of the list. IPSET_INVALID_ID marks an unused slot; the scans in
+ * this file stop at the first such slot.
+ */
+struct set_elem {
+ ip_set_id_t id; /* index of the member set */
+};
+
+/* Argument of the userspace add/del/test handlers. */
+struct set_adt_elem {
+ ip_set_id_t id; /* set to add, delete or test */
+ ip_set_id_t refid; /* reference set, or IPSET_INVALID_ID if none */
+ int before; /* > 0: before refid, < 0: after refid, 0: no reference */
+};
+
+/* Type structure */
+/*
+ * Per-set data of a list:set. The members array trails the structure; its
+ * slots are set->dsize bytes apart and are accessed via list_set_elem().
+ */
+struct list_set {
+ u32 size; /* size of set list array */
+ struct timer_list gc; /* garbage collection */
+ struct net *net; /* namespace */
+ struct set_elem members[0]; /* the set members */
+};
+
+/*
+ * Address of slot @id in @map. Slots are (set)->dsize bytes apart, so the
+ * offset is computed in bytes rather than by array indexing. The expansion
+ * is fully parenthesized so that it can be used in any expression context,
+ * e.g. list_set_elem(set, map, i)->id.
+ */
+#define list_set_elem(set, map, id)	\
+	((struct set_elem *)((void *)(map)->members + (id) * (set)->dsize))
+
+/*
+ * Kernel-side test: check the packet against each member set in list
+ * order, skipping timed-out members, and stop at the first unused slot.
+ *
+ * Returns the first positive ip_set_test() result, or 0 if none matched.
+ */
+static int
+list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
+	       const struct xt_action_param *par,
+	       struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+	/* Flags as passed in, used for this set's own counter update */
+	const u32 cmdflags = opt->cmdflags;
+	struct list_set *map = set->data;
+	u32 i;
+
+	/* Don't lookup sub-counters at all */
+	opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
+	if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
+		opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
+
+	for (i = 0; i < map->size; i++) {
+		struct set_elem *e = list_set_elem(set, map, i);
+		int ret;
+
+		if (e->id == IPSET_INVALID_ID)
+			break;
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(e, set)))
+			continue;
+
+		ret = ip_set_test(e->id, skb, par, opt);
+		if (ret <= 0)
+			continue;
+
+		if (SET_WITH_COUNTER(set))
+			ip_set_update_counter(ext_counter(e, set),
+					      ext, &opt->ext,
+					      cmdflags);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Kernel-side add: try ip_set_add() on each member set in list order,
+ * skipping timed-out members, until one returns 0 or an unused slot is
+ * reached. Always returns 0.
+ */
+static int
+list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
+	      struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+	struct list_set *map = set->data;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		struct set_elem *e = list_set_elem(set, map, i);
+
+		if (e->id == IPSET_INVALID_ID)
+			break;
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(e, set)))
+			continue;
+		if (ip_set_add(e->id, skb, par, opt) == 0)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Kernel-side delete: try ip_set_del() on each member set in list order,
+ * skipping timed-out members, until one returns 0 or an unused slot is
+ * reached. Always returns 0.
+ */
+static int
+list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
+	      struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+	struct list_set *map = set->data;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		struct set_elem *e = list_set_elem(set, map, i);
+
+		if (e->id == IPSET_INVALID_ID)
+			break;
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(e, set)))
+			continue;
+		if (ip_set_del(e->id, skb, par, opt) == 0)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Kernel-side entry point: dispatch @adt to the test, add or delete
+ * helper. Returns -EINVAL for any other operation.
+ */
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      const struct xt_action_param *par,
+	      enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+	if (adt == IPSET_TEST)
+		return list_set_ktest(set, skb, par, opt, &ext);
+	if (adt == IPSET_ADD)
+		return list_set_kadd(set, skb, par, opt, &ext);
+	if (adt == IPSET_DEL)
+		return list_set_kdel(set, skb, par, opt, &ext);
+
+	return -EINVAL;
+}
+
+/*
+ * Return true if slot @i exists, holds set @id and, for sets with
+ * timeouts, has not expired.
+ */
+static bool
+id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)
+{
+	const struct list_set *map = set->data;
+	const struct set_elem *e;
+
+	if (i >= map->size)
+		return false;
+
+	e = list_set_elem(set, map, i);
+	if (e->id != id)
+		return false;
+	if (SET_WITH_TIMEOUT(set) &&
+	    ip_set_timeout_expired(ext_timeout(e, set)))
+		return false;
+
+	return true;
+}
+
+/*
+ * Store set d->id in slot @i and initialize its extensions from @ext.
+ *
+ * If the slot is occupied, the elements from @i onwards are shifted up by
+ * one slot; the element in the last slot is dropped (its set reference put
+ * and its extensions destroyed). If @i is itself the last slot, its current
+ * occupant is replaced. Always returns 0.
+ */
+static int
+list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
+ const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(set, map, i);
+
+ if (e->id != IPSET_INVALID_ID) {
+ if (i == map->size - 1) {
+ /* Last element replaced: e.g. add new,before,last */
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ } else {
+ struct set_elem *x = list_set_elem(set, map,
+ map->size - 1);
+
+ /* Last element pushed off */
+ if (x->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(map->net, x->id);
+ ip_set_ext_destroy(set, x);
+ }
+ /* Shift slots i..size-2 up by one; the regions overlap */
+ memmove(list_set_elem(set, map, i + 1), e,
+ set->dsize * (map->size - (i + 1)));
+ /* Extensions must be initialized to zero */
+ memset(e, 0, set->dsize);
+ }
+ }
+
+ e->id = d->id;
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ return 0;
+}
+
+/*
+ * Remove the element in slot @i: put the reference to its set, destroy its
+ * extensions, shift the following slots down by one and mark the last slot
+ * unused. Always returns 0.
+ */
+static int
+list_set_del(struct ip_set *set, u32 i)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(set, map, i);
+
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+
+ /* Close the gap unless the removed slot was the last one */
+ if (i < map->size - 1)
+ memmove(e, list_set_elem(set, map, i + 1),
+ set->dsize * (map->size - (i + 1)));
+
+ /* Last element */
+ e = list_set_elem(set, map, map->size - 1);
+ e->id = IPSET_INVALID_ID;
+ return 0;
+}
+
+/*
+ * Delete every used slot whose timeout has expired. The callers in this
+ * file only invoke it for sets with timeouts.
+ */
+static void
+set_cleanup_entries(struct ip_set *set)
+{
+	struct list_set *map = set->data;
+	struct set_elem *e;
+	u32 i = 0;
+
+	while (i < map->size) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID ||
+		    !ip_set_timeout_expired(ext_timeout(e, set))) {
+			i++;
+			continue;
+		}
+		/* Check element moved to position i in next loop */
+		list_set_del(set, i);
+	}
+}
+
+/*
+ * Userspace test: look for set d->id among the live members.
+ *
+ * Without a reference (d->before == 0) finding the set is enough. With a
+ * reference, the set must be directly before (d->before > 0) or directly
+ * after (d->before < 0) the set d->refid.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+static int
+list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	       struct ip_set_ext *mext, u32 flags)
+{
+	struct list_set *map = set->data;
+	struct set_adt_elem *d = value;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		struct set_elem *e = list_set_elem(set, map, i);
+
+		if (e->id == IPSET_INVALID_ID)
+			return 0;
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(e, set)))
+			continue;
+		if (e->id != d->id)
+			continue;
+
+		if (d->before == 0)
+			return 1;
+		if (d->before > 0)
+			return id_eq(set, i + 1, d->refid);
+
+		return i > 0 && id_eq(set, i - 1, d->refid);
+	}
+	return 0;
+}
+
+
+/*
+ * Userspace add: insert set d->id into the list, optionally directly
+ * before (d->before > 0) or after (d->before < 0) the set d->refid.
+ *
+ * If the set is already a member, its position relative to the reference
+ * is verified first. Then, with IPSET_FLAG_EXIST in @flags, its extensions
+ * are re-initialized from @ext and the caller's reference to d->id is put;
+ * without the flag -IPSET_ERR_EXIST is returned.
+ *
+ * Returns 0 on success, -IPSET_ERR_REF_EXIST if the reference does not
+ * match or is not in the list, -IPSET_ERR_LIST_FULL if no slot could be
+ * used.
+ */
+static int
+list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	      struct ip_set_ext *mext, u32 flags)
+{
+	struct list_set *map = set->data;
+	struct set_adt_elem *d = value;
+	struct set_elem *e;
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
+	u32 i;
+	int ret = 0;	/* holds negative error codes, so it must be signed */
+
+	if (SET_WITH_TIMEOUT(set))
+		set_cleanup_entries(set);
+
+	/* Check already added element */
+	for (i = 0; i < map->size; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			goto insert;
+		else if (e->id != d->id)
+			continue;
+
+		/*
+		 * "before" is tested as > 0, like in list_set_utest() and
+		 * list_set_udel(); a "> 1" test would skip this check.
+		 */
+		if ((d->before > 0 && !id_eq(set, i + 1, d->refid)) ||
+		    (d->before < 0 &&
+		     (i == 0 || !id_eq(set, i - 1, d->refid))))
+			/* Before/after doesn't match */
+			return -IPSET_ERR_REF_EXIST;
+		if (!flag_exist)
+			/* Can't re-add */
+			return -IPSET_ERR_EXIST;
+		/* Update extensions */
+		ip_set_ext_destroy(set, e);
+
+		if (SET_WITH_TIMEOUT(set))
+			ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+		if (SET_WITH_COUNTER(set))
+			ip_set_init_counter(ext_counter(e, set), ext);
+		if (SET_WITH_COMMENT(set))
+			ip_set_init_comment(ext_comment(e, set), ext);
+		/* Set is already added to the list */
+		ip_set_put_byindex(map->net, d->id);
+		return 0;
+	}
+insert:
+	ret = -IPSET_ERR_LIST_FULL;
+	for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+		e = list_set_elem(set, map, i);
+		if (e->id == IPSET_INVALID_ID)
+			/* End of used slots: append, unless a reference
+			 * was requested and has not been found */
+			ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
+				: list_set_add(set, i, d, ext);
+		else if (e->id != d->refid)
+			continue;
+		else if (d->before > 0)
+			ret = list_set_add(set, i, d, ext);
+		else if (i + 1 < map->size)
+			ret = list_set_add(set, i + 1, d, ext);
+	}
+
+	return ret;
+}
+
+/*
+ * Userspace delete: remove set d->id from the list. With a reference
+ * (d->before != 0) the set is only removed if it sits directly before
+ * (d->before > 0) or after (d->before < 0) the set d->refid.
+ *
+ * Returns the list_set_del() result on success, -IPSET_ERR_REF_EXIST if
+ * the reference condition is not met, -IPSET_ERR_EXIST if the set is not a
+ * member.
+ */
+static int
+list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	      struct ip_set_ext *mext, u32 flags)
+{
+	struct list_set *map = set->data;
+	struct set_adt_elem *d = value;
+	u32 i;
+
+	for (i = 0; i < map->size; i++) {
+		struct set_elem *e = list_set_elem(set, map, i);
+
+		if (e->id == IPSET_INVALID_ID) {
+			if (d->before != 0)
+				return -IPSET_ERR_REF_EXIST;
+			return -IPSET_ERR_EXIST;
+		}
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(e, set)))
+			continue;
+		if (e->id != d->id)
+			continue;
+
+		if (d->before > 0 && !id_eq(set, i + 1, d->refid))
+			return -IPSET_ERR_REF_EXIST;
+		if (d->before < 0 &&
+		    (i == 0 || !id_eq(set, i - 1, d->refid)))
+			return -IPSET_ERR_REF_EXIST;
+
+		return list_set_del(set, i);
+	}
+	return -IPSET_ERR_EXIST;
+}
+
+/*
+ * Userspace add/del/test entry point for list:set.
+ *
+ * Looks up the set named in IPSET_ATTR_NAME and, if given, the reference
+ * set in IPSET_ATTR_NAMEREF, fills a struct set_adt_elem and calls the
+ * variant handler for @adt.
+ *
+ * NOTE(review): ip_set_get_byname() presumably takes a reference that
+ * ip_set_put_byindex() releases -- confirm in ip_set_core.c. On the exit
+ * path the reference to e.refid is always put; the one to e.id is kept
+ * only after a successful add.
+ */
+static int
+list_set_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct list_set *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct set_adt_elem e = { .refid = IPSET_INVALID_ID };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ struct ip_set *s;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s);
+ if (e.id == IPSET_INVALID_ID)
+ return -IPSET_ERR_NAME;
+ /* "Loop detection" */
+ if (s->type->features & IPSET_TYPE_NAME) {
+ ret = -IPSET_ERR_LOOP;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ e.before = f & IPSET_FLAG_BEFORE;
+ }
+
+ /* "before" requires a reference set name */
+ if (e.before && !tb[IPSET_ATTR_NAMEREF]) {
+ ret = -IPSET_ERR_BEFORE;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_NAMEREF]) {
+ e.refid = ip_set_get_byname(map->net,
+ nla_data(tb[IPSET_ATTR_NAMEREF]),
+ &s);
+ if (e.refid == IPSET_INVALID_ID) {
+ ret = -IPSET_ERR_NAMEREF;
+ goto finish;
+ }
+ /* A reference without the "before" flag means "after" */
+ if (!e.before)
+ e.before = -1;
+ }
+ if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set))
+ set_cleanup_entries(set);
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+finish:
+ if (e.refid != IPSET_INVALID_ID)
+ ip_set_put_byindex(map->net, e.refid);
+ /* Keep the reference to e.id only after a successful add */
+ if (adt != IPSET_ADD || ret)
+ ip_set_put_byindex(map->net, e.id);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+/*
+ * Empty the list: for every slot that holds a member, release the member
+ * set with ip_set_put_byindex(), destroy the slot's extensions and mark
+ * the slot free by storing IPSET_INVALID_ID.
+ */
+static void
+list_set_flush(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ e->id = IPSET_INVALID_ID;
+ }
+ }
+}
+/*
+ * Tear down a list:set: stop the garbage collector timer (only present on
+ * sets with timeout), release all members via list_set_flush(), free the
+ * backing store and clear set->data.
+ */
+static void
+list_set_destroy(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc); /* must happen before map is freed */
+ list_set_flush(set);
+ kfree(map);
+
+ set->data = NULL;
+}
+/*
+ * Emit the set header into skb as a nested IPSET_ATTR_DATA attribute:
+ * the slot count, the reference count (reported as set->ref - 1), the
+ * memory size of the backing store and the set flags.
+ *
+ * Returns 0 on success or -EMSGSIZE if an attribute could not be added.
+ * NOTE(review): the partially built nest is not cancelled on failure
+ * here; presumably the caller discards the message - confirm.
+ */
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->size * set->dsize)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+/*
+ * Dump the members of the set into skb, resuming at the slot index kept in
+ * cb->args[IPSET_CB_ARG0].
+ *
+ * Each member is written as a nested IPSET_ATTR_DATA (name plus
+ * extensions) inside one IPSET_ATTR_ADT nest. The walk ends at the first
+ * free slot; expired entries of a timeout set are skipped.
+ *
+ * Returns 0 when the listing is complete (cb->args[IPSET_CB_ARG0] is reset
+ * to 0) or when the message filled up after at least one slot was handled
+ * (cb->args[IPSET_CB_ARG0] is left at the slot that did not fit).
+ * Returns -EMSGSIZE when not even the first slot of this call fits.
+ */
+static int
+list_set_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 i, first = cb->args[IPSET_CB_ARG0]; /* first: slot this call started at */
+ const struct set_elem *e;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[IPSET_CB_ARG0] < map->size;
+ cb->args[IPSET_CB_ARG0]++) {
+ i = cb->args[IPSET_CB_ARG0];
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto finish; /* a free slot ends the listing */
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (i == first) {
+ /* Nothing was emitted in this call: drop the ADT nest too. */
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (nla_put_string(skb, IPSET_ATTR_NAME,
+ ip_set_name_byindex(map->net, e->id)))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, e, true))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+finish:
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+ return 0;
+
+nla_put_failure:
+ /* Drop the partially written entry (nested is NULL if the nest itself failed). */
+ nla_nest_cancel(skb, nested);
+ if (unlikely(i == first)) {
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ /* Keep what was written; cb->args still points at the entry to retry. */
+ ipset_nest_end(skb, atd);
+ return 0;
+}
+/*
+ * Compare the parameters of two list:set sets: they count as the same
+ * when the slot count, the timeout and the extension mask all match.
+ * Membership is not compared.
+ */
+static bool
+list_set_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct list_set *x = a->data;
+ const struct list_set *y = b->data;
+
+ return x->size == y->size &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+/*
+ * Method table of the list:set type; installed into set->variant by
+ * list_set_create().
+ */
+static const struct ip_set_type_variant set_variant = {
+ .kadt = list_set_kadt,
+ .uadt = list_set_uadt,
+ .adt = { /* indexed by enum ipset_adt; used by list_set_uadt() */
+ [IPSET_ADD] = list_set_uadd,
+ [IPSET_DEL] = list_set_udel,
+ [IPSET_TEST] = list_set_utest,
+ },
+ .destroy = list_set_destroy,
+ .flush = list_set_flush,
+ .head = list_set_head,
+ .list = list_set_list,
+ .same_set = list_set_same_set,
+};
+/*
+ * Garbage collector timer callback. ul_set is the struct ip_set pointer
+ * stored in the timer's data field by list_set_gc_init().
+ *
+ * Runs set_cleanup_entries() under the set's write lock and then re-arms
+ * the timer for another IPSET_GC_PERIOD(set->timeout) seconds.
+ */
+static void
+list_set_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct list_set *map = set->data;
+
+ write_lock_bh(&set->lock);
+ set_cleanup_entries(set);
+ write_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+/*
+ * Set up and start the garbage collector timer of a set: gc is installed
+ * as the timer function, the set pointer is passed to it through the
+ * timer's data field, and the first run is scheduled
+ * IPSET_GC_PERIOD(set->timeout) seconds from now.
+ */
+static void
+list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct list_set *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create list:set type of sets */
+
+/*
+ * Allocate and initialise the backing store of a list:set with room for
+ * size elements of set->dsize bytes each, and attach it to set->data.
+ * Every slot starts out free (id == IPSET_INVALID_ID).
+ *
+ * Returns true on success, false if the requested size cannot be
+ * represented or the allocation fails.
+ */
+static bool
+init_list_set(struct net *net, struct ip_set *set, u32 size)
+{
+ struct list_set *map;
+ struct set_elem *e;
+ size_t elems_len, total;
+ u32 i;
+
+ /*
+ * size originates from a netlink attribute. Refuse it if the byte
+ * count wraps around: a wrapped length would yield an allocation
+ * shorter than the size slots the loop below initialises.
+ */
+ elems_len = (size_t) size * set->dsize;
+ if (set->dsize != 0 && elems_len / set->dsize != size)
+ return false;
+ total = sizeof(*map) + elems_len;
+ if (total < elems_len)
+ return false;
+
+ map = kzalloc(total, GFP_KERNEL);
+ if (!map)
+ return false;
+
+ map->size = size;
+ map->net = net;
+ set->data = map;
+
+ for (i = 0; i < size; i++) {
+ e = list_set_elem(set, map, i);
+ e->id = IPSET_INVALID_ID;
+ }
+
+ return true;
+}
+/*
+ * Create callback of the list:set type.
+ *
+ * The slot count defaults to IP_SET_LIST_DEFAULT_SIZE, may be overridden
+ * by IPSET_ATTR_SIZE and is raised to IP_SET_LIST_MIN_SIZE if smaller.
+ * When IPSET_ATTR_TIMEOUT is present the set gets a default timeout and
+ * its garbage collector timer is started.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL if an optional attribute
+ * fails ip_set_optattr_netorder(), or -ENOMEM if the backing store
+ * cannot be set up. The flags argument is not used by this function.
+ */
+static int
+list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ u32 size = IP_SET_LIST_DEFAULT_SIZE;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_SIZE])
+ size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]);
+ if (size < IP_SET_LIST_MIN_SIZE)
+ size = IP_SET_LIST_MIN_SIZE;
+
+ set->variant = &set_variant;
+ /* Per-element size: struct set_elem plus whatever ip_set_elem_len() adds for tb[]. */
+ set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem));
+ if (!init_list_set(net, set, size))
+ return -ENOMEM;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ list_set_gc_init(set, list_set_gc);
+ }
+ return 0;
+}
+/*
+ * Type descriptor of "list:set", registered with the ipset core in
+ * list_set_init(). It carries the create callback and the netlink
+ * attribute policies for create and add/del/test requests.
+ */
+static struct ip_set_type list_set_type __read_mostly = {
+ .name = "list:set",
+ .protocol = IPSET_PROTOCOL,
+ /* IPSET_TYPE_NAME is the feature list_set_uadt() tests for in its loop detection. */
+ .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = list_set_create,
+ .create_policy = { /* attributes accepted when creating a set */
+ [IPSET_ATTR_SIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = { /* attributes accepted by add/del/test requests */
+ [IPSET_ATTR_NAME] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ },
+ .me = THIS_MODULE,
+};
+/* Module init: register the list:set type; returns the registration result. */
+static int __init
+list_set_init(void)
+{
+ return ip_set_type_register(&list_set_type);
+}
+/* Module exit: unregister the list:set type. */
+static void __exit
+list_set_fini(void)
+{
+ ip_set_type_unregister(&list_set_type);
+}
+
+module_init(list_set_init);
+module_exit(list_set_fini);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
new file mode 100644
index 00000000000..04d15fdc99e
--- /dev/null
+++ b/net/netfilter/ipset/pfxlen.c
@@ -0,0 +1,313 @@
+#include <linux/export.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+
+/*
+ * Prefixlen maps for fast conversions, by Jan Engelhardt.
+ */
+
+#define E(a, b, c, d) \
+ {.ip6 = { \
+ htonl(a), htonl(b), \
+ htonl(c), htonl(d), \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use ip_set_netmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_netmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_netmask_map);
+
+#undef E
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32) a, (__force __be32) b, \
+ (__force __be32) c, (__force __be32) d, \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use ip_set_hostmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_hostmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
+
+/*
+ * Find the largest network which matches the range from left, in host order.
+ *
+ * Prefix lengths are tried from 1 upwards (shortest prefix, i.e. largest
+ * block, first). The first length at which from is aligned to the prefix
+ * and the block's last address does not lie after to is stored in *cidr,
+ * and that last address is returned. If no length below 32 qualifies,
+ * *cidr is set to 32 and from itself is returned. A prefix length of 0 is
+ * never tried.
+ */
+u32
+ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+ u32 last;
+ u8 i;
+
+ for (i = 1; i < 32; i++) {
+ if ((from & ip_set_hostmask(i)) != from)
+ continue; /* from has bits set below prefix length i */
+ last = from | ~ip_set_hostmask(i); /* last address of from/i */
+ if (!after(last, to)) {
+ *cidr = i;
+ return last;
+ }
+ }
+ *cidr = 32;
+ return from;
+}
+EXPORT_SYMBOL_GPL(ip_set_range_to_cidr);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
new file mode 100644
index 00000000000..0c3b1670b0d
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,281 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+ tristate "IP virtual server support"
+ depends on NET && INET && NETFILTER
+ depends on (NF_CONNTRACK || NF_CONNTRACK=n)
+ ---help---
+ IP Virtual Server support will let you build a high-performance
+ virtual server based on cluster of two or more real servers. This
+ option must be enabled for at least one of the clustered computers
+ that will take care of intercepting incoming connections to a
+ single IP address and scheduling them to real servers.
+
+ Three request dispatching techniques are implemented, they are
+ virtual server via NAT, virtual server via tunneling and virtual
+ server via direct routing. The several scheduling algorithms can
+ be used to choose which server the connection is directed to,
+ thus load balancing can be achieved among the servers. For more
+ information and its administration program, please visit the
+ following URL: <http://www.linuxvirtualserver.org/>.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config IP_VS_IPV6
+ bool "IPv6 support for IPVS"
+ depends on IPV6 = y || IP_VS = IPV6
+ select IP6_NF_IPTABLES
+ ---help---
+ Add IPv6 support to IPVS.
+
+ Say Y if unsure.
+
+config IP_VS_DEBUG
+ bool "IP virtual server debugging"
+ ---help---
+ Say Y here if you want to get additional messages useful in
+ debugging the IP virtual server code. You can change the debug
+ level in /proc/sys/net/ipv4/vs/debug_level
+
+config IP_VS_TAB_BITS
+ int "IPVS connection table size (the Nth power of 2)"
+ range 8 20
+ default 12
+ ---help---
+ The IPVS connection hash table uses the chaining scheme to handle
+ hash collisions. Using a big IPVS connection hash table will greatly
+ reduce conflicts when there are hundreds of thousands of connections
+ in the hash table.
+
+ Note the table size must be a power of 2. The table size will be
+ 2 raised to the power of the number you enter. The number to choose is
+ from 8 to 20, the default number is 12, which means the table size
+ is 4096. Don't input the number too small, otherwise you will lose
+ performance on it. You can adapt the table size yourself, according
+ to your virtual server application. It is good to set the table size
+ not far less than the number of connections per second multiplying
+ average lasting time of connection in the table. For example, your
+ virtual server gets 200 connections per second, the connection lasts
+ for 200 seconds in average in the connection table, the table size
+ should be not far less than 200x200, it is good to set the table
+ size 32768 (2**15).
+
+ Another note that each connection occupies 128 bytes effectively and
+ each hash entry uses 8 bytes, so you can estimate how much memory is
+ needed for your box.
+
+ You can override this number by setting the conn_tab_bits module parameter
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line
+ if IP VS was compiled built-in.
+
+comment "IPVS transport protocol load balancing support"
+
+config IP_VS_PROTO_TCP
+ bool "TCP load balancing support"
+ ---help---
+ This option enables support for load balancing TCP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+ bool "UDP load balancing support"
+ ---help---
+ This option enables support for load balancing UDP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH_ESP
+ def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
+
+config IP_VS_PROTO_ESP
+ bool "ESP load balancing support"
+ ---help---
+ This option enables support for load balancing ESP (Encapsulation
+ Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+ bool "AH load balancing support"
+ ---help---
+ This option enables support for load balancing AH (Authentication
+ Header) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_SCTP
+ bool "SCTP load balancing support"
+ select LIBCRC32C
+ ---help---
+ This option enables support for load balancing SCTP transport
+ protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config IP_VS_RR
+ tristate "round-robin scheduling"
+ ---help---
+ The round-robin scheduling algorithm simply directs network
+ connections to different real servers in a round-robin manner.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WRR
+ tristate "weighted round-robin scheduling"
+ ---help---
+ The weighted round-robin scheduling algorithm directs network
+ connections to different real servers based on server weights
+ in a round-robin manner. Servers with higher weights receive
+ new connections before those with lower weights, servers with
+ higher weights get more connections than those with lower
+ weights, and servers with equal weights get equal connections.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LC
+ tristate "least-connection scheduling"
+ ---help---
+ The least-connection scheduling algorithm directs network
+ connections to the server with the least number of active
+ connections.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WLC
+ tristate "weighted least-connection scheduling"
+ ---help---
+ The weighted least-connection scheduling algorithm directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLC
+ tristate "locality-based least-connection scheduling"
+ ---help---
+ The locality-based least-connection scheduling algorithm is for
+ destination IP load balancing. It is usually used in cache cluster.
+ This algorithm usually directs packet destined for an IP address to
+ its server if the server is alive and under load. If the server is
+ overloaded (its number of active connections is larger than its weight)
+ and there is a server in its half load, then allocate the weighted
+ least-connection server to this IP address.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLCR
+ tristate "locality-based least-connection with replication scheduling"
+ ---help---
+ The locality-based least-connection with replication scheduling
+ algorithm is also for destination IP load balancing. It is
+ usually used in cache cluster. It differs from the LBLC scheduling
+ as follows: the load balancer maintains mappings from a target
+ to a set of server nodes that can serve the target. Requests for
+ a target are assigned to the least-connection node in the target's
+ server set. If all the nodes in the server set are overloaded,
+ it picks a least-connection node in the cluster and adds it
+ to the server set for the target. If the server set has not been
+ modified for the specified time, the most loaded node is removed
+ from the server set, in order to avoid high degree of replication.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_DH
+ tristate "destination hashing scheduling"
+ ---help---
+ The destination hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their destination IP addresses.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SH
+ tristate "source hashing scheduling"
+ ---help---
+ The source hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their source IP addresses.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SED
+ tristate "shortest expected delay scheduling"
+ ---help---
+ The shortest expected delay scheduling algorithm assigns network
+ connections to the server with the shortest expected delay. The
+ expected delay that the job will experience is (Ci + 1) / Ui if
+ sent to the ith server, in which Ci is the number of connections
+ on the ith server and Ui is the fixed service rate (weight)
+ of the ith server.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NQ
+ tristate "never queue scheduling"
+ ---help---
+ The never queue scheduling algorithm adopts a two-speed model.
+ When there is an idle server available, the job will be sent to
+ the idle server, instead of waiting for a fast one. When there
+ is no idle server available, the job will be sent to the server
+ that minimizes its expected delay (the Shortest Expected Delay
+ scheduling algorithm).
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+comment 'IPVS SH scheduler'
+
+config IP_VS_SH_TAB_BITS
+ int "IPVS source hashing table size (the Nth power of 2)"
+ range 4 20
+ default 8
+ ---help---
+ The source hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is tiled by each destination
+ until all slots in the table are filled. When using weights to
+ allow destinations to receive more connections, the table is
+ tiled an amount proportional to the weights specified. The table
+ needs to be large enough to effectively fit all the destinations
+ multiplied by their respective weights.
+
+comment 'IPVS application helper'
+
+config IP_VS_FTP
+ tristate "FTP protocol helper"
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
+ NF_CONNTRACK_FTP
+ select IP_VS_NFCT
+ ---help---
+ FTP is a protocol that transfers IP address and/or port number in
+ the payload. In the virtual server via Network Address Translation,
+ the IP address and port number of real servers cannot be sent to
+ clients in ftp connections directly, so FTP protocol helper is
+ required for tracking the connection and mangling it back to that of
+ virtual service.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allow persistence based on the SIP Call-ID
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 00000000000..34ee602ddb6
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,40 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
+ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
+ ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 00000000000..dfd7b65b3d2
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,627 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ * IP_MASQ_APP application masquerading module
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+static DEFINE_MUTEX(__ip_vs_app_mutex);
+
+/*
+ * Get an ip_vs_app object: take a reference on the module that owns
+ * @app. Returns the result of try_module_get(); ip_vs_app_inc_get()
+ * treats a non-zero result as success.
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+ return try_module_get(app->module);
+}
+
+
+/*
+ * Put an ip_vs_app object: drop the reference on the module that
+ * owns @app.
+ */
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+ module_put(app->module);
+}
+
+/*
+ * Free an app incarnation: its timeout table first, then the
+ * incarnation itself.
+ */
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+/*
+ * RCU callback: recover the incarnation that embeds @head as its
+ * rcu_head member and destroy it.
+ */
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
+
+/*
+ * Allocate/initialize app incarnation and register it in proto apps.
+ *
+ * @net:   passed through to the protocol's register_app hook
+ * @app:   application the incarnation is copied from
+ * @proto: protocol number used to look up the ip_vs_protocol
+ * @port:  port in host byte order; stored converted with htons()
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT if the protocol is unknown,
+ * -EOPNOTSUPP if the protocol lacks either app registration hook,
+ * -ENOMEM on allocation failure, or the error from register_app.
+ */
+static int
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+		  __u16 port)
+{
+	struct ip_vs_protocol *pp;
+	struct ip_vs_app *inc;
+	int ret;
+
+	pp = ip_vs_proto_get(proto);
+	if (!pp)
+		return -EPROTONOSUPPORT;
+
+	/*
+	 * register_app is called unconditionally below and unregister_app
+	 * is needed to undo it later, so both hooks must be present.
+	 */
+	if (!pp->register_app || !pp->unregister_app)
+		return -EOPNOTSUPP;
+
+	/* The incarnation starts out as a copy of the application */
+	inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
+	if (!inc)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&inc->p_list);
+	INIT_LIST_HEAD(&inc->incs_list);
+	inc->app = app;
+	inc->port = htons(port);
+	atomic_set(&inc->usecnt, 0);
+
+	if (app->timeouts) {
+		inc->timeout_table =
+			ip_vs_create_timeout_table(app->timeouts,
+						   app->timeouts_size);
+		if (!inc->timeout_table) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ret = pp->register_app(net, inc);
+	if (ret)
+		goto out;
+
+	/* Link the incarnation into its application's list */
+	list_add(&inc->a_list, &app->incs_list);
+	IP_VS_DBG(9, "%s App %s:%u registered\n",
+		  pp->name, inc->name, ntohs(inc->port));
+
+	return 0;
+
+  out:
+	ip_vs_app_inc_destroy(inc);
+	return ret;
+}
+
+
+/*
+ * Release app incarnation: unregister it from its protocol (when the
+ * protocol provides unregister_app), unlink it from its application's
+ * list and free it after an RCU grace period.
+ *
+ * NOTE(review): if the protocol lookup fails the function returns
+ * early, leaving the incarnation linked and unfreed - confirm this
+ * cannot happen for a registered incarnation.
+ */
+static void
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_protocol *pp;
+
+ if (!(pp = ip_vs_proto_get(inc->protocol)))
+ return;
+
+ if (pp->unregister_app)
+ pp->unregister_app(net, inc);
+
+ IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+ pp->name, inc->name, ntohs(inc->port));
+
+ list_del(&inc->a_list);
+
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); /* deferred free */
+}
+
+
+/*
+ * Get reference to app inc (only called from softirq).
+ *
+ * The owning application's module is pinned first; the incarnation's
+ * use counter is bumped only when that succeeded. The result of the
+ * module get is returned unchanged.
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+	int got = ip_vs_app_get(inc->app);
+
+	if (!got)
+		return got;
+
+	atomic_inc(&inc->usecnt);
+	return got;
+}
+
+
+/*
+ * Put the app inc (only called from timer or net softirq):
+ * decrement the incarnation's use counter, then drop the module
+ * reference of the application it belongs to.
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+ atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
+}
+
+
+/*
+ * Register an application incarnation in protocol applications.
+ * Creates the incarnation under __ip_vs_app_mutex and returns the
+ * result of ip_vs_app_inc_new().
+ */
+int
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+		       __u16 port)
+{
+	int err;
+
+	mutex_lock(&__ip_vs_app_mutex);
+	err = ip_vs_app_inc_new(net, app, proto, port);
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	return err;
+}
+
+
+/*
+ * Register application for netns.
+ *
+ * A copy of @app is linked into the namespace's app_list and returned.
+ * Fails with ERR_PTR(-ENOENT) when the netns has no IPVS state,
+ * ERR_PTR(-EEXIST) when an application with the same name is already
+ * listed, and ERR_PTR(-ENOMEM) when the copy cannot be allocated.
+ */
+struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_app *a;
+	struct ip_vs_app *ret;
+
+	if (!ipvs)
+		return ERR_PTR(-ENOENT);
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	/* Names must be unique within the namespace */
+	list_for_each_entry(a, &ipvs->app_list, a_list) {
+		if (strcmp(app->name, a->name) == 0) {
+			ret = ERR_PTR(-EEXIST);
+			goto unlock;
+		}
+	}
+
+	a = kmemdup(app, sizeof(*app), GFP_KERNEL);
+	if (!a) {
+		ret = ERR_PTR(-ENOMEM);
+		goto unlock;
+	}
+
+	INIT_LIST_HEAD(&a->incs_list);
+	list_add(&a->a_list, &ipvs->app_list);
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+	ret = a;
+
+unlock:
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	return ret;
+}
+
+
+/*
+ * ip_vs_app unregistration routine
+ * We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
+ *
+ * Walks the namespace's app_list under __ip_vs_app_mutex. Every
+ * listed application whose name matches @app->name is removed; when
+ * @app is NULL every application is removed. For each removed
+ * application all incarnations are released first, then the
+ * application copy is unlinked and freed. Does nothing if the netns
+ * has no IPVS state.
+ */
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a, *anxt, *inc, *nxt;
+
+ if (!ipvs)
+ return;
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
+ if (app && strcmp(app->name, a->name)) /* skip other names */
+ continue;
+ list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
+ ip_vs_app_inc_release(net, inc);
+ }
+
+ list_del(&a->a_list);
+ kfree(a);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+ }
+
+ mutex_unlock(&__ip_vs_app_mutex);
+}
+
+
+/*
+ * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ * Delegates to the protocol's app_conn_bind hook and returns its
+ * result; the hook is called without a NULL check.
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ return pp->app_conn_bind(cp);
+}
+
+
+/*
+ * Unbind cp from application incarnation (called by cp destructor).
+ *
+ * No-op when no incarnation is bound. Otherwise the optional
+ * unbind_conn and done_conn hooks run in that order, the incarnation
+ * reference is dropped and cp->app is cleared.
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+	struct ip_vs_app *bound = cp->app;
+
+	if (bound) {
+		if (bound->unbind_conn)
+			bound->unbind_conn(bound, cp);
+		if (bound->done_conn)
+			bound->done_conn(bound, cp);
+		ip_vs_app_inc_put(bound);
+		cp->app = NULL;
+	}
+}
+
+
+/*
+ * Fixes th->seq based on ip_vs_seq info.
+ *
+ * Nothing is changed while both deltas are zero. Otherwise packets
+ * whose seq lies after the most recently resized packet (init_seq)
+ * get the current delta added; all others get previous_delta added.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	__u32 seq = ntohl(th->seq);
+
+	if (!vseq->delta && !vseq->previous_delta)
+		return;
+
+	if (!after(seq, vseq->init_seq)) {
+		th->seq = htonl(seq + vseq->previous_delta);
+		IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n",
+			  __func__, vseq->previous_delta);
+		return;
+	}
+
+	th->seq = htonl(seq + vseq->delta);
+	IP_VS_DBG(9, "%s(): added delta (%d) to seq\n",
+		  __func__, vseq->delta);
+}
+
+
+/*
+ * Fixes th->ack_seq based on ip_vs_seq info.
+ *
+ * Nothing is changed while both deltas are zero. ack_seq names the
+ * next octet expected, so it is compared against init_seq + delta:
+ * when it lies after that point the current delta is subtracted,
+ * otherwise previous_delta is subtracted.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	__u32 ack_seq = ntohl(th->ack_seq);
+
+	if (!vseq->delta && !vseq->previous_delta)
+		return;
+
+	if (!after(ack_seq, vseq->init_seq+vseq->delta)) {
+		th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+		IP_VS_DBG(9, "%s(): subtracted "
+			  "previous_delta (%d) from ack_seq\n",
+			  __func__, vseq->previous_delta);
+		return;
+	}
+
+	th->ack_seq = htonl(ack_seq - vseq->delta);
+	IP_VS_DBG(9, "%s(): subtracted delta "
+		  "(%d) from ack_seq\n", __func__, vseq->delta);
+}
+
+
+/*
+ * Updates ip_vs_seq if pkt has been resized
+ * Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ *
+ * The update happens only if @flag is not yet set in cp->flags or
+ * @seq lies after the recorded init_seq. In that case the current
+ * delta is saved as previous_delta, @diff is accumulated into delta,
+ * init_seq becomes @seq and @flag is set. All of it runs under
+ * cp->lock.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+ unsigned int flag, __u32 seq, int diff)
+{
+ /* spinlock is to keep updating cp->flags atomic */
+ spin_lock_bh(&cp->lock);
+ if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+ vseq->previous_delta = vseq->delta;
+ vseq->delta += diff;
+ vseq->init_seq = seq;
+ cp->flags |= flag;
+ }
+ spin_unlock_bh(&cp->lock);
+}
+
+/*
+ * TCP part of the output hook. Makes the TCP header writable, applies
+ * pending adjustments (out_seq to seq, in_seq to ack_seq), calls the
+ * app's pkt_out hook and records a new out_seq delta when the hook
+ * reports a length change through diff.
+ * Returns 0 if the header cannot be made writable or the hook returns
+ * 0; returns 1 otherwise, including when the app has no pkt_out hook.
+ */
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ const unsigned int tcp_offset = ip_hdrlen(skb);
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_seq(&cp->out_seq, th);
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_ack_seq(&cp->in_seq, th);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ /* NOTE(review): diff is read below only as stored by the hook */
+ if (!app->pkt_out(app, cp, skb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->out_seq,
+ IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ * Output pkt hook, called by the ipvs packet handler with cp != NULL.
+ *
+ * Returns 1 when no application is bound to the connection or the
+ * application has no pkt_out hook. TCP connections go through
+ * app_tcp_pkt_out(); for every other protocol the application's
+ * pkt_out hook is called directly with a NULL diff pointer.
+ * Returns false if the packet can't be handled (oom).
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_app *app = cp->app;
+
+	/* no application module bound to this ip_vs_conn */
+	if (!app)
+		return 1;
+
+	/* TCP needs sequence number fix-ups */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_out(cp, skb, app);
+
+	if (!app->pkt_out)
+		return 1;
+
+	return app->pkt_out(app, cp, skb, NULL);
+}
+
+
+/*
+ * TCP part of the input hook. Makes the TCP header writable, applies
+ * pending adjustments (in_seq to seq, out_seq to ack_seq), calls the
+ * app's pkt_in hook and records a new in_seq delta when the hook
+ * reports a length change through diff.
+ * Returns 0 if the header cannot be made writable or the hook returns
+ * 0; returns 1 otherwise, including when the app has no pkt_in hook.
+ */
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ const unsigned int tcp_offset = ip_hdrlen(skb);
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_seq(&cp->in_seq, th);
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_ack_seq(&cp->out_seq, th);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ /* NOTE(review): diff is read below only as stored by the hook */
+ if (!app->pkt_in(app, cp, skb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->in_seq,
+ IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ * Input pkt hook, called by the ipvs packet handler with cp != NULL.
+ *
+ * Returns 1 when no application is bound to the connection or the
+ * application has no pkt_in hook. TCP connections go through
+ * app_tcp_pkt_in(); for every other protocol the application's
+ * pkt_in hook is called directly with a NULL diff pointer.
+ * Returns false if the packet can't be handled (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_app *app = cp->app;
+
+	/* no application module bound to this ip_vs_conn */
+	if (!app)
+		return 1;
+
+	/* TCP needs sequence number fix-ups */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_in(cp, skb, app);
+
+	if (!app->pkt_in)
+		return 1;
+
+	return app->pkt_in(app, cp, skb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ * /proc/net/ip_vs_app entry function
+ */
+
+/*
+ * Return the incarnation at zero-based position @pos, counting the
+ * incarnations of every application on ipvs->app_list in list order.
+ * Returns NULL when fewer than @pos + 1 incarnations exist.
+ */
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
+{
+	struct ip_vs_app *app, *inc;
+
+	list_for_each_entry(app, &ipvs->app_list, a_list) {
+		list_for_each_entry(inc, &app->incs_list, a_list) {
+			if (pos == 0)
+				return inc;
+			pos--;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file start: takes __ip_vs_app_mutex (released in the stop
+ * callback). Position 0 yields the header token; position N > 0
+ * yields incarnation N - 1.
+ */
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return ip_vs_app_idx(ipvs, *pos - 1);
+}
+
+/*
+ * seq_file next: advance *pos and return the following incarnation.
+ * After the header token this is the first incarnation. Otherwise it
+ * is the next one on the current application's incs_list, or the
+ * first incarnation of the next application that has any. Returns
+ * NULL at the end.
+ */
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_app *inc, *app;
+ struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_app_idx(ipvs, 0);
+
+ inc = v;
+ app = inc->app;
+
+ if ((e = inc->a_list.next) != &app->incs_list)
+ return list_entry(e, struct ip_vs_app, a_list);
+
+ /* go on to next application */
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
+ app = list_entry(e, struct ip_vs_app, a_list);
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ return inc; /* first incarnation, if the list is non-empty */
+ }
+ }
+ return NULL;
+}
+
+/* seq_file stop: release the mutex taken in ip_vs_app_seq_start(). */
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+ mutex_unlock(&__ip_vs_app_mutex);
+}
+
+/*
+ * seq_file show: print the column header for the start token, or one
+ * line per incarnation with protocol name, port (host byte order),
+ * use counter and name. Always returns 0.
+ */
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+	const struct ip_vs_app *inc;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "prot port usecnt name\n");
+		return 0;
+	}
+
+	inc = v;
+	seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
+		   ip_vs_proto_name(inc->protocol),
+		   ntohs(inc->port),
+		   atomic_read(&inc->usecnt),
+		   inc->name);
+	return 0;
+}
+
+/* seq_file iterator callbacks, passed to seq_open_net() in ip_vs_app_open() */
+static const struct seq_operations ip_vs_app_seq_ops = {
+ .start = ip_vs_app_seq_start,
+ .next = ip_vs_app_seq_next,
+ .stop = ip_vs_app_seq_stop,
+ .show = ip_vs_app_seq_show,
+};
+
+/*
+ * open() handler of the proc file: sets up a per-netns seq_file that
+ * uses ip_vs_app_seq_ops. Returns the result of seq_open_net().
+ */
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+/* File operations of the "ip_vs_app" proc entry created in ip_vs_app_net_init() */
+static const struct file_operations ip_vs_app_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_app_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+int __net_init ip_vs_app_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+ return 0;
+}
+
+/*
+ * Per-netns cleanup: unregister every application of the namespace
+ * (a NULL app means "all"), then remove the "ip_vs_app" proc entry.
+ */
+void __net_exit ip_vs_app_net_cleanup(struct net *net)
+{
+ unregister_ip_vs_app(net, NULL /* all */);
+ remove_proc_entry("ip_vs_app", net->proc_net);
+}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
new file mode 100644
index 00000000000..610e19c0e13
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1376 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Much of the code here is taken from the IP MASQ code of
+ * kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h> /* for proc_net_* */
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+*/
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/* size and mask values */
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
+
+/*
+ * Connection hash table: for input and output packets lookups of IPVS
+ */
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
+
+/* SLAB cache for IPVS connections */
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
+
+/* counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd __read_mostly;
+
+/*
+ * Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS 5
+#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
+
+/* One spinlock per array slot, aligned to SMP_CACHE_BYTES */
+struct ip_vs_aligned_lock
+{
+ spinlock_t l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+/*
+ * Take (with bottom halves disabled) the lock that covers @key; the
+ * lock is chosen by the low CT_LOCKARRAY_BITS bits of @key.
+ */
+static inline void ct_write_lock_bh(unsigned int key)
+{
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+/* Release the lock taken by ct_write_lock_bh() for the same @key. */
+static inline void ct_write_unlock_bh(unsigned int key)
+{
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ * Returns hash value for IPVS connection entry
+ *
+ * The hash is a jhash over address, port and protocol seeded with
+ * ip_vs_conn_rnd, XORed with bits of the netns pointer and masked
+ * with ip_vs_conn_tab_mask. For AF_INET6 (only with
+ * CONFIG_IP_VS_IPV6) the 16 address bytes are hashed first;
+ * otherwise addr->ip is used directly.
+ */
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
+ const union nf_inet_addr *addr,
+ __be16 port)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+#endif
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+}
+
+/*
+ * Hash value for a set of connection parameters.
+ *
+ * When persistence-engine data is present and the engine provides
+ * hashkey_raw, that hook computes the (masked) hash. Otherwise the
+ * client address/port are hashed, or the virtual address/port when
+ * @inverse is set.
+ */
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+					     bool inverse)
+{
+	const union nf_inet_addr *addr;
+	__be16 port;
+
+	if (p->pe_data && p->pe->hashkey_raw)
+		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+			ip_vs_conn_tab_mask;
+
+	if (unlikely(inverse)) {
+		addr = p->vaddr;
+		port = p->vport;
+	} else {
+		addr = p->caddr;
+		port = p->cport;
+	}
+
+	return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
+}
+
+/*
+ * Hash value for an existing connection: builds a parameter block
+ * from the connection's netns, af, protocol and client address/port
+ * (no virtual address), copies the persistence-engine fields when the
+ * connection has a pe, and hashes it non-inverted.
+ */
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
+
+ if (cp->pe) {
+ p.pe = cp->pe;
+ p.pe_data = cp->pe_data;
+ p.pe_data_len = cp->pe_data_len;
+ }
+
+ return ip_vs_conn_hashkey_param(&p, false);
+}
+
+/*
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
+ * returns bool success.
+ *
+ * One-packet connections are never hashed (returns 0). Otherwise,
+ * under the bucket lock and cp->lock, an unhashed connection is
+ * flagged HASHED, gains one reference and is added to the head of
+ * its bucket. A connection that is already hashed is reported with
+ * pr_err() and left alone (returns 0).
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ int ret;
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return 0;
+
+ /* Hash by protocol, client address and port */
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ atomic_inc(&cp->refcnt); /* reference held by the table */
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
+ ret = 1;
+ } else {
+ pr_err("%s(): request for already hashed, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ ret = 0;
+ }
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+
+/*
+ * UNhashes ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success. Caller should hold conn reference.
+ *
+ * Under the bucket lock and cp->lock: a hashed connection is removed
+ * from its bucket, loses the HASHED flag and one reference
+ * (returns 1). A connection that is not hashed is left untouched
+ * (returns 0).
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ int ret;
+
+ /* unhash it and decrease its reference counter */
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ atomic_dec(&cp->refcnt);
+ ret = 1;
+ } else
+ ret = 0;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ *
+ * For a hashed connection the unlink happens only if the reference
+ * count can be switched atomically from 1 to 0, i.e. the table holds
+ * the last reference; otherwise false is returned and nothing
+ * changes. For a connection that is not hashed the result is true
+ * exactly when its reference count is already 0.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from OUTside-to-INside.
+ * p->caddr, p->cport: pkt source address (foreign host)
+ * p->vaddr, p->vport: pkt dest address (load balancer)
+ *
+ * Walks the bucket under rcu_read_lock(). A matching entry is
+ * returned only if __ip_vs_conn_get() succeeds on it; otherwise the
+ * search continues. Returns NULL when nothing matches.
+ */
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey_param(p, false);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+ ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && /* zero cport only matches NO_CPORT entries and vice versa */
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
+ /* HIT */
+ rcu_read_unlock();
+ return cp;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+/*
+ * Look up a connection for an incoming packet. If the exact lookup
+ * misses and at least one no-client-port connection exists
+ * (ip_vs_conn_no_cport_cnt != 0), the lookup is retried with the
+ * client port set to 0. Returns the connection found, or NULL.
+ */
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+ struct ip_vs_conn *cp;
+
+ cp = __ip_vs_conn_in_get(p);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+ struct ip_vs_conn_param cport_zero_p = *p;
+ cport_zero_p.cport = 0;
+ cp = __ip_vs_conn_in_get(&cport_zero_p);
+ }
+
+ IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ cp ? "hit" : "not hit");
+
+ return cp;
+}
+
+/*
+ * Fill @p from a packet: reads the two port numbers found at offset
+ * iph->len and combines them with the addresses and protocol from
+ * @iph. With @inverse set, source and destination swap roles.
+ * Returns 0 on success, 1 if the ports cannot be read.
+ */
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ int inverse, struct ip_vs_conn_param *p)
+{
+ __be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
+
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL)
+ return 1;
+
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
+ else
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
+ return 0;
+}
+
+/*
+ * Build lookup parameters from the packet and do the incoming-packet
+ * lookup. Returns NULL when the parameters cannot be filled or no
+ * connection is found.
+ */
+struct ip_vs_conn *
+ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_in_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
+
+/* Get reference to connection template
+ *
+ * Walks the bucket under rcu_read_lock(). When persistence-engine
+ * data is present and the engine provides ct_match, only entries in
+ * the same netns with the same pe that ct_match accepts are
+ * considered. Otherwise entries are matched on addresses, ports,
+ * protocol, netns and the TEMPLATE flag. The first candidate on
+ * which __ip_vs_conn_get() succeeds is returned; NULL if there is
+ * none.
+ */
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey_param(p, false);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
+ continue;
+ }
+
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * p->vaddr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+ p->af, p->vaddr, &cp->vaddr) &&
+ p->vport == cp->vport && p->cport == cp->cport &&
+ cp->flags & IP_VS_CONN_F_TEMPLATE &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
+ }
+ cp = NULL; /* loop finished without a usable match */
+
+ out:
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ cp ? "hit" : "not hit");
+
+ return cp;
+}
+
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * p->caddr, p->cport: pkt source address (inside host)
+ * p->vaddr, p->vport: pkt dest address (foreign host)
+ *
+ * The bucket is chosen with the inverse hash (p->vaddr/p->vport).
+ * An entry matches when p->vaddr/vport equal its client address/port
+ * and p->caddr/cport equal its destination address/port. The first
+ * match on which __ip_vs_conn_get() succeeds is returned; NULL if
+ * there is none. */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp, *ret=NULL;
+
+ /*
+ * Check for "full" addressed entries
+ */
+ hash = ip_vs_conn_hashkey_param(p, true);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
+ /* HIT */
+ ret = cp;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+/*
+ * Build lookup parameters from the packet and do the outgoing-packet
+ * lookup. Returns NULL when the parameters cannot be filled or no
+ * connection is found.
+ */
+struct ip_vs_conn *
+ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_out_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+
+/*
+ * Put back the conn and restart its timer with its timeout.
+ * One-packet connections get a zero delay, so their timer is set to
+ * the current jiffies value.
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+	unsigned long delay = cp->timeout;
+
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		delay = 0;
+
+	mod_timer(&cp->timer, jiffies+delay);
+
+	__ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Fill a no_client_port connection with a client port number
+ *
+ * Only acts if the connection could be unhashed. Under cp->lock, a
+ * connection still flagged NO_CPORT gets @cport stored, the flag
+ * cleared and the global no-cport counter decremented. The
+ * connection is then hashed again, since the hash key includes the
+ * client port.
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+{
+ if (ip_vs_conn_unhash(cp)) {
+ spin_lock_bh(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ cp->cport = cport;
+ }
+ spin_unlock_bh(&cp->lock);
+
+ /* hash on new cport */
+ ip_vs_conn_hash(cp);
+ }
+}
+
+
+/*
+ * Bind a connection entry with the corresponding packet_xmit.
+ * Called by ip_vs_conn_new.
+ *
+ * Selects the IPv4 transmitter from the connection's forwarding
+ * method. There is no default case: an unlisted method leaves
+ * cp->packet_xmit unchanged.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit;
+ break;
+ }
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 counterpart of ip_vs_bind_xmit(): selects the _v6 transmitter
+ * from the connection's forwarding method (LOCALNODE uses the shared
+ * ip_vs_null_xmit). An unlisted method leaves cp->packet_xmit
+ * unchanged.
+ */
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+ cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit_v6;
+ break;
+ }
+}
+#endif
+
+
+/* Sum of the destination's active and inactive connection counters. */
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->activeconns)
+ + atomic_read(&dest->inactconns);
+}
+
+/*
+ * Bind a connection entry with a virtual service destination
+ * Called just after a new connection entry is created.
+ *
+ * Takes a reference on @dest, merges the destination's connection
+ * flags into cp->flags, stores @dest in cp->dest and bumps the
+ * matching counter: activeconns or inactconns for a normal
+ * connection, persistconns for a template. Finally the destination
+ * is marked overloaded when an upper threshold is set and the total
+ * connection count has reached it. Does nothing when @dest is NULL.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+ unsigned int conn_flags;
+ __u32 flags;
+
+ /* if dest is NULL, then return directly */
+ if (!dest)
+ return;
+
+ /* Increase the refcnt counter of the dest */
+ ip_vs_dest_hold(dest);
+
+ conn_flags = atomic_read(&dest->conn_flags);
+ if (cp->protocol != IPPROTO_UDP)
+ conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; /* one-packet mode is kept for UDP only */
+ flags = cp->flags;
+ /* Bind with the destination and its corresponding transmitter */
+ if (flags & IP_VS_CONN_F_SYNC) {
+ /* if the connection is not template and is created
+ * by sync, preserve the activity flag.
+ */
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* connections inherit forwarding method from dest */
+ flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
+ }
+ flags |= conn_flags;
+ cp->flags = flags;
+ cp->dest = dest;
+
+ IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+ "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so modify the counters
+ * according to the flags, later the protocol can
+ * update them on state change
+ */
+ if (!(flags & IP_VS_CONN_F_INACTIVE))
+ atomic_inc(&dest->activeconns);
+ else
+ atomic_inc(&dest->inactconns);
+ } else {
+ /* It is a persistent connection/template, so increase
+ the persistent connection counter */
+ atomic_inc(&dest->persistconns);
+ }
+
+ if (dest->u_threshold != 0 &&
+ ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+ dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Check if there is a destination for the connection, if so
+ * bind the connection to the destination.
+ *
+ * Runs under rcu_read_lock(). If a destination is found and the
+ * connection has none yet (checked under cp->lock), any bound
+ * application is unbound, the destination is bound, the packet
+ * transmitter is re-selected for the address family and, when the
+ * protocol has registered applications (appcnt != 0), the
+ * application binding is redone.
+ */
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest;
+
+ rcu_read_lock();
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark, cp->flags);
+ if (dest) {
+ struct ip_vs_proto_data *pd;
+
+ spin_lock_bh(&cp->lock);
+ if (cp->dest) { /* already bound: nothing to do */
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Applications work depending on the forwarding method
+ * but better to reassign them always when binding dest */
+ if (cp->app)
+ ip_vs_unbind_app(cp);
+
+ ip_vs_bind_dest(cp, dest);
+ spin_unlock_bh(&cp->lock);
+
+ /* Update its packet transmitter */
+ cp->packet_xmit = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ if (pd && atomic_read(&pd->appcnt))
+ ip_vs_bind_app(cp, pd->pp);
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ *	Detach a connection from its destination: decrement the matching
+ *	per-destination connection counter, re-evaluate the destination's
+ *	overload flag and drop the destination with ip_vs_dest_put().
+ *	Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	atomic_t *counter;
+	int clear_overload;
+
+	if (!dest)
+		return;
+
+	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
+
+	/* Templates are counted as persistent connections; normal
+	 * connections are counted as either inactive or active.
+	 */
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+		counter = &dest->persistconns;
+	else if (cp->flags & IP_VS_CONN_F_INACTIVE)
+		counter = &dest->inactconns;
+	else
+		counter = &dest->activeconns;
+	atomic_dec(counter);
+
+	/* Decide whether the overload flag may be cleared: below the
+	 * lower threshold if one is set, else below 3/4 of the upper
+	 * threshold, else whenever the flag is currently set.
+	 */
+	if (dest->l_threshold != 0)
+		clear_overload = ip_vs_dest_totalconns(dest) <
+				 dest->l_threshold;
+	else if (dest->u_threshold != 0)
+		clear_overload = ip_vs_dest_totalconns(dest) * 4 <
+				 dest->u_threshold * 3;
+	else
+		clear_overload = (dest->flags & IP_VS_DEST_F_OVERLOAD) != 0;
+
+	if (clear_overload)
+		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+
+	ip_vs_dest_put(dest);
+}
+
+/*
+ *	Return 1 when the expire_quiescent_template sysctl is set and the
+ *	destination's weight is zero, otherwise 0.  Always 0 when the
+ *	kernel is built without CONFIG_SYSCTL.
+ */
+static int expire_quiescent_template(struct netns_ipvs *ipvs,
+				     struct ip_vs_dest *dest)
+{
+#ifdef CONFIG_SYSCTL
+	if (!ipvs->sysctl_expire_quiescent_template)
+		return 0;
+
+	return atomic_read(&dest->weight) == 0;
+#else
+	return 0;
+#endif
+}
+
+/*
+ *	Check whether the destination of a connection template can still
+ *	be used.  Returns 1 if it can.  Otherwise the template is
+ *	invalidated (rehashed with ports 0xffff/0xffff/0), the reference
+ *	on it is dropped and 0 is returned.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+	struct ip_vs_dest *dest = ct->dest;
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
+
+	/* Destination present, available and not quiescent-expired */
+	if (dest != NULL &&
+	    (dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+	    !expire_quiescent_template(ipvs, dest))
+		return 1;
+
+	IP_VS_DBG_BUF(9, "check_template: dest not available for "
+		      "protocol %s s:%s:%d v:%s:%d "
+		      "-> d:%s:%d\n",
+		      ip_vs_proto_name(ct->protocol),
+		      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+		      ntohs(ct->cport),
+		      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+		      ntohs(ct->vport),
+		      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
+		      ntohs(ct->dport));
+
+	/* Invalidate the template unless that was already done
+	 * (vport == 0xffff marks an invalidated template).
+	 */
+	if (ct->vport != htons(0xffff) && ip_vs_conn_unhash(ct)) {
+		ct->dport = htons(0xffff);
+		ct->vport = htons(0xffff);
+		ct->cport = 0;
+		ip_vs_conn_hash(ct);
+	}
+
+	/* Only drop the template's refcnt; do not restart its timer */
+	__ip_vs_conn_put(ct);
+	return 0;
+}
+
+/*
+ *	RCU callback queued by ip_vs_conn_expire(): drop the persistence
+ *	engine reference, free the engine data and return the connection
+ *	to its slab cache.
+ */
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_conn *conn;
+
+	conn = container_of(head, struct ip_vs_conn, rcu_head);
+
+	ip_vs_pe_put(conn->pe);
+	kfree(conn->pe_data);
+	kmem_cache_free(ip_vs_conn_cachep, conn);
+}
+
+/*
+ * Timer handler of a connection (installed with setup_timer() in
+ * ip_vs_conn_new()).  @data is the struct ip_vs_conn pointer cast to
+ * unsigned long.
+ *
+ * If the connection controls no other connection and can be unlinked,
+ * it is torn down and freed via call_rcu().  Otherwise the expiration
+ * is postponed: the timeout is set to 60*HZ and the connection is
+ * released again with ip_vs_conn_put().
+ */
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct net *net = ip_vs_conn_net(cp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /*
+ * Does this connection still control other connections?
+ * If so, it must not be freed yet.
+ */
+ if (atomic_read(&cp->n_control))
+ goto expire_later;
+
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
+ /* delete the timer if it is activated by other users */
+ del_timer(&cp->timer);
+
+ /* Is this connection controlled by another one? Detach it. */
+ if (cp->control)
+ ip_vs_control_del(cp);
+
+ if (cp->flags & IP_VS_CONN_F_NFCT) {
+ /* Do not access conntracks during subsys cleanup
+ * because nf_conntrack_find_get can not be used after
+ * conntrack cleanup for the net.
+ */
+ smp_rmb();
+ if (ipvs->enable)
+ ip_vs_conn_drop_conntrack(cp);
+ }
+
+ if (unlikely(cp->app != NULL))
+ ip_vs_unbind_app(cp);
+ ip_vs_unbind_dest(cp);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ /* The memory is released by ip_vs_conn_rcu_free() */
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ atomic_dec(&ipvs->conn_count);
+ return;
+ }
+
+ expire_later:
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
+ atomic_read(&cp->n_control));
+
+ /* Take a reference that the ip_vs_conn_put() below gives back */
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
+ /* NOTE(review): presumably this also re-arms the timer - confirm */
+ ip_vs_conn_put(cp);
+}
+
+/* Make the connection timer fire as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+	if (!timer_pending(&cp->timer))
+		return;
+
+	/* Nothing to do when the timer is not set in the future */
+	if (!time_after(cp->timer.expires, jiffies))
+		return;
+
+	/* mod_timer_pending() guarantees that the timer is not modified
+	 * after the final del_timer in ip_vs_conn_expire.
+	 */
+	mod_timer_pending(&cp->timer, jiffies);
+}
+
+
+/*
+ * Create a new connection entry and hash it into the ip_vs_conn_tab
+ *
+ * @p:      connection parameters (net, af, protocol, client and
+ *          virtual address/port, optional persistence engine data)
+ * @daddr:  destination address stored in the entry
+ * @dport:  destination port stored in the entry
+ * @flags:  initial IP_VS_CONN_F_* flags
+ * @dest:   destination handed to ip_vs_bind_dest()
+ * @fwmark: firewall mark stored in the entry
+ *
+ * Returns the new entry with its refcnt set to 1, or NULL if the
+ * allocation from ip_vs_conn_cachep fails.
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
+ const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
+ struct ip_vs_dest *dest, __u32 fwmark)
+{
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
+
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ if (cp == NULL) {
+ IP_VS_ERR_RL("%s(): no memory\n", __func__);
+ return NULL;
+ }
+
+ INIT_HLIST_NODE(&cp->c_list);
+ /* ip_vs_conn_expire() runs when the connection timer fires */
+ setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
+ cp->af = p->af;
+ cp->protocol = p->protocol;
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
+ cp->cport = p->cport;
+ /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->vaddr, p->vaddr);
+ cp->vport = p->vport;
+ ip_vs_addr_set(p->af, &cp->daddr, daddr);
+ cp->dport = dport;
+ cp->flags = flags;
+ cp->fwmark = fwmark;
+ /* Only templates take over the persistence engine and its data */
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
+ cp->pe_data = p->pe_data;
+ cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
+ }
+ spin_lock_init(&cp->lock);
+
+ /*
+ * Mark the entry as referenced by the current thread before hashing
+ * it into the table, so that another thread running
+ * ip_vs_random_dropentry cannot drop this entry.
+ */
+ atomic_set(&cp->refcnt, 1);
+
+ cp->control = NULL;
+ atomic_set(&cp->n_control, 0);
+ atomic_set(&cp->in_pkts, 0);
+
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
+ atomic_inc(&ipvs->conn_count);
+ if (flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+ /* Bind the connection with a destination server */
+ cp->dest = NULL;
+ ip_vs_bind_dest(cp, dest);
+
+ /* Set its state and timeout */
+ cp->state = 0;
+ cp->old_state = 0;
+ cp->timeout = 3*HZ;
+ cp->sync_endtime = jiffies & ~3UL;
+
+ /* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+ if (p->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ /* Bind an application helper if the protocol has any registered */
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
+
+ /*
+ * Allow conntrack to be preserved. By default, conntrack
+ * is created and destroyed for every packet.
+ * Sometimes keeping conntrack can be useful for
+ * IP_VS_CONN_F_ONE_PACKET too.
+ */
+
+ if (ip_vs_conntrack_enabled(ipvs))
+ cp->flags |= IP_VS_CONN_F_NFCT;
+
+ /* Hash it in the ip_vs_conn_tab finally */
+ ip_vs_conn_hash(cp);
+
+ return cp;
+}
+
+/*
+ * /proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+/* Private iterator state of the connection table seq_file readers */
+struct ip_vs_iter_state {
+ /* first member; the files are opened with seq_open_net() */
+ struct seq_net_private p;
+ /* hash chain of the entry returned last, NULL when there is none */
+ struct hlist_head *l;
+};
+
+/*
+ * Return the connection at position @pos (0-based, counted over all
+ * hash chains in table order) and remember its chain in the iterator
+ * state.  Returns NULL when the table holds fewer entries.
+ */
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+	struct ip_vs_iter_state *iter = seq->private;
+	struct ip_vs_conn *cp;
+	int bucket;
+
+	for (bucket = 0; bucket < ip_vs_conn_tab_size; bucket++) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[bucket], c_list) {
+			/* __ip_vs_conn_get() is not needed by
+			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+			 */
+			if (pos != 0) {
+				pos--;
+				continue;
+			}
+			iter->l = &ip_vs_conn_tab[bucket];
+			return cp;
+		}
+		cond_resched_rcu();
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file ->start: reset the iterator, enter the RCU read side and
+ * return either the header token (position 0) or the entry at
+ * position *pos - 1.
+ */
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct ip_vs_iter_state *iter = seq->private;
+
+	iter->l = NULL;
+	rcu_read_lock();
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return ip_vs_conn_array(seq, *pos - 1);
+}
+
+/*
+ * seq_file ->next: advance *pos and return the entry following @v,
+ * first on the same hash chain, then on the following chains.
+ * Returns NULL at the end of the table.
+ */
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_head *chain = iter->l;
+	struct ip_vs_conn *cp = v;
+	struct hlist_node *next;
+	int idx;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_conn_array(seq, 0);
+
+	/* more on same hash chain? */
+	next = rcu_dereference(hlist_next_rcu(&cp->c_list));
+	if (next)
+		return hlist_entry(next, struct ip_vs_conn, c_list);
+
+	/* continue with the first entry of the next non-empty chain */
+	idx = chain - ip_vs_conn_tab;
+	for (idx++; idx < ip_vs_conn_tab_size; idx++) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+			iter->l = &ip_vs_conn_tab[idx];
+			return cp;
+		}
+		cond_resched_rcu();
+	}
+
+	iter->l = NULL;
+	return NULL;
+}
+
+/* seq_file ->stop: leave the RCU read side entered in ip_vs_conn_seq_start */
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show for ip_vs_conn: print the header line for the start
+ * token, otherwise one line for connection @v.  Connections that
+ * belong to another network namespace are skipped silently.
+ * Always returns 0.
+ */
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+ char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+ size_t len = 0;
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ /* Build " <pe name> <pe data>"; stays empty without pe_data */
+ if (cp->pe_data) {
+ pe_data[0] = ' ';
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
+ pe_data[len + 1] = ' ';
+ len += 2;
+ len += cp->pe->show_pe_data(cp, pe_data + len);
+ }
+ pe_data[len] = '\0';
+
+ /* The last number is the time left on the timer, in seconds */
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%pI6 %04X %-11s %7lu%s\n",
+ ip_vs_proto_name(cp->protocol),
+ &cp->caddr.in6, ntohs(cp->cport),
+ &cp->vaddr.in6, ntohs(cp->vport),
+ &cp->daddr.in6, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ, pe_data);
+ else
+#endif
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X"
+ " %08X %04X %-11s %7lu%s\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr.ip), ntohs(cp->cport),
+ ntohl(cp->vaddr.ip), ntohs(cp->vport),
+ ntohl(cp->daddr.ip), ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ, pe_data);
+ }
+ return 0;
+}
+
+/* Iterator callbacks behind the ip_vs_conn proc file */
+static const struct seq_operations ip_vs_conn_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_seq_show,
+};
+
+/*
+ * ->open of the ip_vs_conn proc file: set up a per-netns seq_file with
+ * a struct ip_vs_iter_state as private data.
+ */
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
+}
+
+/* File operations of the ip_vs_conn proc file (see ip_vs_conn_net_init) */
+static const struct file_operations ip_vs_conn_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/*
+ * Name the origin of a connection for the ip_vs_conn_sync listing:
+ * "SYNC" when IP_VS_CONN_F_SYNC is set in @flags, "LOCAL" otherwise.
+ */
+static const char *ip_vs_origin_name(unsigned int flags)
+{
+	return (flags & IP_VS_CONN_F_SYNC) ? "SYNC" : "LOCAL";
+}
+
+/*
+ * seq_file ->show for ip_vs_conn_sync: like ip_vs_conn_seq_show, but
+ * with an Origin column (see ip_vs_origin_name) instead of the
+ * persistence engine columns.  Connections that belong to another
+ * network namespace are skipped silently.  Always returns 0.
+ */
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+
+ /* The last number is the time left on the timer, in seconds */
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ &cp->caddr.in6, ntohs(cp->cport),
+ &cp->vaddr.in6, ntohs(cp->vport),
+ &cp->daddr.in6, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ else
+#endif
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X "
+ "%08X %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr.ip), ntohs(cp->cport),
+ ntohl(cp->vaddr.ip), ntohs(cp->vport),
+ ntohl(cp->daddr.ip), ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ }
+ return 0;
+}
+
+/*
+ * Iterator callbacks behind the ip_vs_conn_sync proc file; only ->show
+ * differs from ip_vs_conn_seq_ops.
+ */
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_sync_seq_show,
+};
+
+/*
+ * ->open of the ip_vs_conn_sync proc file: set up a per-netns seq_file
+ * with a struct ip_vs_iter_state as private data.
+ */
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
+}
+
+/* File operations of the ip_vs_conn_sync proc file */
+static const struct file_operations ip_vs_conn_sync_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_sync_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+
+/*
+ *      Randomly drop connection entries before running out of memory
+ *
+ *      Returns 1 when the entry should be dropped, 0 otherwise.
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+	/*
+	 * The drop rate array needs tuning for real environments.
+	 * Called from timer bh only => no locking
+	 */
+	static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+	static char todrop_counter[9] = {0};
+	int pkts;
+
+	/* Entries that have not lasted for 60 seconds are kept, which
+	 * leaves enough time for a normal connection to get through.
+	 */
+	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+		return 0;
+
+	/* Only entries with an incoming packet count in [0, 8] qualify */
+	pkts = atomic_read(&cp->in_pkts);
+	if (pkts < 0 || pkts > 8)
+		return 0;
+
+	/* A rate of zero means never drop */
+	if (todrop_rate[pkts] == 0)
+		return 0;
+
+	/* Drop only when the per-rate countdown runs out */
+	todrop_counter[pkts]--;
+	if (todrop_counter[pkts] > 0)
+		return 0;
+
+	todrop_counter[pkts] = todrop_rate[pkts];
+	return 1;
+}
+
+/* Called from keventd and must protect itself from softirqs
+ *
+ * Visit ip_vs_conn_tab_size/32 randomly chosen hash chains and expire
+ * the connections of @net that qualify for dropping, together with the
+ * connection that controls them, if any.
+ */
+void ip_vs_random_dropentry(struct net *net)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+
+ rcu_read_lock();
+ /*
+ * Randomly scan 1/32 of the whole table every second
+ */
+ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+ unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ /* connection template */
+ continue;
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
+ /*
+ * In the switches below "break" selects the entry for
+ * dropping and "continue" keeps it.
+ */
+ if (cp->protocol == IPPROTO_TCP) {
+ switch(cp->state) {
+ case IP_VS_TCP_S_SYN_RECV:
+ case IP_VS_TCP_S_SYNACK:
+ break;
+
+ case IP_VS_TCP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+
+ default:
+ continue;
+ }
+ } else if (cp->protocol == IPPROTO_SCTP) {
+ switch (cp->state) {
+ case IP_VS_SCTP_S_INIT1:
+ case IP_VS_SCTP_S_INIT:
+ break;
+ case IP_VS_SCTP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+ default:
+ continue;
+ }
+ } else {
+ if (!todrop_entry(cp))
+ continue;
+ }
+
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
+ }
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ *      Flush all the connection entries of @net in the ip_vs_conn_tab.
+ *      The table is rescanned until the namespace's connection count
+ *      has dropped to zero.
+ */
+static void ip_vs_conn_flush(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_conn *cp, *ctl;
+	int idx;
+
+	for (;;) {
+		rcu_read_lock();
+		for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+			hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx],
+						 c_list) {
+				if (!ip_vs_conn_net_eq(cp, net))
+					continue;
+				IP_VS_DBG(4, "del connection\n");
+				ip_vs_conn_expire_now(cp);
+				ctl = cp->control;
+				/* cp->control is valid only with reference
+				 * to cp
+				 */
+				if (ctl && __ip_vs_conn_get(cp)) {
+					IP_VS_DBG(4, "del conn template\n");
+					ip_vs_conn_expire_now(ctl);
+					__ip_vs_conn_put(cp);
+				}
+			}
+			cond_resched_rcu();
+		}
+		rcu_read_unlock();
+
+		/* The counter may still be non-zero: some entries may be
+		 * handled by a slow timer handler, or be unhashed but
+		 * still referenced.  Yield and scan again in that case.
+		 */
+		if (atomic_read(&ipvs->conn_count) == 0)
+			break;
+		schedule();
+	}
+}
+/*
+ * per netns init and exit
+ */
+int __net_init ip_vs_conn_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ atomic_set(&ipvs->conn_count, 0);
+
+ proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+ return 0;
+}
+
+/*
+ * Per-namespace teardown: expire all connections of @net, then remove
+ * the two proc files created by ip_vs_conn_net_init().
+ */
+void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ remove_proc_entry("ip_vs_conn", net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+}
+
+/*
+ * Module-wide initialisation of the connection table: compute the
+ * table size and mask from ip_vs_conn_tab_bits, allocate the hash
+ * table and the connection slab cache, initialise the chain heads and
+ * the table lock array, and seed the hash with random bytes.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails (nothing
+ * stays allocated in that case).
+ */
+int __init ip_vs_conn_init(void)
+{
+	int idx;
+
+	/* Compute size and mask */
+	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
+	/*
+	 * Allocate the connection hash table and initialize its list heads
+	 */
+	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
+	if (!ip_vs_conn_tab)
+		return -ENOMEM;
+
+	/* Allocate ip_vs_conn slab cache */
+	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+					      sizeof(struct ip_vs_conn), 0,
+					      SLAB_HWCACHE_ALIGN, NULL);
+	if (!ip_vs_conn_cachep) {
+		vfree(ip_vs_conn_tab);
+		return -ENOMEM;
+	}
+
+	/* Report the size that was really allocated above: the element
+	 * size of the table, not that of a struct list_head.
+	 */
+	pr_info("Connection hash table configured "
+		"(size=%d, memory=%ldKbytes)\n",
+		ip_vs_conn_tab_size,
+		(long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
+	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+		  sizeof(struct ip_vs_conn));
+
+	for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
+
+	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
+		spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
+	}
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+	return 0;
+}
+
+/*
+ * Module-wide teardown: counterpart of ip_vs_conn_init().  Frees the
+ * connection slab cache and the hash table.
+ */
+void ip_vs_conn_cleanup(void)
+{
+ /* Wait for all pending ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
+ /* Release the empty cache */
+ kmem_cache_destroy(ip_vs_conn_cachep);
+ vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
new file mode 100644
index 00000000000..e6836755c45
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,2105 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ * Paul `Rusty' Russell properly handle non-linear skbs
+ * Harald Welte don't use nfcache
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sctp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h>
+#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
+#endif
+
+#include <net/ip_vs.h>
+
+
+/* Symbols made available to other modules */
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+
+/* NOTE(review): presumably the net_generic() id of IPVS - confirm */
+static int ip_vs_net_id __read_mostly;
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
+
+/*
+ * ID used in ICMP lookups: the echo identifier of an ICMP / ICMPv6
+ * header.  The argument is parenthesised in both macros so that any
+ * pointer expression can be passed.
+ */
+#define icmp_id(icmph) (((icmph)->un).echo.id)
+#define icmpv6_id(icmph) ((icmph)->icmp6_dataun.u_echo.identifier)
+
+/*
+ * Return a printable name for IP protocol number @proto.
+ *
+ * Known protocols yield a string literal.  Any other value is
+ * formatted as "IP_<number>" into a static buffer that is shared by
+ * all callers, so that result is only valid until the next call with
+ * an unknown protocol and may be overwritten by a concurrent caller.
+ */
+const char *ip_vs_proto_name(unsigned int proto)
+{
+	static char buf[20];
+
+	switch (proto) {
+	case IPPROTO_IP:
+		return "IP";
+	case IPPROTO_UDP:
+		return "UDP";
+	case IPPROTO_TCP:
+		return "TCP";
+	case IPPROTO_SCTP:
+		return "SCTP";
+	case IPPROTO_ICMP:
+		return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+	case IPPROTO_ICMPV6:
+		return "ICMPv6";
+#endif
+	default:
+		/* Bounded write: never run past the static buffer */
+		snprintf(buf, sizeof(buf), "IP_%u", proto);
+		return buf;
+	}
+}
+
+/* Initialise each of the @rows list heads of @table as an empty list */
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+	int row;
+
+	for (row = rows - 1; row >= 0; row--)
+		INIT_LIST_HEAD(&table[row]);
+}
+
+/*
+ * Account an incoming packet of connection @cp: bump the per-CPU
+ * packet and byte counters of the destination, of its service and of
+ * the namespace totals.  Nothing is counted when the connection has no
+ * destination or the destination is not available.
+ */
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+	struct ip_vs_cpu_stats *cpu_stats;
+	struct ip_vs_service *svc;
+
+	if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE))
+		return;
+
+	/* destination counters */
+	cpu_stats = this_cpu_ptr(dest->stats.cpustats);
+	cpu_stats->ustats.inpkts++;
+	u64_stats_update_begin(&cpu_stats->syncp);
+	cpu_stats->ustats.inbytes += skb->len;
+	u64_stats_update_end(&cpu_stats->syncp);
+
+	/* service counters; dest->svc is read under RCU */
+	rcu_read_lock();
+	svc = rcu_dereference(dest->svc);
+	cpu_stats = this_cpu_ptr(svc->stats.cpustats);
+	cpu_stats->ustats.inpkts++;
+	u64_stats_update_begin(&cpu_stats->syncp);
+	cpu_stats->ustats.inbytes += skb->len;
+	u64_stats_update_end(&cpu_stats->syncp);
+	rcu_read_unlock();
+
+	/* namespace totals */
+	cpu_stats = this_cpu_ptr(ipvs->tot_stats.cpustats);
+	cpu_stats->ustats.inpkts++;
+	u64_stats_update_begin(&cpu_stats->syncp);
+	cpu_stats->ustats.inbytes += skb->len;
+	u64_stats_update_end(&cpu_stats->syncp);
+}
+
+
+/*
+ * Account an outgoing packet of connection @cp: bump the per-CPU
+ * packet and byte counters of the destination, of its service and of
+ * the namespace totals.  Nothing is counted when the connection has no
+ * destination or the destination is not available.
+ */
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+	struct ip_vs_cpu_stats *cpu_stats;
+	struct ip_vs_service *svc;
+
+	if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE))
+		return;
+
+	/* destination counters */
+	cpu_stats = this_cpu_ptr(dest->stats.cpustats);
+	cpu_stats->ustats.outpkts++;
+	u64_stats_update_begin(&cpu_stats->syncp);
+	cpu_stats->ustats.outbytes += skb->len;
+	u64_stats_update_end(&cpu_stats->syncp);
+
+	/* service counters; dest->svc is read under RCU */
+	rcu_read_lock();
+	svc = rcu_dereference(dest->svc);
+	cpu_stats = this_cpu_ptr(svc->stats.cpustats);
+	cpu_stats->ustats.outpkts++;
+	u64_stats_update_begin(&cpu_stats->syncp);
+	cpu_stats->ustats.outbytes += skb->len;
+	u64_stats_update_end(&cpu_stats->syncp);
+	rcu_read_unlock();
+
+	/* namespace totals */
+	cpu_stats = this_cpu_ptr(ipvs->tot_stats.cpustats);
+	cpu_stats->ustats.outpkts++;
+	u64_stats_update_begin(&cpu_stats->syncp);
+	cpu_stats->ustats.outbytes += skb->len;
+	u64_stats_update_end(&cpu_stats->syncp);
+}
+
+
+/*
+ * Account a new connection: bump the per-CPU connection counter of
+ * the connection's destination, of service @svc and of the namespace
+ * totals.  cp->dest is dereferenced without a NULL check.
+ */
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+	this_cpu_ptr(cp->dest->stats.cpustats)->ustats.conns++;
+	this_cpu_ptr(svc->stats.cpustats)->ustats.conns++;
+	this_cpu_ptr(ipvs->tot_stats.cpustats)->ustats.conns++;
+}
+
+
+/*
+ * Run the protocol's state_transition handler for connection @cp, if
+ * the protocol provides one.
+ */
+static inline void
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+		const struct sk_buff *skb,
+		struct ip_vs_proto_data *pd)
+{
+	if (unlikely(!pd->pp->state_transition))
+		return;
+
+	pd->pp->state_transition(cp, direction, skb, pd);
+}
+
+/*
+ * Fill @p for a persistence template lookup and attach the service's
+ * persistence engine.  When the engine has a fill_param hook its
+ * result is returned, otherwise 0.
+ */
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+			      vport, p);
+	p->pe = rcu_dereference(svc->pe);
+
+	if (!p->pe || !p->pe->fill_param)
+		return 0;
+
+	return p->pe->fill_param(p, skb);
+}
+
+/*
+ * IPVS persistent scheduling function
+ * It creates a connection entry according to its template if exists,
+ * or selects a server and creates a connection entry plus a template.
+ * Locking: we are svc user (svc->refcnt), so we hold all dests too
+ * Protocols supported: TCP, UDP
+ *
+ * Returns the new connection, or NULL with *ignored set to 0 (no
+ * destination found) or -1 (fatal error); see ip_vs_schedule for the
+ * meaning of *ignored.
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+ struct sk_buff *skb, __be16 src_port, __be16 dst_port,
+ int *ignored, struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *ct;
+ __be16 dport = 0; /* destination port to forward */
+ unsigned int flags;
+ struct ip_vs_conn_param param;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ union nf_inet_addr snet; /* source network of the client,
+ after masking */
+
+ /* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ (__force __u32) svc->netmask);
+ else
+#endif
+ snet.ip = iph->saddr.ip & svc->netmask;
+
+ IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+ "mnet %s\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
+ IP_VS_DBG_ADDR(svc->af, &snet));
+
+ /*
+ * As far as we know, FTP is a very complicated network protocol, and
+ * it uses control connection and data connections. For active FTP,
+ * FTP server initialize data connection to the client, its source port
+ * is often 20. For passive FTP, FTP server tells the clients the port
+ * that it passively listens to, and the client issues the data
+ * connection. In the tunneling or direct routing mode, the load
+ * balancer is on the client-to-server half of connection, the port
+ * number is unknown to the load balancer. So, a conn template like
+ * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+ * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+ * is created for other persistent services.
+ */
+ {
+ int protocol = iph->protocol;
+ const union nf_inet_addr *vaddr = &iph->daddr;
+ __be16 vport = 0;
+
+ if (dst_port == svc->port) {
+ /* non-FTP template:
+ * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+ * FTP template:
+ * <protocol, caddr, 0, vaddr, 0, daddr, 0>
+ */
+ if (svc->port != FTPPORT)
+ vport = dst_port;
+ } else {
+ /* Note: persistent fwmark-based services and
+ * persistent port zero service are handled here.
+ * fwmark template:
+ * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template:
+ * <protocol,caddr,0,vaddr,0,daddr,0>
+ */
+ if (svc->fwmark) {
+ protocol = IPPROTO_IP;
+ vaddr = &fwmark;
+ }
+ }
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
+ }
+
+ /* Check if a template already exists */
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct || !ip_vs_check_template(ct)) {
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * No template found or the dest of the connection
+ * template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
+ */
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ kfree(param.pe_data);
+ *ignored = 0;
+ return NULL;
+ }
+
+ if (dst_port == svc->port && svc->port != FTPPORT)
+ dport = dest->port;
+
+ /* Create a template
+ * This adds param.pe_data to the template,
+ * and thus param.pe_data will be destroyed
+ * when the template expires */
+ ct = ip_vs_conn_new(&param, &dest->addr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
+ if (ct == NULL) {
+ kfree(param.pe_data);
+ *ignored = -1;
+ return NULL;
+ }
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ /* the existing template is reused, so pe_data is not needed */
+ kfree(param.pe_data);
+ }
+
+ /* Forward to the dest's own port when it has one */
+ dport = dst_port;
+ if (dport == svc->port && dest->port)
+ dport = dest->port;
+
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
+ /*
+ * Create a new connection according to the template
+ */
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
+ src_port, &iph->daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
+ if (cp == NULL) {
+ /* release the template reference held in ct */
+ ip_vs_conn_put(ct);
+ *ignored = -1;
+ return NULL;
+ }
+
+ /*
+ * Add its control
+ */
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * IPVS main scheduling function
+ * It selects a server according to the virtual service, and
+ * creates a connection entry.
+ * Protocols supported: TCP, UDP
+ *
+ * Returns the new connection entry, or NULL with the reason reported
+ * through *ignored.
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd, int *ignored,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_dest *dest;
+ __be16 _ports[2], *pptr;
+ unsigned int flags;
+
+ *ignored = 1;
+ /*
+ * IPv6 frags, only the first hit here.
+ */
+ /* pptr[0] is used as the source port and pptr[1] as the destination port */
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL)
+ return NULL;
+
+ /*
+ * FTPDATA needs this check when using local real server.
+ * Never schedule Active FTPDATA connections from real server.
+ * For LVS-NAT they must be already created. For other methods
+ * with persistence the connection is created on SYN+ACK.
+ */
+ if (pptr[0] == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling FTPDATA");
+ return NULL;
+ }
+
+ /*
+ * Do not schedule replies from local real server.
+ */
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling reply for existing connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
+
+ /*
+ * Persistent service
+ */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ iph);
+
+ *ignored = 0;
+
+ /*
+ * Non-persistent service
+ */
+ if (!svc->fwmark && pptr[1] != svc->port) {
+ if (!svc->port)
+ pr_err("Schedule: port zero only supported "
+ "in persistent services, "
+ "check your ipvs configuration\n");
+ return NULL;
+ }
+
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "Schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /* One-packet scheduling is only applied to UDP */
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
+ /*
+ * Create a connection entry.
+ */
+ {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0], &iph->daddr,
+ pptr[1], &p);
+ /* keep the packet's destination port when dest has no port */
+ cp = ip_vs_conn_new(&p, &dest->addr,
+ dest->port ? dest->port : pptr[1],
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
+ return NULL;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+ "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * Pass or drop the packet.
+ * Called by ip_vs_in, when the virtual service is available but
+ * no destination is available for a new connection.
+ *
+ * Return value is a netfilter verdict:
+ * - NF_DROP if the port fields cannot be read, if the bypass
+ *   connection cannot be created, or after the ICMP port
+ *   unreachable has been sent;
+ * - the result of cp->packet_xmit() when a cache_bypass
+ *   connection was created (CONFIG_SYSCTL only);
+ * - NF_ACCEPT for non-FTP ports when svc->port is FTPPORT.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
+{
+ __be16 _ports[2], *pptr; /* [0] is paired with iph->saddr, [1] with iph->daddr */
+#ifdef CONFIG_SYSCTL
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ int unicast;
+#endif
+
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL) {
+ return NF_DROP;
+ }
+
+#ifdef CONFIG_SYSCTL
+ net = skb_net(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
+ else
+#endif
+ unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
+
+ /* if it is fwmark-based service, the cache_bypass sysctl is up
+ and the destination is a non-local unicast, then create
+ a cache_bypass connection entry */
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+ int ret;
+ struct ip_vs_conn *cp;
+ unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+ iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+ union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; /* all-zero address passed to ip_vs_conn_new() */
+
+ /* create a new connection entry
+ * (all-zero address, port 0, no dest, IP_VS_CONN_F_BYPASS set)
+ */
+ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, &daddr, 0,
+ IP_VS_CONN_F_BYPASS | flags,
+ NULL, skb->mark);
+ if (!cp)
+ return NF_DROP;
+ }
+
+ /* statistics */
+ ip_vs_in_stats(cp, skb);
+
+ /* set state */
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+
+ /* transmit the first SYN packet */
+ ret = cp->packet_xmit(skb, cp, pd->pp, iph);
+ /* do not touch skb anymore */
+
+ atomic_inc(&cp->in_pkts);
+ ip_vs_conn_put(cp);
+ return ret;
+ }
+#endif
+
+ /*
+ * When the virtual ftp service is presented, packets destined
+ * for other services on the VIP may get here (except services
+ * listed in the ipvs table), pass the packets, because it is
+ * not ipvs job to decide to drop the packets.
+ */
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
+ return NF_ACCEPT;
+
+ /*
+ * Notify the client that the destination is unreachable, and
+ * release the socket buffer.
+ * Since it is in IP layer, the TCP socket is not actually
+ * created, the TCP RST packet cannot be sent, instead that
+ * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+ *
+ * For IPv6 a missing skb->dev is first replaced by the loopback
+ * device of the netns found through the skb's dst.
+ */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6) {
+ if (!skb->dev) {
+ struct net *net_ = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net_->loopback_dev;
+ }
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+ } else
+#endif
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ return NF_DROP;
+}
+
+#ifdef CONFIG_SYSCTL
+
+/* Read ->sysctl_snat_reroute from the ipvs state of the netns owning skb. */
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+	return net_ipvs(skb_net(skb))->sysctl_snat_reroute;
+}
+
+/* Read ->sysctl_nat_icmp_send from the ipvs state of @net. */
+static int sysctl_nat_icmp_send(struct net *net)
+{
+	return net_ipvs(net)->sysctl_nat_icmp_send;
+}
+
+/* Read ->sysctl_expire_nodest_conn from @ipvs. */
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+	int expire = ipvs->sysctl_expire_nodest_conn;
+
+	return expire;
+}
+
+#else
+
+/* Without CONFIG_SYSCTL all three accessors report 0. */
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+	return 0;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+	return 0;
+}
+
+#endif
+
+/* Folded checksum over the skb bytes from @offset up to skb->len. */
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+	int span = skb->len - offset;
+
+	return csum_fold(skb_checksum(skb, offset, span, 0));
+}
+
+/* Map a netfilter hook number to the defrag user id handed to ip_defrag(). */
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+	switch (hooknum) {
+	case NF_INET_LOCAL_IN:
+		return IP_DEFRAG_VS_IN;
+	case NF_INET_FORWARD:
+		return IP_DEFRAG_VS_FWD;
+	default:
+		/* every other hook is accounted as "out" */
+		return IP_DEFRAG_VS_OUT;
+	}
+}
+
+/*
+ * Run ip_defrag() on skb with bottom halves disabled.
+ * Returns ip_defrag()'s result; when that is 0 the IPv4 header
+ * checksum is refreshed with ip_send_check() before returning.
+ */
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	int rc;
+
+	local_bh_disable();
+	rc = ip_defrag(skb, user);
+	local_bh_enable();
+
+	if (rc)
+		return rc;
+
+	ip_send_check(ip_hdr(skb));
+	return 0;
+}
+
+/*
+ * Re-route skb when required.
+ * IPv6: only when the snat_reroute sysctl accessor is non-zero.
+ * IPv4: when that accessor is non-zero or the current route carries
+ * RTCF_LOCAL.
+ * Returns 1 if the re-route attempt failed, 0 otherwise (including
+ * when no re-route was attempted).
+ */
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (!sysctl_snat_reroute(skb))
+			return 0;
+		return ip6_route_me_harder(skb) != 0;
+	}
+#endif
+	if (!sysctl_snat_reroute(skb) &&
+	    !(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+		return 0;
+
+	return ip_route_me_harder(skb, RTN_LOCAL) != 0;
+}
+
+/*
+ * Rewrite an IPv4 ICMP packet in place: the outer IP header, the IP
+ * header embedded after the ICMP header and, for TCP/UDP/SCTP, one of
+ * the embedded port fields; then recompute the checksums.
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ *   inout != 0: outer saddr and embedded daddr become cp->vaddr.ip,
+ *               embedded ports[1] becomes cp->vport
+ *   inout == 0: outer daddr and embedded saddr become cp->daddr.ip,
+ *               embedded ports[0] becomes cp->dport
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int icmp_offset = iph->ihl*4; /* ICMP header starts right after the outer IP header */
+ struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
+ icmp_offset);
+ struct iphdr *ciph = (struct iphdr *)(icmph + 1); /* embedded IP header follows the ICMP header */
+
+ if (inout) {
+ iph->saddr = cp->vaddr.ip;
+ ip_send_check(iph);
+ ciph->daddr = cp->vaddr.ip;
+ ip_send_check(ciph);
+ } else {
+ iph->daddr = cp->daddr.ip;
+ ip_send_check(iph);
+ ciph->saddr = cp->daddr.ip;
+ ip_send_check(ciph);
+ }
+
+ /* the TCP/UDP/SCTP port
+ * (the two 16-bit fields located right after the embedded IP header)
+ */
+ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
+ IPPROTO_SCTP == ciph->protocol) {
+ __be16 *ports = (void *)ciph + ciph->ihl*4;
+
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum
+ * (zeroed first, then computed from icmp_offset to the end of skb)
+ */
+ icmph->checksum = 0;
+ icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMP");
+ else
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMP");
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb); /* outer IPv6 header */
+ unsigned int icmp_offset = 0; /* filled in by ipv6_find_hdr(): offset of the ICMPv6 header */
+ unsigned int offs = 0; /* header offset*/
+ int protocol; /* protocol found in the embedded packet */
+ struct icmp6hdr *icmph;
+ struct ipv6hdr *ciph; /* IPv6 header embedded after the ICMPv6 header */
+ unsigned short fragoffs;
+
+ ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
+ icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
+ offs = icmp_offset + sizeof(struct icmp6hdr);
+ ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
+
+ protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); /* also advances offs */
+
+ if (inout) {
+ iph->saddr = cp->vaddr.in6;
+ ciph->daddr = cp->vaddr.in6;
+ } else {
+ iph->daddr = cp->daddr.in6;
+ ciph->saddr = cp->daddr.in6;
+ }
+
+ /* the TCP/UDP/SCTP port
+ * Only rewritten when fragoffs is zero; inout selects ports[1]
+ * (set to cp->vport) or ports[0] (set to cp->dport).
+ */
+ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)) {
+ __be16 *ports = (void *)(skb_network_header(skb) + offs);
+
+ IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
+ ntohs(inout ? ports[1] : ports[0]),
+ ntohs(inout ? cp->vport : cp->dport));
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum
+ * Only the complemented pseudo-header sum is stored here; the
+ * skb is marked CHECKSUM_PARTIAL with csum_start/csum_offset
+ * pointing at the ICMPv6 checksum field.
+ */
+ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+ skb->len - icmp_offset,
+ IPPROTO_ICMPV6, 0);
+ skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+ skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMPv6");
+ else
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMPv6");
+}
+#endif
+
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host.
+ *
+ * af:       AF_INET or AF_INET6, selects the NAT helper used
+ * snet:     address used only in the failed-checksum debug message
+ * protocol: protocol of the embedded packet
+ * cp:       connection; its reference is always released here with
+ *           __ip_vs_conn_put(), whatever the verdict
+ * offset:   number of bytes that must be writable; enlarged by the
+ *           two 16-bit port fields for TCP/UDP/SCTP
+ * ihl:      offset at which checksum verification starts
+ *
+ * Returns NF_ACCEPT when the packet was rewritten and re-routed,
+ * NF_DROP on checksum, writability or routing failure.
+ */
+static int handle_response_icmp(int af, struct sk_buff *skb,
+ union nf_inet_addr *snet,
+ __u8 protocol, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp,
+ unsigned int offset, unsigned int ihl)
+{
+ unsigned int verdict = NF_DROP;
+
+ if (IP_VS_FWD_METHOD(cp) != 0) { /* only logged; processing continues regardless */
+ pr_err("shouldn't reach here, because the box is on the "
+ "half connection in the tun/dr module.\n");
+ }
+
+ /* Ensure the checksum is correct */
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+ IP_VS_DBG_ADDR(af, snet));
+ goto out;
+ }
+
+ if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)
+ offset += 2 * sizeof(__u16); /* the port fields get rewritten too */
+ if (!skb_make_writable(skb, offset))
+ goto out;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+ else
+#endif
+ ip_vs_nat_icmp(skb, pp, cp, 1);
+
+ if (ip_vs_route_me_harder(af, skb))
+ goto out;
+
+ /* do the statistics and put it back */
+ ip_vs_out_stats(cp, skb);
+
+ skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ verdict = NF_ACCEPT;
+
+out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+/*
+ * Handle ICMP messages in the inside-to-outside direction (outgoing).
+ * Find any that might be relevant, check against existing connections.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ *
+ * *related is set to 1 on entry and reset to 0 only when the ICMP
+ * type is not one of the handled error types (NF_ACCEPT is returned
+ * in that case).
+ * Returns NF_STOLEN if fragment reassembly reports an error, NF_DROP
+ * if the ICMP header cannot be read, NF_ACCEPT when the embedded
+ * packet cannot be matched to a protocol or connection, otherwise
+ * the verdict of handle_response_icmp().
+ */
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
+{
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_iphdr ciph;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ unsigned int offset, ihl;
+ union nf_inet_addr snet;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+ }
+
+ iph = ip_hdr(skb); /* fetched only after the possible reassembly above */
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
+ ic->type, ntohs(icmp_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ pp = ip_vs_proto_get(cih->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking outgoing ICMP for");
+
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset; /* make ciph.len relative to the start of the outer packet */
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ snet.ip = iph->saddr;
+ return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+ pp, ciph.len, ihl);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
+{
+ struct icmp6hdr _icmph, *ic;
+ struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ union nf_inet_addr snet;
+ unsigned int writable;
+
+ *related = 1; /* cleared again only for informational ICMPv6 messages */
+ ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
+ if (ic == NULL)
+ return NF_DROP;
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ *
+ * Messages with ICMPV6_INFOMSG_MASK set in the type are not
+ * handled here: *related is cleared and the packet accepted.
+ */
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (ipvsh->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &ipvsh->saddr, &ipvsh->daddr);
+
+ /* Now find the contained IP header */
+ ciph.len = ipvsh->len + sizeof(_icmph);
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ pp = ip_vs_proto_get(ciph.protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ snet.in6 = ciph.saddr.in6; /* source address of the embedded header */
+ writable = ciph.len;
+ return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
+ pp, writable, sizeof(struct ipv6hdr));
+}
+#endif
+
+/*
+ * Check whether the SCTP chunk header located right after the SCTP
+ * common header (which starts at nh_len) is an ABORT chunk.
+ * Returns 1 for ABORT, 0 otherwise or if the header cannot be read.
+ */
+static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
+{
+	sctp_chunkhdr_t chunk_buf, *chunk;
+
+	chunk = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
+				   sizeof(chunk_buf), &chunk_buf);
+
+	return chunk != NULL && chunk->type == SCTP_CID_ABORT;
+}
+
+/*
+ * Report the RST bit of the TCP header found at offset nh_len;
+ * 0 if the header cannot be read.
+ */
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+	struct tcphdr hdr_buf, *tcp;
+
+	tcp = skb_header_pointer(skb, nh_len, sizeof(hdr_buf), &hdr_buf);
+
+	return tcp ? tcp->rst : 0;
+}
+
+/*
+ * Tell whether the packet opens a new connection: a TCP segment with
+ * SYN set, or an SCTP packet whose chunk header right after the common
+ * header is INIT. Any other protocol, or an unreadable header, yields
+ * false.
+ */
+static inline bool is_new_conn(const struct sk_buff *skb,
+			       struct ip_vs_iphdr *iph)
+{
+	if (iph->protocol == IPPROTO_TCP) {
+		struct tcphdr hdr_buf, *tcp;
+
+		tcp = skb_header_pointer(skb, iph->len, sizeof(hdr_buf),
+					 &hdr_buf);
+		if (!tcp)
+			return false;
+		return tcp->syn;
+	}
+
+	if (iph->protocol == IPPROTO_SCTP) {
+		sctp_chunkhdr_t chunk_buf, *chunk;
+
+		chunk = skb_header_pointer(skb,
+					   iph->len + sizeof(sctp_sctphdr_t),
+					   sizeof(chunk_buf), &chunk_buf);
+		if (!chunk)
+			return false;
+		return chunk->type == SCTP_CID_INIT;
+	}
+
+	return false;
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ *
+ * The source address is replaced by cp->vaddr (after the protocol's
+ * snat_handler, if any, has run) and the packet is re-routed.
+ * The reference on cp is released with ip_vs_conn_put() on every path.
+ * Returns NF_ACCEPT on success. On failure the skb is freed here and
+ * NF_STOLEN is returned, so the caller must not touch it again.
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+
+ if (!skb_make_writable(skb, iph->len))
+ goto drop;
+
+ /* mangle the packet */
+ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
+ goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+ else
+#endif
+ {
+ ip_hdr(skb)->saddr = cp->vaddr.ip;
+ ip_send_check(ip_hdr(skb)); /* IPv4 header checksum must follow the address change */
+ }
+
+ /*
+ * nf_iterate does not expect change in the skb->dst->dev.
+ * It looks like it is not fatal to enable this code for hooks
+ * where our handlers are at the end of the chain list and
+ * when all next handlers use skb->dst->dev and not outdev.
+ * It will definitely route properly the inout NAT traffic
+ * when multiple paths are used.
+ */
+
+ /* For policy routing, packets originating from this
+ * machine itself may be routed differently to packets
+ * passing through. We want this packet to be routed as
+ * if it came from this machine itself. So re-compute
+ * the routing information.
+ */
+ if (ip_vs_route_me_harder(af, skb))
+ goto drop;
+
+ IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+
+ ip_vs_out_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
+ skb->ipvs_property = 1; /* checked at the top of ip_vs_out()/ip_vs_in() to skip marked packets */
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ ip_vs_conn_put(cp);
+
+ LeaveFunction(11);
+ return NF_ACCEPT;
+
+drop:
+ ip_vs_conn_put(cp);
+ kfree_skb(skb);
+ LeaveFunction(11);
+ return NF_STOLEN;
+}
+
+/*
+ * Check if outgoing packet belongs to the established ip_vs_conn.
+ *
+ * hooknum: netfilter hook the packet is traversing
+ * af:      AF_INET or AF_INET6
+ *
+ * Related ICMP/ICMPv6 errors are handed to ip_vs_out_icmp{,_v6}().
+ * A packet that matches a connection is rewritten by
+ * handle_response(). With no match, and when the nat_icmp_send
+ * sysctl accessor is non-zero, a TCP/UDP/SCTP packet whose source
+ * is a known real service is answered with a port unreachable and
+ * dropped (NF_DROP), unless it is a TCP RST or an SCTP ABORT.
+ * Everything else is accepted unchanged (NF_ACCEPT).
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+ struct net *net = NULL;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ struct ip_vs_conn *cp;
+
+ EnterFunction(11);
+
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+ if (unlikely(!skb_dst(skb)))
+ return NF_ACCEPT;
+
+ net = skb_net(skb);
+ if (!net_ipvs(net)->enable)
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+ int related;
+ int verdict = ip_vs_out_icmp_v6(skb, &related,
+ hooknum, &iph);
+
+ if (related)
+ return verdict;
+ }
+ } else
+#endif
+ if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+ int related;
+ int verdict = ip_vs_out_icmp(skb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ }
+
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* reassemble IP fragments
+ * (IPv4 only; the header info is refilled afterwards because
+ * it was taken from the packet before reassembly)
+ */
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET)
+#endif
+ if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
+ if (ip_vs_gather_frags(skb,
+ ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+
+ ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
+ }
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(af, skb, &iph, 0);
+
+ if (likely(cp))
+ return handle_response(af, skb, pd, cp, &iph);
+ if (sysctl_nat_icmp_send(net) &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP ||
+ pp->protocol == IPPROTO_SCTP)) {
+ __be16 _ports[2], *pptr;
+
+ pptr = frag_safe_skb_hp(skb, iph.len,
+ sizeof(_ports), _ports, &iph);
+ if (pptr == NULL)
+ return NF_ACCEPT; /* Not for me */
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
+ /*
+ * Notify the real server: there is no
+ * existing entry if it is not RST
+ * packet or not TCP packet.
+ */
+ if ((iph.protocol != IPPROTO_TCP &&
+ iph.protocol != IPPROTO_SCTP)
+ || ((iph.protocol == IPPROTO_TCP
+ && !is_tcp_reset(skb, iph.len))
+ || (iph.protocol == IPPROTO_SCTP
+ && !is_sctp_abort(skb,
+ iph.len)))) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ icmpv6_send(skb,
+ ICMPV6_DEST_UNREACH,
+ ICMPV6_PORT_UNREACH,
+ 0);
+ } else
+#endif
+ icmp_send(skb,
+ ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
+ }
+ }
+ }
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_out: packet continues traversal as normal");
+ return NF_ACCEPT;
+}
+
+/*
+ * IPv4 hook for the NF_INET_FORWARD and NF_INET_LOCAL_IN chains,
+ * used only for VS/NAT.
+ * Lets ip_vs_out() decide whether the packet is a reply for an
+ * established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	unsigned int hook = ops->hooknum;
+
+	return ip_vs_out(hook, skb, AF_INET);
+}
+
+/*
+ * IPv4 hook for the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Lets ip_vs_out() decide whether the packet is a reply for an
+ * established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int hook = ops->hooknum;
+
+	return ip_vs_out(hook, skb, AF_INET);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * IPv6 hook for the NF_INET_FORWARD and NF_INET_LOCAL_IN chains,
+ * used only for VS/NAT.
+ * Lets ip_vs_out() decide whether the packet is a reply for an
+ * established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	unsigned int hook = ops->hooknum;
+
+	return ip_vs_out(hook, skb, AF_INET6);
+}
+
+/*
+ * IPv6 hook for the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Lets ip_vs_out() decide whether the packet is a reply for an
+ * established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int hook = ops->hooknum;
+
+	return ip_vs_out(hook, skb, AF_INET6);
+}
+
+#endif
+
+/*
+ * Handle ICMP messages in the outside-to-inside direction (incoming).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ *
+ * *related is set to 1 on entry and reset to 0 only when the ICMP
+ * type is not one of the handled error types.
+ * Errors that embed an IPIP packet are not forwarded: the outer
+ * headers are stripped, an ICMP error is generated for the original
+ * request, the skb is consumed here and NF_STOLEN is returned.
+ * All other matched errors are passed to ip_vs_icmp_xmit().
+ * The reference on a matched connection is always released with
+ * __ip_vs_conn_put() before returning.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+ struct net *net = NULL;
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_iphdr ciph;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ unsigned int offset, offset2, ihl, verdict;
+ bool ipip;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+ }
+
+ iph = ip_hdr(skb); /* fetched only after the possible reassembly above */
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
+ ic->type, ntohs(icmp_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ net = skb_net(skb);
+
+ /* Special case for errors for IPIP packets
+ * (cih is advanced to the IP header carried inside the IPIP packet)
+ */
+ ipip = false;
+ if (cih->protocol == IPPROTO_IPIP) {
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ /* Error for our IPIP must arrive at LOCAL_IN */
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+ return NF_ACCEPT;
+ offset += cih->ihl * 4;
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ipip = true;
+ }
+
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking incoming ICMP for");
+
+ offset2 = offset; /* offset of the embedded IP header; used by the IPIP pulls below */
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ offset = ciph.len;
+ /* The embedded headers contain source and dest in reverse order.
+ * For IPIP this is error for request, not for reply.
+ */
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ verdict = NF_DROP;
+
+ /* Ensure the checksum is correct */
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
+ &iph->saddr);
+ goto out;
+ }
+
+ if (ipip) {
+ /* copied out of the ICMP header before any pskb_pull() below */
+ __be32 info = ic->un.gateway;
+ __u8 type = ic->type;
+ __u8 code = ic->code;
+
+ /* Update the MTU */
+ if (ic->type == ICMP_DEST_UNREACH &&
+ ic->code == ICMP_FRAG_NEEDED) {
+ struct ip_vs_dest *dest = cp->dest;
+ u32 mtu = ntohs(ic->un.frag.mtu);
+ __be16 frag_off = cih->frag_off;
+
+ /* Strip outer IP and ICMP, go to IPIP header */
+ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
+ goto ignore_ipip;
+ offset2 -= ihl + sizeof(_icmph); /* keep offset2 relative to the new data start */
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ ipv4_update_pmtu(skb, dev_net(skb->dev),
+ mtu, 0, 0, 0, 0);
+ /* Client uses PMTUD? */
+ if (!(frag_off & htons(IP_DF)))
+ goto ignore_ipip;
+ /* Prefer the resulting PMTU */
+ if (dest) {
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
+ }
+ if (mtu > 68 + sizeof(struct iphdr))
+ mtu -= sizeof(struct iphdr); /* subtract one IP header, presumably the IPIP encapsulation overhead */
+ info = htonl(mtu);
+ }
+ /* Strip outer IP, ICMP and IPIP, go to IP header of
+ * original request.
+ */
+ if (pskb_pull(skb, offset2) == NULL)
+ goto ignore_ipip;
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ type, code, ntohl(info));
+ icmp_send(skb, type, code, info);
+ /* ICMP can be shorter but anyways, account it */
+ ip_vs_out_stats(cp, skb);
+
+ignore_ipip:
+ consume_skb(skb); /* the skb is released here, hence NF_STOLEN */
+ verdict = NF_STOLEN;
+ goto out;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+ IPPROTO_SCTP == cih->protocol)
+ offset += 2 * sizeof(__u16); /* include the two port fields */
+ verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
+
+out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *iph)
+{
+ struct net *net = NULL;
+ struct ipv6hdr _ip6h, *ip6h; /* IPv6 header contained within the ICMPv6 message */
+ struct icmp6hdr _icmph, *ic;
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ unsigned int offs_ciph, writable, verdict;
+
+ *related = 1; /* cleared again only for informational ICMPv6 messages */
+
+ ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ *
+ * Messages with ICMPV6_INFOMSG_MASK set in the type are not
+ * handled here: *related is cleared and the packet accepted.
+ */
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (iph->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /* Now find the contained IP header */
+ ciph.len = iph->len + sizeof(_icmph);
+ offs_ciph = ciph.len; /* Save ip header offset */
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, ciph.protocol);
+ if (!pd)
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* Cannot handle fragmented embedded protocol */
+ if (ciph.fragoffs)
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
+ "Checking incoming ICMPv6 for");
+
+ /* The embedded headers contain source and dest in reverse order
+ * if not from localhost
+ */
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph,
+ (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+
+ if (!cp)
+ return NF_ACCEPT;
+ /* VS/TUN, VS/DR and LOCALNODE just let it go */
+ if ((hooknum == NF_INET_LOCAL_OUT) &&
+ (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+ __ip_vs_conn_put(cp);
+ return NF_ACCEPT;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+
+ /* Need to mangle contained IPv6 header in ICMPv6 packet */
+ writable = ciph.len;
+ if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
+ IPPROTO_SCTP == ciph.protocol)
+ writable += 2 * sizeof(__u16); /* Also mangle ports */
+
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
+
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+#endif
+
+
+/*
+ * Check if it's for virtual services, look it up,
+ * and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+ struct net *net;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ struct ip_vs_conn *cp;
+ int ret, pkts;
+ struct netns_ipvs *ipvs;
+
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
+
+ /*
+ * Big tappo:
+ * - remote client: only PACKET_HOST
+ * - route: used for struct net when skb->dev is unset
+ */
+ if (unlikely((skb->pkt_type != PACKET_HOST &&
+ hooknum != NF_INET_LOCAL_OUT) ||
+ !skb_dst(skb))) {
+ ip_vs_fill_iph_skb(af, skb, &iph);
+ IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+ " ignored in hook %u\n",
+ skb->pkt_type, iph.protocol,
+ IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
+ return NF_ACCEPT;
+ }
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+ int related;
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
+ &iph);
+
+ if (related)
+ return verdict;
+ }
+ } else
+#endif
+ if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+ int related;
+ int verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ }
+
+ /* Protocol supported? */
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
+ return NF_ACCEPT;
+ pp = pd->pp;
+ /*
+ * Check if the packet belongs to an existing connection entry
+ */
+ cp = pp->conn_in_get(af, skb, &iph, 0);
+
+ if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
+ is_new_conn(skb, &iph)) {
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ }
+
+ if (unlikely(!cp) && !iph.fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
+ int v;
+
+ /* Schedule and create new connection entry into &cp */
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+ return v;
+ }
+
+ if (unlikely(!cp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_in: packet continues traversal as normal");
+ if (iph.fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+ }
+ return NF_ACCEPT;
+ }
+
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
+ /* Check the server status */
+ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ /* the destination server is not available */
+
+ if (sysctl_expire_nodest_conn(ipvs)) {
+ /* try to expire the connection immediately */
+ ip_vs_conn_expire_now(cp);
+ }
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
+ return NF_DROP;
+ }
+
+ ip_vs_in_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+ if (cp->packet_xmit)
+ ret = cp->packet_xmit(skb, cp, pp, &iph);
+ /* do not touch skb anymore */
+ else {
+ IP_VS_DBG_RL("warning: packet_xmit is null");
+ ret = NF_ACCEPT;
+ }
+
+ /* Increase its packet counter and check if it is needed
+ * to be synchronized
+ *
+ * Sync connection if it is about to close to
+ * encourage the standby servers to update the connections timeout
+ *
+ * For ONE_PKT let ip_vs_sync_conn() do the filter work.
+ */
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = sysctl_sync_threshold(ipvs);
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, pkts);
+
+ ip_vs_conn_put(cp);
+ return ret;
+}
+
+/*
+ *	IPv4 hook function for the NF_INET_LOCAL_IN chain.
+ *	Passes packets from remote clients on to ip_vs_in() for
+ *	scheduling and forwarding; the in/out/okfn arguments are unused.
+ */
+static unsigned int
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict = ip_vs_in(ops->hooknum, skb, AF_INET);
+
+	return verdict;
+}
+
+/*
+ *	IPv4 hook function for the NF_INET_LOCAL_OUT chain.
+ *	Passes packets from local clients on to ip_vs_in() for
+ *	scheduling and forwarding; the in/out/okfn arguments are unused.
+ */
+static unsigned int
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict = ip_vs_in(ops->hooknum, skb, AF_INET);
+
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	IPv6 hook function for the NF_INET_LOCAL_IN chain.
+ *	Passes packets from remote clients on to ip_vs_in() for
+ *	scheduling and forwarding; the in/out/okfn arguments are unused.
+ */
+static unsigned int
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict = ip_vs_in(ops->hooknum, skb, AF_INET6);
+
+	return verdict;
+}
+
+/*
+ *	IPv6 hook function for the NF_INET_LOCAL_OUT chain.
+ *	Passes packets from local clients on to ip_vs_in() for
+ *	scheduling and forwarding; the in/out/okfn arguments are unused.
+ */
+static unsigned int
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict = ip_vs_in(ops->hooknum, skb, AF_INET6);
+
+	return verdict;
+}
+
+#endif
+
+
+/*
+ *	Hooked at the NF_INET_FORWARD chain to catch ICMP related packets
+ *	destined for 0.0.0.0/0.
+ *	With a fwmark-based virtual service (e.g. a transparent cache
+ *	cluster) TCP packets can be marked and routed to ip_vs_in, but
+ *	ICMP destined for 0.0.0.0/0 cannot easily be marked and sent to
+ *	ip_vs_in_icmp.  So they are caught here and handed to
+ *	ip_vs_in_icmp directly.
+ *
+ *	Non-ICMP packets, and all packets while IPVS is disabled or in
+ *	backup-only mode for this netns, are accepted untouched.
+ */
+static unsigned int
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	struct netns_ipvs *ipvs;
+	struct net *net;
+	int related;	/* filled in by ip_vs_in_icmp(), not used here */
+
+	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+		return NF_ACCEPT;
+
+	/* Is ipvs active in this netns? */
+	net = skb_net(skb);
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp(skb, &related, ops->hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *	IPv6 counterpart of ip_vs_forward_icmp(): on the forward chain,
+ *	hand ICMPv6 packets to ip_vs_in_icmp_v6().  Anything else, and
+ *	everything while IPVS is disabled or backup-only in this netns,
+ *	is accepted untouched.
+ */
+static unsigned int
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+		      const struct net_device *in, const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	struct ip_vs_iphdr iphdr;
+	struct netns_ipvs *ipvs;
+	struct net *net;
+	int related;	/* filled in by ip_vs_in_icmp_v6(), not used here */
+
+	ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+	if (iphdr.protocol != IPPROTO_ICMPV6)
+		return NF_ACCEPT;
+
+	/* Is ipvs active in this netns? */
+	net = skb_net(skb);
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp_v6(skb, &related, ops->hooknum, &iphdr);
+}
+#endif
+
+
+/*
+ *	Netfilter hook table for IPVS, registered as a whole by
+ *	ip_vs_init() through nf_register_hooks().
+ *
+ *	Each address family gets the same six hooks: reply and request
+ *	handlers on LOCAL_IN, reply and request handlers on LOCAL_OUT,
+ *	and an ICMP catcher plus reply handler on FORWARD.  Every entry's
+ *	.pf must match the family of its handler: an IPv6 handler
+ *	registered under NFPROTO_IPV4 would be run on IPv4 packets and
+ *	never on IPv6 ones.
+ */
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC - 2,
+	},
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_remote_request4,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC - 1,
+	},
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST + 1,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request4,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST + 2,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
+#ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC - 2,
+	},
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_remote_request6,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC - 1,
+	},
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply6,
+		.owner		= THIS_MODULE,
+		/* IPv6 handler: must sit on the IPv6 LOCAL_OUT chain */
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST + 1,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request6,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST + 2,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp_v6,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
+#endif
+};
+/*
+ * Initialize IP Virtual Server netns mem.
+ *
+ * Per-netns init callback (set as ipvs_core_ops.init).  Looks up the
+ * netns_ipvs area for this netns with net_generic(), marks it disabled,
+ * publishes it in net->ipvs and then initialises the estimator, control,
+ * protocol, app, conn and sync parts, in that order.
+ *
+ * Returns 0 on success.  If a sub-init fails, the parts already set up
+ * are cleaned up in reverse order, net->ipvs is reset to NULL and
+ * -ENOMEM is returned whatever the failing step's own error code was.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL)
+ return -ENOMEM;
+
+ /* Hold the beast until a service is registered */
+ ipvs->enable = 0;
+ ipvs->net = net;
+ /* Counters used for creating unique names */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+
+ if (ip_vs_estimator_net_init(net) < 0)
+ goto estimator_fail;
+
+ if (ip_vs_control_net_init(net) < 0)
+ goto control_fail;
+
+ if (ip_vs_protocol_net_init(net) < 0)
+ goto protocol_fail;
+
+ if (ip_vs_app_net_init(net) < 0)
+ goto app_fail;
+
+ if (ip_vs_conn_net_init(net) < 0)
+ goto conn_fail;
+
+ if (ip_vs_sync_net_init(net) < 0)
+ goto sync_fail;
+
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+/*
+ * Error handling
+ *
+ * Each label is named after the step that failed and undoes the steps
+ * that succeeded before it, falling through to the earlier labels.
+ */
+
+sync_fail:
+ ip_vs_conn_net_cleanup(net);
+conn_fail:
+ ip_vs_app_net_cleanup(net);
+app_fail:
+ ip_vs_protocol_net_cleanup(net);
+protocol_fail:
+ ip_vs_control_net_cleanup(net);
+control_fail:
+ ip_vs_estimator_net_cleanup(net);
+estimator_fail:
+ net->ipvs = NULL;
+ return -ENOMEM;
+}
+
+/*
+ * Per-netns exit callback (set as ipvs_core_ops.exit).
+ * Flushes the services first, then tears down the conn, app, protocol,
+ * control and estimator parts -- the reverse of the order in which
+ * __ip_vs_init() set them up -- and finally clears net->ipvs.
+ * The sync part is not handled here; __ip_vs_dev_cleanup() does that.
+ */
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+ ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(net);
+ ip_vs_app_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(net);
+ ip_vs_control_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(net);
+ /* net_ipvs(net) is still read here, so net->ipvs is cleared last */
+ IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ net->ipvs = NULL;
+}
+
+/*
+ * Per-netns exit callback of the pernet *device* ops
+ * (ipvs_core_dev_ops.exit).  Clears the enable flag that the packet
+ * hooks test before doing any work, then shuts down the sync part.
+ */
+static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ net_ipvs(net)->enable = 0; /* Disable packet reception */
+ /* write barrier between clearing the flag and the sync cleanup */
+ smp_wmb();
+ ip_vs_sync_net_cleanup(net);
+ LeaveFunction(2);
+}
+
+/*
+ * Pernet subsystem ops: per-netns setup/teardown of the IPVS core.
+ * ip_vs_init() notes that registering these is what allocates the
+ * per-netns ip_vs struct (.id / .size describe it).
+ */
+static struct pernet_operations ipvs_core_ops = {
+	.id	= &ip_vs_net_id,
+	.size	= sizeof(struct netns_ipvs),
+	.init	= __ip_vs_init,
+	.exit	= __ip_vs_cleanup,
+};
+
+/* Pernet device ops: exit-only, disables reception and stops sync */
+static struct pernet_operations ipvs_core_dev_ops = {
+	.exit	= __ip_vs_dev_cleanup,
+};
+
+/*
+ * Initialize IP Virtual Server
+ *
+ * Module init: sets up control, protocols and the connection table,
+ * registers the pernet subsystem and device ops, the netfilter hooks
+ * and finally the netlink/ioctl interface.
+ *
+ * Returns the (non-negative) result of the last step on success, or the
+ * negative error of the first failing step after undoing the steps that
+ * had already succeeded.
+ */
+static int __init ip_vs_init(void)
+{
+ int ret;
+
+ ret = ip_vs_control_init();
+ if (ret < 0) {
+ pr_err("can't setup control.\n");
+ goto exit;
+ }
+
+ /* return value is not checked here */
+ ip_vs_protocol_init();
+
+ ret = ip_vs_conn_init();
+ if (ret < 0) {
+ pr_err("can't setup connection table.\n");
+ goto cleanup_protocol;
+ }
+
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ goto cleanup_conn;
+
+ ret = register_pernet_device(&ipvs_core_dev_ops);
+ if (ret < 0)
+ goto cleanup_sub;
+
+ ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ if (ret < 0) {
+ pr_err("can't register hooks.\n");
+ goto cleanup_dev;
+ }
+
+ ret = ip_vs_register_nl_ioctl();
+ if (ret < 0) {
+ pr_err("can't register netlink/ioctl.\n");
+ goto cleanup_hooks;
+ }
+
+ pr_info("ipvs loaded.\n");
+
+ return ret;
+
+ /* Unwind in reverse order of setup; labels fall through */
+cleanup_hooks:
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+cleanup_dev:
+ unregister_pernet_device(&ipvs_core_dev_ops);
+cleanup_sub:
+ unregister_pernet_subsys(&ipvs_core_ops);
+cleanup_conn:
+ ip_vs_conn_cleanup();
+cleanup_protocol:
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+exit:
+ return ret;
+}
+
+/*
+ * Module exit: undo everything ip_vs_init() set up, in reverse order
+ * (netlink/ioctl, hooks, pernet device ops, pernet subsys ops,
+ * connection table, protocols, control).
+ */
+static void __exit ip_vs_cleanup(void)
+{
+ ip_vs_unregister_nl_ioctl();
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ unregister_pernet_device(&ipvs_core_dev_ops);
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
+ ip_vs_conn_cleanup();
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+ pr_info("ipvs unloaded.\n");
+}
+
+/* Module entry point, exit point and license tag */
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
new file mode 100644
index 00000000000..581a6584ed0
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -0,0 +1,3909 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
+#include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* mutex for IPVS sockopts; a sleeping lock, since [gs]etsockopt may sleep. */
+static DEFINE_MUTEX(__ip_vs_mutex);
+
+/* sysctl variables */
+
+#ifdef CONFIG_IP_VS_DEBUG
+/* Debug verbosity; a static, so it starts out as zero */
+static int sysctl_ip_vs_debug_level;
+
+/* Accessor for the current debug verbosity */
+int ip_vs_get_debug_level(void)
+{
+	return sysctl_ip_vs_debug_level;
+}
+#endif
+
+
+/* Protos */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
+
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * Decide whether an IPv6 address is local by routing to it: the address
+ * counts as local when the route lookup reports no error and its output
+ * device is a loopback device.
+ * Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way?
+ */
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+				     const struct in6_addr *addr)
+{
+	struct flowi6 fl6 = { .daddr = *addr };
+	struct dst_entry *dst;
+	bool local = false;
+
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (!dst->error && dst->dev)
+		local = !!(dst->dev->flags & IFF_LOOPBACK);
+
+	/* drop the reference taken by the lookup */
+	dst_release(dst);
+	return local;
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+/*
+ * update_defense_level is called from keventd and from sysctl,
+ * so it needs to protect itself from softirqs
+ *
+ * Re-evaluates the three defense strategies (drop_entry, drop_packet,
+ * secure_tcp) of @ipvs against the current memory situation.  "nomem"
+ * means free + buffer RAM is below sysctl_amemthresh.  For each sysctl:
+ *   0 - strategy off
+ *   1 - automatic, currently inactive; becomes 2 when memory is short
+ *   2 - automatic, currently active; falls back to 1 when memory is ok
+ *   3 - strategy always on
+ *
+ * NOTE(review): old_secure_tcp is a function-level static, so a single
+ * value is shared by every netns_ipvs passed in -- confirm this is what
+ * is wanted when several network namespaces use IPVS.
+ */
+static void update_defense_level(struct netns_ipvs *ipvs)
+{
+ struct sysinfo i;
+ static int old_secure_tcp = 0;
+ int availmem;
+ int nomem;
+ int to_change = -1;
+
+ /* we only count free and buffered memory (in pages) */
+ si_meminfo(&i);
+ availmem = i.freeram + i.bufferram;
+ /* however in linux 2.5 the i.bufferram is total page cache size,
+ we need adjust it */
+ /* si_swapinfo(&i); */
+ /* availmem = availmem - (i.totalswap - i.freeswap); */
+
+ nomem = (availmem < ipvs->sysctl_amemthresh);
+
+ local_bh_disable();
+
+ /* drop_entry */
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
+ case 0:
+ atomic_set(&ipvs->dropentry, 0);
+ break;
+ case 1:
+ if (nomem) {
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
+ } else {
+ atomic_set(&ipvs->dropentry, 0);
+ }
+ break;
+ case 2:
+ if (nomem) {
+ atomic_set(&ipvs->dropentry, 1);
+ } else {
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
+ /* NOTE(review): the ';' after the brace below is a stray empty statement */
+ };
+ break;
+ case 3:
+ atomic_set(&ipvs->dropentry, 1);
+ break;
+ }
+ spin_unlock(&ipvs->dropentry_lock);
+
+ /* drop_packet */
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
+ case 0:
+ ipvs->drop_rate = 0;
+ break;
+ case 1:
+ if (nomem) {
+ /* divisor is > 0 here: nomem means availmem < amemthresh */
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
+ } else {
+ ipvs->drop_rate = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ } else {
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
+ }
+ break;
+ case 3:
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
+ break;
+ }
+ spin_unlock(&ipvs->droppacket_lock);
+
+ /* secure_tcp */
+ /* to_change: -1 = leave timeouts alone, 0/1 = a switch across the
+ * "< 2" / ">= 2" boundary was detected since the previous call */
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
+ case 0:
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ break;
+ case 1:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ ipvs->sysctl_secure_tcp = 2;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ ipvs->sysctl_secure_tcp = 1;
+ }
+ break;
+ case 3:
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ break;
+ }
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
+ if (to_change >= 0)
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
+
+ local_bh_enable();
+}
+
+
+/*
+ *	Timer for checking the defense
+ *
+ *	The period is parenthesized so the macro expands as one value
+ *	whatever expression it is used in.
+ */
+#define DEFENSE_TIMER_PERIOD	(1 * HZ)
+
+/*
+ *	Delayed-work handler: re-evaluate the defense level of the
+ *	netns_ipvs that embeds @work, drop random connection entries if
+ *	the drop_entry strategy is active, then re-arm itself to run
+ *	again after DEFENSE_TIMER_PERIOD.
+ */
+static void defense_work_handler(struct work_struct *work)
+{
+	struct netns_ipvs *ipvs =
+		container_of(work, struct netns_ipvs, defense_work.work);
+
+	update_defense_level(ipvs);
+	if (atomic_read(&ipvs->dropentry))
+		ip_vs_random_dropentry(ipvs->net);
+	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+}
+#endif
+
+/* Take a reference on this module; returns try_module_get()'s result */
+int ip_vs_use_count_inc(void)
+{
+	int got = try_module_get(THIS_MODULE);
+
+	return got;
+}
+
+/* Drop a module reference taken with ip_vs_use_count_inc() */
+void ip_vs_use_count_dec(void)
+{
+	module_put(THIS_MODULE);
+}
+
+
+/*
+ * Hash table: for virtual service lookups
+ *
+ * Both tables have 1 << 8 = 256 buckets and are file-global, not
+ * per-netns: the hash functions below mix the struct net pointer into
+ * the key and the lookups compare svc->net with net_eq().
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+
+/*
+ *	Bucket index in ip_vs_svc_table for a virtual service.
+ *	Combines the protocol, the address (an IPv6 address is first
+ *	folded to 32 bits by XOR-ing its four words), the port and the
+ *	netns pointer, masked to the table size.
+ */
+static inline unsigned int
+ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
+		  const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int hport = ntohs(port);
+	__be32 folded;
+	__u32 key;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	/* mix in the netns pointer so different netns spread differently */
+	key = ntohl(folded) ^ ((size_t)net >> 8);
+	key ^= proto ^ (hport >> IP_VS_SVC_TAB_BITS) ^ hport;
+
+	return key & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Bucket index in ip_vs_svc_fwm_table for a fwmark-based virtual
+ *	service: the netns pointer XOR-ed with the mark, masked to the
+ *	table size.
+ */
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+{
+	size_t ns_bits = (size_t)net >> 8;
+
+	return (ns_bits ^ fwmark) & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Insert a service into ip_vs_svc_table (keyed by
+ *	<netns,proto,addr,port>) or, when it has a fwmark, into
+ *	ip_vs_svc_fwm_table (keyed by <netns,fwmark>).
+ *	Should be called with locked tables.
+ *
+ *	Returns 1 when the service was hashed, 0 (after logging an error)
+ *	when it was already flagged as hashed.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+	struct hlist_head *bucket;
+
+	if (svc->flags & IP_VS_SVC_F_HASHED) {
+		pr_err("%s(): request for already hashed, called from %pF\n",
+		       __func__, __builtin_return_address(0));
+		return 0;
+	}
+
+	if (svc->fwmark) {
+		/* fwmark service: lives in the fwmark table */
+		bucket = &ip_vs_svc_fwm_table[
+				ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark)];
+		hlist_add_head_rcu(&svc->f_list, bucket);
+	} else {
+		/* fully addressed service */
+		bucket = &ip_vs_svc_table[
+				ip_vs_svc_hashkey(svc->net, svc->af,
+						  svc->protocol, &svc->addr,
+						  svc->port)];
+		hlist_add_head_rcu(&svc->s_list, bucket);
+	}
+
+	svc->flags |= IP_VS_SVC_F_HASHED;
+	/* the table now holds a reference on the service */
+	atomic_inc(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Remove a service from whichever table ip_vs_svc_hash() put it in
+ *	and drop the table's reference.
+ *	Should be called with locked tables.
+ *
+ *	Returns 1 when the service was unhashed, 0 (after logging an
+ *	error) when it was not flagged as hashed.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+		pr_err("%s(): request for unhash flagged, called from %pF\n",
+		       __func__, __builtin_return_address(0));
+		return 0;
+	}
+
+	/* fwmark services hang off f_list, all others off s_list */
+	hlist_del_rcu(svc->fwmark ? &svc->f_list : &svc->s_list);
+
+	svc->flags &= ~IP_VS_SVC_F_HASHED;
+	atomic_dec(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Look up a service by exact {netns, af, proto, addr, port} in
+ *	ip_vs_svc_table.  Returns the matching service or NULL; no
+ *	reference is taken here.
+ */
+static inline struct ip_vs_service *
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+		     const union nf_inet_addr *vaddr, __be16 vport)
+{
+	struct ip_vs_service *svc;
+	unsigned int bucket;
+
+	bucket = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
+
+	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[bucket], s_list) {
+		if (svc->af != af)
+			continue;
+		if (!ip_vs_addr_equal(af, &svc->addr, vaddr))
+			continue;
+		if (svc->port != vport)
+			continue;
+		if (svc->protocol != protocol)
+			continue;
+		if (!net_eq(svc->net, net))
+			continue;
+
+		return svc;	/* HIT */
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Look up a service by {netns, af, fwmark} in ip_vs_svc_fwm_table.
+ *	Returns the matching service or NULL; no reference is taken here.
+ */
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
+{
+	struct ip_vs_service *svc;
+	unsigned int bucket = ip_vs_svc_fwm_hashkey(net, fwmark);
+
+	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
+		if (svc->fwmark != fwmark)
+			continue;
+		if (svc->af != af)
+			continue;
+		if (!net_eq(svc->net, net))
+			continue;
+
+		return svc;	/* HIT */
+	}
+
+	return NULL;
+}
+
+/*
+ * Find service, called under RCU lock.
+ *
+ * Lookup order:
+ *   1. fwmark table, when @fwmark is non-zero;
+ *   2. exact <protocol,addr,port> match;
+ *   3. for TCP, if any FTP service is registered and the port is
+ *      FTPDATA or >= PROT_SOCK: the FTP control service on FTPPORT,
+ *      since the packet might belong to an FTP data connection;
+ *   4. if any port-zero service is registered: the catch-all entry.
+ * Returns the first hit or NULL.
+ */
+struct ip_vs_service *
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+		   const union nf_inet_addr *vaddr, __be16 vport)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_service *svc = NULL;
+
+	if (fwmark)
+		svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+
+	if (!svc) {
+		/* "full" addressed entries */
+		svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
+
+		/* possible FTP data connection */
+		if (!svc && protocol == IPPROTO_TCP &&
+		    atomic_read(&ipvs->ftpsvc_counter) &&
+		    (vport == FTPDATA || ntohs(vport) >= PROT_SOCK))
+			svc = __ip_vs_service_find(net, af, protocol, vaddr,
+						   FTPPORT);
+
+		/* catch-all port (port zero) */
+		if (!svc && atomic_read(&ipvs->nullsvc_counter))
+			svc = __ip_vs_service_find(net, af, protocol, vaddr,
+						   0);
+	}
+
+	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+		      fwmark, ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+		      svc ? "hit" : "not hit");
+
+	return svc;
+}
+
+
+/*
+ * Attach @dest to @svc: take a reference on the service, then publish
+ * the pointer in dest->svc with rcu_assign_pointer().
+ */
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	/* reference first, so the pointer is never published unreferenced */
+	atomic_inc(&svc->refcnt);
+	rcu_assign_pointer(dest->svc, svc);
+}
+
+/*
+ * Release a service structure together with its per-cpu statistics.
+ * free_percpu() accepts a NULL pointer, so a service whose cpustats
+ * were never allocated needs no special case (ip_vs_dest_free() calls
+ * it unguarded as well).
+ */
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{
+	free_percpu(svc->stats.cpustats);
+	kfree(svc);
+}
+
+/* RCU callback: free the service that embeds @head as its rcu_head */
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{
+	ip_vs_service_free(container_of(head, struct ip_vs_service, rcu_head));
+}
+
+/*
+ * Drop one reference on @svc.  When it was the last one, free the
+ * service: through call_rcu() if @do_delay is set, immediately
+ * otherwise.
+ */
+static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+{
+	if (!atomic_dec_and_test(&svc->refcnt))
+		return;
+
+	IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
+		      svc->fwmark,
+		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+		      ntohs(svc->port));
+
+	if (!do_delay) {
+		ip_vs_service_free(svc);
+		return;
+	}
+	call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
+}
+
+
+/*
+ *	Bucket index in rs_table for a real service: the address (an
+ *	IPv6 address is first folded to 32 bits by XOR-ing its four
+ *	words) combined with the port, masked to the table size.
+ */
+static inline unsigned int ip_vs_rs_hashkey(int af,
+					    const union nf_inet_addr *addr,
+					    __be16 port)
+{
+	unsigned int hport = ntohs(port);
+	__be32 folded;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	return (ntohl(folded) ^ (hport >> IP_VS_RTAB_BITS) ^ hport)
+		& IP_VS_RTAB_MASK;
+}
+
+/*
+ * Add @dest to ipvs->rs_table, keyed by its address family, address and
+ * port.  Does nothing when the dest is already in the table.
+ */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+{
+	struct hlist_head *bucket;
+
+	if (dest->in_rs_table)
+		return;
+
+	bucket = &ipvs->rs_table[ip_vs_rs_hashkey(dest->af, &dest->addr,
+						  dest->port)];
+	hlist_add_head_rcu(&dest->d_list, bucket);
+	dest->in_rs_table = 1;
+}
+
+/* Take @dest out of rs_table; a no-op when it is not in the table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+	if (!dest->in_rs_table)
+		return;
+
+	hlist_del_rcu(&dest->d_list);
+	dest->in_rs_table = 0;
+}
+
+/*
+ * Check whether a real service with the given <af,addr,port> is present
+ * in this netns' rs_table.  An entry matches when its protocol equals
+ * @protocol or when it belongs to a fwmark-based service.
+ * Takes the RCU read lock itself for the duration of the walk.
+ */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+			    const union nf_inet_addr *daddr, __be16 dport)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	unsigned int bucket = ip_vs_rs_hashkey(af, daddr, dport);
+	struct ip_vs_dest *dest;
+	bool found = false;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[bucket], d_list) {
+		if (dest->port == dport &&
+		    dest->af == af &&
+		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
+		    (dest->protocol == protocol || dest->vfwmark)) {
+			found = true;	/* HIT */
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Find the destination with the given {addr,port} among the
+ * destinations of @svc.  Returns it, or NULL when there is none.
+ * Called under RCU lock.
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		  __be16 dport)
+{
+	struct ip_vs_dest *dest;
+
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+		if (dest->af != svc->af)
+			continue;
+		if (!ip_vs_addr_equal(svc->af, &dest->addr, daddr))
+			continue;
+		if (dest->port != dport)
+			continue;
+
+		return dest;	/* HIT */
+	}
+
+	return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Created to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ * Called under RCU lock, no refcnt is returned.
+ *
+ * Two ports are tried.  Normally @dport first; for a fwmark service
+ * whose forwarding method is not masquerading, port 0 first.  The
+ * second try uses "first ^ dport", i.e. 0 after dport and dport
+ * after 0.
+ */
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
+				   const union nf_inet_addr *daddr,
+				   __be16 dport,
+				   const union nf_inet_addr *vaddr,
+				   __be16 vport, __u16 protocol, __u32 fwmark,
+				   __u32 flags)
+{
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	__be16 first_port;
+
+	svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
+	if (!svc)
+		return NULL;
+
+	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+		first_port = 0;
+	else
+		first_port = dport;
+
+	dest = ip_vs_lookup_dest(svc, daddr, first_port);
+	if (dest)
+		return dest;
+
+	return ip_vs_lookup_dest(svc, daddr, first_port ^ dport);
+}
+
+/*
+ * RCU callback: release the cached route held by the ip_vs_dest_dst
+ * that embeds @head, then free the structure itself.
+ */
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_dest_dst *dd;
+
+	dd = container_of(head, struct ip_vs_dest_dst, rcu_head);
+	dst_release(dd->dst_cache);
+	kfree(dd);
+}
+
+/*
+ * Release dest_dst and dst_cache for dest in user context.
+ * The pointer is cleared right away; the old object is released via
+ * call_rcu().
+ */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_dst *prev;
+
+	prev = rcu_dereference_protected(dest->dest_dst, 1);
+	if (!prev)
+		return;
+
+	RCU_INIT_POINTER(dest->dest_dst, NULL);
+	call_rcu(&prev->rcu_head, ip_vs_dest_dst_rcu_free);
+}
+
+/*
+ * Lookup dest by {svc,addr,port} in the destination trash.
+ * The destination trash holds destinations that were removed from the
+ * service table but are still referenced by some conn entries.  When a
+ * dest is only temporarily down (taken out by the administrator or a
+ * monitor program) it can be picked back from the trash, so that its
+ * remaining connections continue and its counters stay available for
+ * scheduling.
+ *
+ * On a hit the dest is unlinked from the trash list, a reference is
+ * taken with ip_vs_dest_hold() and the dest is returned; otherwise NULL.
+ * The walk is done under dest_trash_lock.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		     __be16 dport)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	struct ip_vs_dest *pos, *found = NULL;
+
+	spin_lock_bh(&ipvs->dest_trash_lock);
+	list_for_each_entry(pos, &ipvs->dest_trash, t_list) {
+		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+			      "dest->refcnt=%d\n",
+			      pos->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &pos->addr),
+			      ntohs(pos->port),
+			      atomic_read(&pos->refcnt));
+		/* same real server, and it belonged to this very service */
+		if (pos->af == svc->af &&
+		    ip_vs_addr_equal(svc->af, &pos->addr, daddr) &&
+		    pos->port == dport &&
+		    pos->vfwmark == svc->fwmark &&
+		    pos->protocol == svc->protocol &&
+		    (svc->fwmark ||
+		     (ip_vs_addr_equal(svc->af, &pos->vaddr, &svc->addr) &&
+		      pos->vport == svc->port))) {
+			/* HIT */
+			list_del(&pos->t_list);
+			ip_vs_dest_hold(pos);
+			found = pos;
+			break;
+		}
+	}
+	spin_unlock_bh(&ipvs->dest_trash_lock);
+
+	return found;
+}
+
+/*
+ * Final teardown of a destination: reset its route cache, drop its
+ * reference on the owning service (freed without RCU delay if that was
+ * the last one), free the per-cpu stats and put/free the dest itself.
+ */
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+	struct ip_vs_service *owner;
+
+	owner = rcu_dereference_protected(dest->svc, 1);
+
+	__ip_vs_dst_cache_reset(dest);
+	__ip_vs_svc_put(owner, false);
+	free_percpu(dest->stats.cpustats);
+	ip_vs_dest_put_and_free(dest);
+}
+
+/*
+ * Clean up all the destinations in the trash
+ * Called by the ip_vs_control_cleanup()
+ *
+ * When ip_vs_control_cleanup is triggered by ipvs module exit, the
+ * service tables must have been flushed and all the connections are
+ * expired, so the refcnt of each destination in the trash must be 0
+ * and they can simply be released here.
+ */
+static void ip_vs_trash_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_dest *cur, *tmp;
+
+	/* stop the trash timer before touching the list */
+	del_timer_sync(&ipvs->dest_trash_timer);
+
+	/* No need to use dest_trash_lock */
+	list_for_each_entry_safe(cur, tmp, &ipvs->dest_trash, t_list) {
+		list_del(&cur->t_list);
+		ip_vs_dest_free(cur);
+	}
+}
+
+/*
+ * Fill the user-visible stats @dst from @src under src->lock.
+ * Each counter is reported relative to its zero point (ustats minus
+ * ustats0, see ip_vs_zero_stats()); the rate fields are filled in by
+ * ip_vs_read_estimator().
+ */
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+/* NOTE: this macro stays defined for the rest of the file */
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
+
+ spin_lock_bh(&src->lock);
+
+ IP_VS_SHOW_STATS_COUNTER(conns);
+ IP_VS_SHOW_STATS_COUNTER(inpkts);
+ IP_VS_SHOW_STATS_COUNTER(outpkts);
+ IP_VS_SHOW_STATS_COUNTER(inbytes);
+ IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+ ip_vs_read_estimator(dst, src);
+
+ spin_unlock_bh(&src->lock);
+}
+
+/*
+ * "Zero" the statistics without touching the running counters: the
+ * current counter values are saved as the new zero point (ustats0)
+ * and the estimator is zeroed, all under stats->lock.
+ */
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+ spin_lock_bh(&stats->lock);
+
+ /* get current counters as zero point, rates are zeroed */
+
+/* NOTE: this macro stays defined for the rest of the file */
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
+
+ IP_VS_ZERO_STATS_COUNTER(conns);
+ IP_VS_ZERO_STATS_COUNTER(inpkts);
+ IP_VS_ZERO_STATS_COUNTER(outpkts);
+ IP_VS_ZERO_STATS_COUNTER(inbytes);
+ IP_VS_ZERO_STATS_COUNTER(outbytes);
+
+ ip_vs_zero_estimator(stats);
+
+ spin_unlock_bh(&stats->lock);
+}
+
+/*
+ * Update a destination in the given service
+ *
+ * Applies the user-supplied settings in @udest (weight, connection
+ * flags, upper/lower thresholds) to @dest, binds @dest to @svc, marks
+ * it available and resets its cached route.
+ *
+ * @add: non-zero when the dest is being added to the service -- its
+ *       estimator is started, it is linked into svc->destinations and
+ *       the scheduler's add_dest hook (if any) is called; zero for an
+ *       update of an existing dest, where only the scheduler's
+ *       upd_dest hook (if any) is called.
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+ struct ip_vs_dest_user_kern *udest, int add)
+{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_service *old_svc;
+ struct ip_vs_scheduler *sched;
+ int conn_flags;
+
+ /* set the weight and the flags */
+ atomic_set(&dest->weight, udest->weight);
+ conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+ conn_flags |= IP_VS_CONN_F_INACTIVE;
+
+ /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
+ conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+ } else {
+ /*
+ * Put the real service in rs_table if not present.
+ * For now only for NAT!
+ */
+ ip_vs_rs_hash(ipvs, dest);
+ }
+ atomic_set(&dest->conn_flags, conn_flags);
+
+ /* bind the service */
+ old_svc = rcu_dereference_protected(dest->svc, 1);
+ if (!old_svc) {
+ __ip_vs_bind_svc(dest, svc);
+ } else {
+ /* dest moves to a different service: restart its stats from
+ * zero and release the reference on the previous service */
+ if (old_svc != svc) {
+ ip_vs_zero_stats(&dest->stats);
+ __ip_vs_bind_svc(dest, svc);
+ __ip_vs_svc_put(old_svc, true);
+ }
+ }
+
+ /* set the dest status flags */
+ dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+ /* clear the overload flag when the upper threshold is removed (0)
+ * or raised above the previous one */
+ if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ dest->u_threshold = udest->u_threshold;
+ dest->l_threshold = udest->l_threshold;
+
+ /* drop the cached route under dst_lock */
+ spin_lock_bh(&dest->dst_lock);
+ __ip_vs_dst_cache_reset(dest);
+ spin_unlock_bh(&dest->dst_lock);
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (add) {
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
+ }
+}
+
+
+/*
+ *	Allocate and initialise a destination for @svc from @udest, then link
+ *	it in via __ip_vs_update_dest().
+ *	Returns 0 and stores the dest in *@dest_p on success, -EINVAL when
+ *	the address type is rejected, -ENOMEM when an allocation fails.
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
+	       struct ip_vs_dest **dest_p)
+{
+	struct ip_vs_dest *dest;
+	unsigned int atype, cpu;
+
+	EnterFunction(2);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6) {
+		atype = ipv6_addr_type(&udest->addr.in6);
+		if ((!(atype & IPV6_ADDR_UNICAST) ||
+		     atype & IPV6_ADDR_LINKLOCAL) &&
+		    !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
+			return -EINVAL;
+	} else
+#endif
+	{
+		atype = inet_addr_type(svc->net, udest->addr.ip);
+		if (!(atype == RTN_LOCAL || atype == RTN_UNICAST))
+			return -EINVAL;
+	}
+
+	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
+	if (!dest)
+		return -ENOMEM;
+
+	dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+	if (!dest->stats.cpustats) {
+		/* nothing but the dest itself to undo */
+		kfree(dest);
+		return -ENOMEM;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct ip_vs_cpu_stats *cs;
+
+		cs = per_cpu_ptr(dest->stats.cpustats, cpu);
+		u64_stats_init(&cs->syncp);
+	}
+
+	/* virtual side is copied from the service, real side from the request */
+	dest->af = svc->af;
+	dest->protocol = svc->protocol;
+	dest->vaddr = svc->addr;
+	dest->vport = svc->port;
+	dest->vfwmark = svc->fwmark;
+	ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
+	dest->port = udest->port;
+
+	atomic_set(&dest->activeconns, 0);
+	atomic_set(&dest->inactconns, 0);
+	atomic_set(&dest->persistconns, 0);
+	atomic_set(&dest->refcnt, 1);
+
+	INIT_HLIST_NODE(&dest->d_list);
+	spin_lock_init(&dest->dst_lock);
+	spin_lock_init(&dest->stats.lock);
+	__ip_vs_update_dest(svc, dest, udest, 1);
+
+	*dest_p = dest;
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+/*
+ *	Add the destination described by @udest to the existing service @svc.
+ *	A matching dest found in the trash is revived, otherwise a new one is
+ *	allocated.  Returns 0 on success, -ERANGE for a bad weight/threshold,
+ *	-EEXIST when the dest is already part of the service, or the error
+ *	from ip_vs_new_dest().
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *dest;
+	union nf_inet_addr daddr;
+	__be16 dport = udest->port;
+	int ret;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		pr_err("%s(): server weight less than zero\n", __func__);
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		pr_err("%s(): lower threshold is higher than upper threshold\n",
+		       __func__);
+		return -ERANGE;
+	}
+
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+	/* the lookup helper needs the RCU read lock */
+	rcu_read_lock();
+	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+	rcu_read_unlock();
+
+	if (dest) {
+		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
+		return -EEXIST;
+	}
+
+	/* a dest of the same service may still be sitting in the trash */
+	dest = ip_vs_trash_get_dest(svc, &daddr, dport);
+
+	if (!dest) {
+		/* nothing to revive: allocate and initialise a new dest */
+		ret = ip_vs_new_dest(svc, udest, &dest);
+	} else {
+		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+			      "dest->refcnt=%d, service %u/%s:%u\n",
+			      IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
+			      atomic_read(&dest->refcnt),
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+			      ntohs(dest->vport));
+
+		__ip_vs_update_dest(svc, dest, udest, 1);
+		ret = 0;
+	}
+	LeaveFunction(2);
+
+	return ret;
+}
+
+
+/*
+ *	Update an existing destination of @svc with the settings in @udest.
+ *	Returns 0 on success, -ERANGE for a bad weight/threshold and -ENOENT
+ *	when the service has no such destination.
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *dest;
+	union nf_inet_addr daddr;
+	__be16 dport = udest->port;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		pr_err("%s(): server weight less than zero\n", __func__);
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		pr_err("%s(): lower threshold is higher than upper threshold\n",
+		       __func__);
+		return -ERANGE;
+	}
+
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+	/* the lookup helper needs the RCU read lock */
+	rcu_read_lock();
+	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+	rcu_read_unlock();
+
+	if (!dest) {
+		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
+		return -ENOENT;
+	}
+
+	__ip_vs_update_dest(svc, dest, udest, 0);
+	LeaveFunction(2);
+
+	return 0;
+}
+
+/*
+ *	Retire a destination that is already unlinked from its service:
+ *	stop its estimator, unhash it from the real services and move it to
+ *	the per-netns trash list.  Unless @cleanup is set, the trash timer is
+ *	armed when the trash list was empty.
+ */
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+			     bool cleanup)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_stop_estimator(net, &dest->stats);
+
+	/* take it off the d-linked list with the real services */
+	ip_vs_rs_unhash(dest);
+
+	spin_lock_bh(&ipvs->dest_trash_lock);
+	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->refcnt));
+	if (!cleanup && list_empty(&ipvs->dest_trash))
+		mod_timer(&ipvs->dest_trash_timer,
+			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+	/* the trash list holds no reference on the dest */
+	list_add(&dest->t_list, &ipvs->dest_trash);
+	dest->idle_start = 0;
+	spin_unlock_bh(&ipvs->dest_trash_lock);
+	ip_vs_dest_put(dest);
+}
+
+
+/*
+ *	Unlink @dest from @svc: clear its AVAILABLE flag and remove it from
+ *	the destination list.  With @svcupd set, the scheduler's del_dest
+ *	hook (if any) is called as well.
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+				struct ip_vs_dest *dest,
+				int svcupd)
+{
+	struct ip_vs_scheduler *scheduler;
+
+	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+	/* off the d-linked destination list */
+	list_del_rcu(&dest->n_list);
+	svc->num_dests--;
+
+	if (!svcupd)
+		return;
+
+	scheduler = rcu_dereference_protected(svc->scheduler, 1);
+	if (scheduler->del_dest)
+		scheduler->del_dest(svc, dest);
+}
+
+
+/*
+ *	Delete the destination server described by @udest from @svc.
+ *	Returns 0 on success or -ENOENT when no such destination exists.
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *dest;
+	__be16 dport = udest->port;
+
+	EnterFunction(2);
+
+	/* the lookup helper needs the RCU read lock */
+	rcu_read_lock();
+	dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+	rcu_read_unlock();
+
+	if (!dest) {
+		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
+		return -ENOENT;
+	}
+
+	/* first unlink from the service (notifying the scheduler) ... */
+	__ip_vs_unlink_dest(svc, dest, 1);
+
+	/* ... then delete the destination itself */
+	__ip_vs_del_dest(svc->net, dest, false);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+/*
+ * Trash timer handler; @data carries the struct net pointer.
+ * A dest with refcnt 0 gets idle_start stamped on the first pass and is
+ * freed on a later pass once IP_VS_DEST_TRASH_PERIOD has elapsed since then.
+ * The timer is re-armed while the trash list is not empty.
+ */
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+	struct net *net = (struct net *) data;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_dest *dest, *tmp;
+	unsigned long now = jiffies;
+
+	spin_lock(&ipvs->dest_trash_lock);
+	list_for_each_entry_safe(dest, tmp, &ipvs->dest_trash, t_list) {
+		/* still referenced: leave it alone */
+		if (atomic_read(&dest->refcnt) > 0)
+			continue;
+
+		if (!dest->idle_start) {
+			/* first idle sighting; never store 0, it means "unset" */
+			dest->idle_start = max(1UL, now);
+			continue;
+		}
+
+		if (time_before(now, dest->idle_start +
+				     IP_VS_DEST_TRASH_PERIOD))
+			continue;
+
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port));
+		list_del(&dest->t_list);
+		ip_vs_dest_free(dest);
+	}
+	if (!list_empty(&ipvs->dest_trash))
+		mod_timer(&ipvs->dest_trash_timer,
+			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+	spin_unlock(&ipvs->dest_trash_lock);
+}
+
+/*
+ * Add a service into the service hash table
+ *
+ * Builds a new service in @net from the request @u, binds the scheduler
+ * (and the persistence engine, when u->pe_name is set), starts the
+ * estimator and hashes the service.
+ * Returns 0 and stores the service in *@svc_p on success.  On failure
+ * returns -ENOENT (scheduler or persistence engine not found), -EINVAL
+ * (IPv6 netmask outside 1..128), -ENOMEM, or the error from
+ * ip_vs_bind_scheduler(); everything acquired so far is released at out_err.
+ */
+static int
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
+ struct ip_vs_service **svc_p)
+{
+ int ret = 0, i;
+ struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_pe *pe = NULL;
+ struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /* increase the module use count (dropped again at out_err) */
+ ip_vs_use_count_inc();
+
+ /* Lookup the scheduler by 'u->sched_name' */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_getbyname(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask; /* checked against 1..128 below */
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ }
+#endif
+
+ svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
+ if (svc == NULL) {
+ IP_VS_DBG(1, "%s(): no memory\n", __func__);
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_stats;
+ ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
+ u64_stats_init(&ip_vs_stats->syncp);
+ }
+
+
+ /* refcnt starts at 0; NOTE(review): the old "first user" wording did not match this value */
+ atomic_set(&svc->refcnt, 0);
+
+ svc->af = u->af;
+ svc->protocol = u->protocol;
+ ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
+ svc->port = u->port;
+ svc->fwmark = u->fwmark;
+ svc->flags = u->flags;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+ svc->net = net;
+
+ INIT_LIST_HEAD(&svc->destinations);
+ spin_lock_init(&svc->sched_lock);
+ spin_lock_init(&svc->stats.lock);
+
+ /* Bind the scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL; /* bound to svc now; the local handle is no longer used */
+
+ /* Bind the persistence engine (pe stays NULL when no pe_name was given) */
+ RCU_INIT_POINTER(svc->pe, pe);
+ pe = NULL;
+
+ /* Update the virtual service counters */
+ if (svc->port == FTPPORT)
+ atomic_inc(&ipvs->ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_inc(&ipvs->nullsvc_counter);
+
+ ip_vs_start_estimator(net, &svc->stats);
+
+ /* Count only IPv4 services for old get/setsockopt interface */
+ if (svc->af == AF_INET)
+ ipvs->num_services++;
+
+ /* Hash the service into the service table */
+ ip_vs_svc_hash(svc);
+
+ *svc_p = svc;
+ /* Now there is a service - full throttle */
+ ipvs->enable = 1;
+ return 0;
+
+
+ out_err:
+ if (svc != NULL) {
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
+ }
+ ip_vs_scheduler_put(sched);
+ ip_vs_pe_put(pe);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+/*
+ * Edit a service and bind it with a new scheduler
+ *
+ * Looks up the scheduler (and optional persistence engine) named in @u,
+ * rebinds @svc when the scheduler differs from the current one, then
+ * updates flags, timeout, netmask and the persistence engine pointer.
+ * Returns 0 on success, -ENOENT when the scheduler or persistence engine
+ * is not found, -EINVAL for an IPv6 netmask outside 1..128, or the error
+ * from ip_vs_bind_scheduler().
+ *
+ * old_sched/old_pe always name the object that is put at "out": the
+ * freshly looked-up one on error paths, the previously bound one after a
+ * successful switch.
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
+{
+ struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_pe *pe = NULL, *old_pe = NULL;
+ int ret = 0;
+
+ /*
+ * Lookup the scheduler, by 'u->sched_name'
+ */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+ return -ENOENT;
+ }
+ old_sched = sched; /* until the switch succeeds, "out" puts the new lookup */
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_getbyname(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out;
+ }
+ old_pe = pe; /* same idea: put the new lookup if we bail out below */
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask; /* checked against 1..128 below */
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+#endif
+
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched; /* bind failed: put the new scheduler instead */
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
+
+ /*
+ * Set the flags and timeout value
+ */
+ svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
+
+out:
+ ip_vs_scheduler_put(old_sched);
+ ip_vs_pe_put(old_pe);
+ return ret;
+}
+
+/*
+ *	Tear down a service that is already unlinked from the service list.
+ *	- The service must be unlinked, unlocked and not referenced!
+ *	- We are called under _bh lock
+ *	Every destination is unlinked and moved to the trash; @cleanup is
+ *	passed through to __ip_vs_del_dest().
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	struct ip_vs_scheduler *bound_sched;
+	struct ip_vs_pe *bound_pe;
+	struct ip_vs_dest *dest, *tmp;
+
+	pr_info("%s: enter\n", __func__);
+
+	/* only IPv4 services are counted for the old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ipvs->num_services--;
+
+	ip_vs_stop_estimator(svc->net, &svc->stats);
+
+	/* unbind the scheduler and release it */
+	bound_sched = rcu_dereference_protected(svc->scheduler, 1);
+	ip_vs_unbind_scheduler(svc, bound_sched);
+	ip_vs_scheduler_put(bound_sched);
+
+	/* release the persistence engine; svc->pe itself is left in place */
+	bound_pe = rcu_dereference_protected(svc->pe, 1);
+	ip_vs_pe_put(bound_pe);
+
+	/* unlink the whole destination list */
+	list_for_each_entry_safe(dest, tmp, &svc->destinations, n_list) {
+		__ip_vs_unlink_dest(svc, dest, 0);
+		__ip_vs_del_dest(svc->net, dest, cleanup);
+	}
+
+	/* update the virtual service counters */
+	if (svc->port == FTPPORT)
+		atomic_dec(&ipvs->ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_dec(&ipvs->nullsvc_counter);
+
+	/* free the service if nobody refers to it */
+	__ip_vs_svc_put(svc, true);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+/*
+ *	Unhash a service from the service table and hand it to
+ *	__ip_vs_del_service(), which drops the reference taken here.
+ */
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
+{
+	/* hold svc to avoid a double release from dest_trash */
+	atomic_inc(&svc->refcnt);
+
+	/* unhash it from the service table */
+	ip_vs_svc_unhash(svc);
+
+	__ip_vs_del_service(svc, cleanup);
+}
+
+/*
+ *	Delete a service from the service list.
+ *	Returns 0, or -EEXIST when @svc is NULL.
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (!svc)
+		return -EEXIST;
+
+	ip_vs_unlink_service(svc, false);
+	return 0;
+}
+
+
+/*
+ *	Flush all the virtual services that belong to @net, from both the
+ *	<netns,protocol,addr,port> table and the fwmark table.
+ *	Always returns 0.
+ */
+static int ip_vs_flush(struct net *net, bool cleanup)
+{
+	struct ip_vs_service *svc;
+	struct hlist_node *tmp;
+	int bucket;
+
+	/* table hashed by <netns,protocol,addr,port> */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry_safe(svc, tmp, &ip_vs_svc_table[bucket],
+					  s_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			ip_vs_unlink_service(svc, cleanup);
+		}
+	}
+
+	/* table hashed by fwmark */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry_safe(svc, tmp,
+					  &ip_vs_svc_fwm_table[bucket],
+					  f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			ip_vs_unlink_service(svc, cleanup);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *	Delete every service of netns @net from the service tables.
+ *	Called by __ip_vs_cleanup(); flushes with cleanup=true while holding
+ *	__ip_vs_mutex.
+ */
+void ip_vs_service_net_cleanup(struct net *net)
+{
+	EnterFunction(2);
+
+	mutex_lock(&__ip_vs_mutex);
+	ip_vs_flush(net, true);
+	mutex_unlock(&__ip_vs_mutex);
+
+	LeaveFunction(2);
+}
+
+/*
+ * Reset the cached dst of @dest when it points at @dev, so that all
+ * references to the device held through dst_cache are put.
+ */
+static inline void
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
+{
+	struct ip_vs_dest_dst *cached;
+
+	spin_lock_bh(&dest->dst_lock);
+	cached = rcu_dereference_protected(dest->dest_dst, 1);
+	if (cached && cached->dst_cache->dev == dev) {
+		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
+			      dev->name,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		__ip_vs_dst_cache_reset(dest);
+	}
+	spin_unlock_bh(&dest->dst_lock);
+}
+/* Netdev event receiver
+ * Only NETDEV_DOWN is acted upon: cached dsts that point at the device are
+ * reset for every destination of the device's netns, both those linked to
+ * a service and those in the trash.  Always returns NOTIFY_DONE.
+ */
+static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	unsigned int bucket;
+
+	if (event != NETDEV_DOWN || !ipvs)
+		return NOTIFY_DONE;
+	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+	EnterFunction(2);
+	mutex_lock(&__ip_vs_mutex);
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		/* services hashed by protocol/addr/port */
+		hlist_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			list_for_each_entry(dest, &svc->destinations, n_list)
+				ip_vs_forget_dev(dest, dev);
+		}
+
+		/* services hashed by fwmark */
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket],
+				     f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			list_for_each_entry(dest, &svc->destinations, n_list)
+				ip_vs_forget_dev(dest, dev);
+		}
+	}
+
+	/* destinations parked in the trash may cache the device too */
+	spin_lock_bh(&ipvs->dest_trash_lock);
+	list_for_each_entry(dest, &ipvs->dest_trash, t_list)
+		ip_vs_forget_dev(dest, dev);
+	spin_unlock_bh(&ipvs->dest_trash_lock);
+	mutex_unlock(&__ip_vs_mutex);
+	LeaveFunction(2);
+	return NOTIFY_DONE;
+}
+
+/*
+ *	Zero the counters of one service: first every destination on its
+ *	list, then the service itself.  Always returns 0.
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+
+	list_for_each_entry(dest, &svc->destinations, n_list)
+		ip_vs_zero_stats(&dest->stats);
+
+	ip_vs_zero_stats(&svc->stats);
+	return 0;
+}
+
+/*
+ * Zero the counters of every service in @net (both service tables) and
+ * finally the netns-wide totals.  Always returns 0.
+ */
+static int ip_vs_zero_all(struct net *net)
+{
+	struct ip_vs_service *svc;
+	int bucket;
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket],
+				     f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int zero; /* lower bound (.extra1) of the "sync_retries" sysctl entry */
+static int three = 3; /* upper bound (.extra2) of the "sync_retries" sysctl entry */
+
+/*
+ * sysctl handler for the defense mode knobs.  A written value outside
+ * 0..3 is rolled back to the previous one; an accepted change triggers
+ * update_defense_level() for the caller's netns.
+ * Returns the result of proc_dointvec().
+ */
+static int
+proc_do_defense_mode(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	int *valp = table->data;
+	int saved = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!write || *valp == saved)
+		return rc;
+
+	if (*valp < 0 || *valp > 3)
+		*valp = saved;	/* restore the correct value */
+	else
+		update_defense_level(net_ipvs(net));
+
+	return rc;
+}
+
+/*
+ * sysctl handler for the two-int sync threshold.  After a write the pair
+ * is rolled back when either value is negative, or when the second value
+ * is non-zero and the first is not below it.
+ * Returns the result of proc_dointvec().
+ */
+static int
+proc_do_sync_threshold(struct ctl_table *table, int write,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int saved[2];
+	int rc;
+
+	/* keep a copy so a bad write can be undone */
+	memcpy(saved, valp, sizeof(saved));
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!write)
+		return rc;
+
+	if (valp[0] < 0 || valp[1] < 0 ||
+	    (valp[0] >= valp[1] && valp[1])) {
+		/* restore the correct value */
+		memcpy(valp, saved, sizeof(saved));
+	}
+	return rc;
+}
+
+/*
+ * sysctl handler that accepts only 0 or 1; any other written value is
+ * rolled back to the previous one.  Returns the result of proc_dointvec().
+ */
+static int
+proc_do_sync_mode(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int saved = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!write || *valp == saved)
+		return rc;
+
+	if (*valp < 0 || *valp > 1)
+		*valp = saved;	/* restore the correct value */
+
+	return rc;
+}
+
+/*
+ * sysctl handler that accepts only positive powers of two; any other
+ * written value is rolled back to the previous one.
+ * Returns the result of proc_dointvec().
+ */
+static int
+proc_do_sync_ports(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int saved = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!write || *valp == saved)
+		return rc;
+
+	if (*valp < 1 || !is_power_of_2(*valp))
+		*valp = saved;	/* restore the correct value */
+
+	return rc;
+}
+
+/*
+ *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ *	Do not change the order or insert new entries without aligning
+ *	with the netns init in ip_vs_control_net_init().
+ *
+ *	Apart from "debug_level", the entries carry no .data pointer here.
+ *	Handlers are named without a leading '&' throughout.
+ */
+
+static struct ctl_table vs_vars[] = {
+	{
+		.procname	= "amemthresh",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "am_droprate",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "drop_entry",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_defense_mode,
+	},
+	{
+		.procname	= "drop_packet",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_defense_mode,
+	},
+#ifdef CONFIG_IP_VS_NFCT
+	{
+		.procname	= "conntrack",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
+	{
+		.procname	= "secure_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_defense_mode,
+	},
+	{
+		.procname	= "snat_reroute",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sync_version",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_sync_mode,
+	},
+	{
+		.procname	= "sync_ports",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_sync_ports,
+	},
+	{
+		.procname	= "sync_persist_mode",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sync_qlen_max",
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "sync_sock_size",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cache_bypass",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expire_nodest_conn",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sloppy_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sloppy_sctp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expire_quiescent_template",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sync_threshold",
+		.maxlen		=
+			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+		.mode		= 0644,
+		.proc_handler	= proc_do_sync_threshold,
+	},
+	{
+		.procname	= "sync_refresh_period",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		/* clamped to [zero, three] by proc_dointvec_minmax */
+		.procname	= "sync_retries",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &three,
+	},
+	{
+		.procname	= "nat_icmp_send",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "pmtu_disc",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "backup_only",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#ifdef CONFIG_IP_VS_DEBUG
+	{
+		.procname	= "debug_level",
+		.data		= &sysctl_ip_vs_debug_level,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
+	{ }
+};
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+struct ip_vs_iter {
+ struct seq_net_private p; /* Do not move this, netns depends upon it*/
+ struct hlist_head *table; /* table being walked: ip_vs_svc_table or ip_vs_svc_fwm_table */
+ int bucket; /* index of the current hash bucket within that table */
+};
+
+/*
+ * Write the contents of the VS rule table to a PROCfs file.
+ * (It is kept just for backward compatibility)
+ */
+/*
+ * Map the forwarding-method bits of @flags to the label printed in the
+ * /proc listing; anything not matched below is reported as "Masq".
+ */
+static inline const char *ip_vs_fwd_name(unsigned int flags)
+{
+	const char *label;
+
+	switch (flags & IP_VS_CONN_F_FWD_MASK) {
+	case IP_VS_CONN_F_LOCALNODE:
+		label = "Local";
+		break;
+	case IP_VS_CONN_F_TUNNEL:
+		label = "Tunnel";
+		break;
+	case IP_VS_CONN_F_DROUTE:
+		label = "Route";
+		break;
+	default:
+		label = "Masq";
+		break;
+	}
+
+	return label;
+}
+
+
+/*
+ * Return the @pos-th service (0-based) of the seq file's netns, counting
+ * through the protocol table first and the fwmark table second, and
+ * record in the iterator which table and bucket it was found in.
+ * Returns NULL when fewer services exist.
+ */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct ip_vs_iter *iter = seq->private;
+	struct ip_vs_service *svc;
+	int bucket;
+
+	/* look in hash by protocol */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[bucket],
+					 s_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			if (pos-- != 0)
+				continue;
+			iter->table = ip_vs_svc_table;
+			iter->bucket = bucket;
+			return svc;
+		}
+	}
+
+	/* keep looking in fwmark */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[bucket],
+					 f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			if (pos-- != 0)
+				continue;
+			iter->table = ip_vs_svc_fwm_table;
+			iter->bucket = bucket;
+			return svc;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file ->start: take the RCU read lock, then return the header token
+ * for position 0 or the (*pos - 1)-th service otherwise.
+ */
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return ip_vs_info_array(seq, *pos - 1);
+}
+
+
+/*
+ * seq_file ->next: advance *@pos and return the entry after @v.
+ * The walk goes through the rest of the current bucket, then the
+ * following buckets of the protocol table, then the fwmark table.
+ * Returns NULL once both tables are exhausted.
+ */
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct hlist_node *node;
+	struct ip_vs_iter *iter;
+	struct ip_vs_service *svc;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_info_array(seq,0);
+
+	svc = v;
+	iter = seq->private;
+
+	if (iter->table == ip_vs_svc_table) {
+		/* next service in table hashed by protocol */
+		node = rcu_dereference(hlist_next_rcu(&svc->s_list));
+		if (node)
+			return hlist_entry(node, struct ip_vs_service, s_list);
+
+		/* first entry of the next non-empty bucket */
+		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+			hlist_for_each_entry_rcu(svc,
+						 &ip_vs_svc_table[iter->bucket],
+						 s_list) {
+				return svc;
+			}
+		}
+
+		/* protocol table done: restart from the top of the fwmark table */
+		iter->table = ip_vs_svc_fwm_table;
+		iter->bucket = -1;
+	} else {
+		/* next service in table hashed by fwmark */
+		node = rcu_dereference(hlist_next_rcu(&svc->f_list));
+		if (node)
+			return hlist_entry(node, struct ip_vs_service, f_list);
+	}
+
+	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+		hlist_for_each_entry_rcu(svc,
+					 &ip_vs_svc_fwm_table[iter->bucket],
+					 f_list) {
+			return svc;
+		}
+	}
+
+	return NULL;
+}
+
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock(); /* pairs with rcu_read_lock() in ip_vs_info_seq_start() */
+}
+
+
+/*
+ * seq_file ->show: print the banner for SEQ_START_TOKEN, otherwise one
+ * line for the service @v followed by one line per destination.
+ * Services that belong to another netns are skipped.  Always returns 0.
+ */
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			"IP Virtual Server version %d.%d.%d (size=%d)\n",
+			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+		seq_puts(seq,
+			 "Prot LocalAddress:Port Scheduler Flags\n");
+		seq_puts(seq,
+			 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+	} else {
+		const struct ip_vs_service *svc = v;
+		const struct ip_vs_iter *iter = seq->private;
+		const struct ip_vs_dest *dest;
+		struct ip_vs_scheduler *sched;
+
+		/*
+		 * ip_vs_info_seq_next() walks the service tables bucket by
+		 * bucket without a netns check, so services of other
+		 * namespaces can be handed to us: print nothing for them.
+		 */
+		if (!net_eq(svc->net, seq_file_net(seq)))
+			return 0;
+
+		sched = rcu_dereference(svc->scheduler);
+
+		if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+			if (svc->af == AF_INET6)
+				seq_printf(seq, "%s [%pI6]:%04X %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   &svc->addr.in6,
+					   ntohs(svc->port),
+					   sched->name);
+			else
+#endif
+				seq_printf(seq, "%s %08X:%04X %s %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   ntohl(svc->addr.ip),
+					   ntohs(svc->port),
+					   sched->name,
+					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+		} else {
+			/* fwmark services are identified by the mark alone */
+			seq_printf(seq, "FWM %08X %s %s",
+				   svc->fwmark, sched->name,
+				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+		}
+
+		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+			seq_printf(seq, "persistent %d %08X\n",
+				svc->timeout,
+				ntohl(svc->netmask));
+		else
+			seq_putc(seq, '\n');
+
+		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+#ifdef CONFIG_IP_VS_IPV6
+			if (dest->af == AF_INET6)
+				seq_printf(seq,
+					   " -> [%pI6]:%04X"
+					   " %-7s %-6d %-10d %-10d\n",
+					   &dest->addr.in6,
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+			else
+#endif
+				seq_printf(seq,
+					   " -> %08X:%04X "
+					   "%-7s %-6d %-10d %-10d\n",
+					   ntohl(dest->addr.ip),
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+
+		}
+	}
+	return 0;
+}
+
+static const struct seq_operations ip_vs_info_seq_ops = {
+ .start = ip_vs_info_seq_start, /* takes rcu_read_lock() */
+ .next = ip_vs_info_seq_next,
+ .stop = ip_vs_info_seq_stop, /* drops rcu_read_lock() */
+ .show = ip_vs_info_seq_show,
+};
+
+/*
+ * open() for the service listing: a netns-aware seq_file whose private
+ * area is sized for a struct ip_vs_iter.
+ */
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	rc = seq_open_net(inode, file, &ip_vs_info_seq_ops,
+			  sizeof(struct ip_vs_iter));
+	return rc;
+}
+
+static const struct file_operations ip_vs_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_info_open, /* seq_open_net() with ip_vs_info_seq_ops */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net, /* counterpart of seq_open_net() */
+};
+
+/*
+ * Single-shot seq_file show routine: print the netns-wide totals and the
+ * estimated rates, all in hexadecimal.  Always returns 0.
+ */
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq_file_single_net(seq);
+	struct ip_vs_stats_user snap;
+
+/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		 " Total Incoming Outgoing Incoming Outgoing\n");
+	seq_puts(seq,
+		 " Conns Packets Packets Bytes Bytes\n");
+
+	/* take a snapshot of the totals, then print from the copy */
+	ip_vs_copy_stats(&snap, &net_ipvs(net)->tot_stats);
+	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", snap.conns,
+		   snap.inpkts, snap.outpkts,
+		   (unsigned long long) snap.inbytes,
+		   (unsigned long long) snap.outbytes);
+
+/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+	seq_printf(seq, "%8X %8X %8X %16X %16X\n",
+			snap.cps, snap.inpps, snap.outpps,
+			snap.inbps, snap.outbps);
+
+	return 0;
+}
+
+/* open() for the totals file: single-record, netns-aware seq_file. */
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	rc = single_open_net(inode, file, ip_vs_stats_show);
+	return rc;
+}
+
+static const struct file_operations ip_vs_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_seq_open, /* single_open_net() with ip_vs_stats_show */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net, /* counterpart of single_open_net() */
+};
+
+/*
+ * Single-shot seq_file show routine: one line of counters per possible
+ * CPU, then the netns-wide totals and estimated rates, all in
+ * hexadecimal.  Always returns 0.
+ */
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq_file_single_net(seq);
+	struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+	struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
+	struct ip_vs_stats_user rates;
+	int cpu;
+
+/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		 " Total Incoming Outgoing Incoming Outgoing\n");
+	seq_puts(seq,
+		 "CPU Conns Packets Packets Bytes Bytes\n");
+
+	for_each_possible_cpu(cpu) {
+		struct ip_vs_cpu_stats *cs = per_cpu_ptr(cpustats, cpu);
+		unsigned int seqno;
+		__u64 inbytes, outbytes;
+
+		/* retry until both byte counters come from one consistent read */
+		do {
+			seqno = u64_stats_fetch_begin_irq(&cs->syncp);
+			inbytes = cs->ustats.inbytes;
+			outbytes = cs->ustats.outbytes;
+		} while (u64_stats_fetch_retry_irq(&cs->syncp, seqno));
+
+		seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+			   cpu, cs->ustats.conns, cs->ustats.inpkts,
+			   cs->ustats.outpkts, (__u64)inbytes,
+			   (__u64)outbytes);
+	}
+
+	/* totals and rates are read under the stats lock */
+	spin_lock_bh(&tot_stats->lock);
+
+	seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
+		   tot_stats->ustats.conns, tot_stats->ustats.inpkts,
+		   tot_stats->ustats.outpkts,
+		   (unsigned long long) tot_stats->ustats.inbytes,
+		   (unsigned long long) tot_stats->ustats.outbytes);
+
+	ip_vs_read_estimator(&rates, tot_stats);
+
+	spin_unlock_bh(&tot_stats->lock);
+
+/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+	seq_printf(seq, " %8X %8X %8X %16X %16X\n",
+			rates.cps,
+			rates.inpps,
+			rates.outpps,
+			rates.inbps,
+			rates.outbps);
+
+	return 0;
+}
+
+/* Open hook of ip_vs_stats_percpu_fops: bind ip_vs_stats_percpu_show(). */
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = single_open_net(inode, file, ip_vs_stats_percpu_show);
+
+	return rc;
+}
+
+/* File operations pairing ip_vs_stats_percpu_seq_open() with the seq_file helpers. */
+static const struct file_operations ip_vs_stats_percpu_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= seq_lseek,
+	.read		= seq_read,
+	.open		= ip_vs_stats_percpu_seq_open,
+	.release	= single_release_net,
+};
+#endif
+
+/*
+ *	Set timeout values for tcp tcpfin udp in the timeout_table.
+ *
+ *	Every field of @u is a timeout in seconds; a zero field leaves the
+ *	corresponding table entry untouched.  Values are converted to
+ *	jiffies by multiplying with HZ, so anything negative or larger than
+ *	INT_MAX / HZ is refused before a single entry is modified.
+ *
+ *	Returns 0 on success, -EINVAL for an out-of-range timeout.
+ */
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+	struct ip_vs_proto_data *pd;
+#endif
+
+	/* Deny all timeouts that would overflow once scaled by HZ */
+	if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
+	    u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ) ||
+	    u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
+		return -EINVAL;
+
+	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+		  u->tcp_timeout,
+		  u->tcp_fin_timeout,
+		  u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout) {
+		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
+			= u->tcp_timeout * HZ;
+	}
+
+	if (u->tcp_fin_timeout) {
+		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
+			= u->tcp_fin_timeout * HZ;
+	}
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout) {
+		pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+		pd->timeout_table[IP_VS_UDP_S_NORMAL]
+			= u->udp_timeout * HZ;
+	}
+#endif
+	return 0;
+}
+
+
+/* Index of a IP_VS_SO_SET_* command inside set_arglen[] */
+#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
+#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
+				 sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
+/* largest argument of any set command; sizes the on-stack copy buffer */
+#define MAX_ARG_LEN		SVCDEST_ARG_LEN
+
+/*
+ * Exact argument length expected for every set command.  Entries are
+ * unsigned char, so every length listed here has to stay below 256.
+ */
+static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
+	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
+};
+
+/*
+ * Expand a legacy sockopt service description into the kernel-internal
+ * form.  The result is zeroed first and its address family is fixed to
+ * AF_INET.
+ */
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+				    struct ip_vs_service_user *usvc_compat)
+{
+	memset(usvc, 0, sizeof(*usvc));
+
+	/* fields identifying the service */
+	usvc->af	= AF_INET;
+	usvc->fwmark	= usvc_compat->fwmark;
+	usvc->protocol	= usvc_compat->protocol;
+	usvc->addr.ip	= usvc_compat->addr;
+	usvc->port	= usvc_compat->port;
+
+	/* service parameters */
+	usvc->netmask	= usvc_compat->netmask;
+	usvc->timeout	= usvc_compat->timeout;
+	usvc->flags	= usvc_compat->flags;
+
+	/* The scheduler name is aliased, not duplicated */
+	usvc->sched_name = usvc_compat->sched_name;
+}
+
+/*
+ * Expand a legacy sockopt destination description into the
+ * kernel-internal form; fields without a legacy counterpart stay zero.
+ */
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+				     struct ip_vs_dest_user *udest_compat)
+{
+	memset(udest, 0, sizeof(*udest));
+
+	/* real server address */
+	udest->port		= udest_compat->port;
+	udest->addr.ip		= udest_compat->addr;
+
+	/* forwarding and scheduling parameters */
+	udest->weight		= udest_compat->weight;
+	udest->conn_flags	= udest_compat->conn_flags;
+	udest->l_threshold	= udest_compat->l_threshold;
+	udest->u_threshold	= udest_compat->u_threshold;
+}
+
+/*
+ * Handler of the legacy setsockopt() interface (installed as .set in
+ * ip_vs_sockopts).
+ *
+ * @sk:   socket the option is set on; selects the network namespace
+ * @cmd:  one of the IP_VS_SO_SET_* commands
+ * @user: user-space buffer holding the command argument
+ * @len:  size of @user; has to equal set_arglen[] for @cmd
+ *
+ * Returns 0 on success or a negative errno: -EPERM without CAP_NET_ADMIN,
+ * -EINVAL for a bad command, a bad length or a name that is not
+ * NUL-terminated, -EFAULT when the argument cannot be copied or the
+ * protocol is not supported, -ERESTARTSYS when interrupted while waiting
+ * for a mutex, -ESRCH/-EEXIST depending on the service lookup, or the
+ * result of the helper that carries out the command.
+ */
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	struct net *net = sock_net(sk);
+	int ret;
+	unsigned char arg[MAX_ARG_LEN];
+	struct ip_vs_service_user *usvc_compat;
+	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
+	struct ip_vs_dest_user *udest_compat;
+	struct ip_vs_dest_user_kern udest;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+		return -EINVAL;
+	/* len is unsigned, only the upper bound can be violated */
+	if (len > MAX_ARG_LEN)
+		return -EINVAL;
+	if (len != set_arglen[SET_CMDID(cmd)]) {
+		pr_err("set_ctl: len %u != %u\n",
+		       len, set_arglen[SET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(arg, user, len) != 0)
+		return -EFAULT;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	/* Handle daemons since they have another lock */
+	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
+	    cmd == IP_VS_SO_SET_STOPDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+
+		/* The interface name comes straight from user space and is
+		 * used as a C string: refuse one that is not terminated
+		 * inside its buffer.
+		 */
+		if (cmd == IP_VS_SO_SET_STARTDAEMON &&
+		    strnlen(dm->mcast_ifn, IP_VS_IFNAME_MAXLEN) ==
+		    IP_VS_IFNAME_MAXLEN) {
+			ret = -EINVAL;
+			goto out_dec;
+		}
+
+		if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
+			ret = -ERESTARTSYS;
+			goto out_dec;
+		}
+		if (cmd == IP_VS_SO_SET_STARTDAEMON)
+			ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+						dm->syncid);
+		else
+			ret = stop_sync_thread(net, dm->state);
+		mutex_unlock(&ipvs->sync_mutex);
+		goto out_dec;
+	}
+
+	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
+		ret = -ERESTARTSYS;
+		goto out_dec;
+	}
+
+	if (cmd == IP_VS_SO_SET_FLUSH) {
+		/* Flush the virtual service */
+		ret = ip_vs_flush(net, false);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+		/* Set timeout values for (tcp tcpfin udp) */
+		ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
+		goto out_unlock;
+	}
+
+	usvc_compat = (struct ip_vs_service_user *)arg;
+	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+	/* We only use the new structs internally, so copy userspace compat
+	 * structs to extended internal versions */
+	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+	ip_vs_copy_udest_compat(&udest, udest_compat);
+
+	if (cmd == IP_VS_SO_SET_ZERO) {
+		/* if no service address is set, zero counters in all */
+		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
+			ret = ip_vs_zero_all(net);
+			goto out_unlock;
+		}
+	}
+
+	/* ADD and EDIT hand the scheduler name on as a C string, so it
+	 * has to be terminated inside the user-supplied buffer.
+	 */
+	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
+	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
+	    IP_VS_SCHEDNAME_MAXLEN) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
+	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
+	    usvc.protocol != IPPROTO_SCTP) {
+		/* sched_name is not printed: for commands other than
+		 * ADD/EDIT it has not been checked for termination
+		 */
+		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
+		       usvc.protocol, &usvc.addr.ip,
+		       ntohs(usvc.port));
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	rcu_read_lock();
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
+	else
+		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+	rcu_read_unlock();
+
+	/* every command except ADD needs an existing, matching service */
+	if (cmd != IP_VS_SO_SET_ADD
+	    && (svc == NULL || svc->protocol != usvc.protocol)) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	switch (cmd) {
+	case IP_VS_SO_SET_ADD:
+		if (svc != NULL)
+			ret = -EEXIST;
+		else
+			ret = ip_vs_add_service(net, &usvc, &svc);
+		break;
+	case IP_VS_SO_SET_EDIT:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IP_VS_SO_SET_DEL:
+		ret = ip_vs_del_service(svc);
+		break;
+	case IP_VS_SO_SET_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	case IP_VS_SO_SET_ADDDEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IP_VS_SO_SET_EDITDEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IP_VS_SO_SET_DELDEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+  out_unlock:
+	mutex_unlock(&__ip_vs_mutex);
+  out_dec:
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+/*
+ * Fill the legacy service entry @dst from the service @src.  The
+ * timeout is divided by HZ and the statistics are copied with
+ * ip_vs_copy_stats().
+ */
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+	struct ip_vs_scheduler *scheduler =
+		rcu_dereference_protected(src->scheduler, 1);
+
+	/* identity */
+	dst->fwmark	= src->fwmark;
+	dst->protocol	= src->protocol;
+	dst->addr	= src->addr.ip;
+	dst->port	= src->port;
+
+	/* configuration */
+	strlcpy(dst->sched_name, scheduler->name, sizeof(dst->sched_name));
+	dst->netmask	= src->netmask;
+	dst->timeout	= src->timeout / HZ;
+	dst->flags	= src->flags;
+
+	/* state */
+	dst->num_dests	= src->num_dests;
+	ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+/*
+ * Copy at most get->num_services AF_INET services of @net into the
+ * entrytable of the user buffer @uptr.
+ *
+ * The address/port table is walked first, the fwmark table second.
+ * Returns 0 when done or when the entry limit is reached, -EFAULT when
+ * a copy to user space fails.
+ */
+static inline int
+__ip_vs_get_service_entries(struct net *net,
+			    const struct ip_vs_get_services *get,
+			    struct ip_vs_get_services __user *uptr)
+{
+	struct ip_vs_service_entry entry;
+	struct ip_vs_service *svc;
+	int bucket;
+	int copied = 0;
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (svc->af != AF_INET || !net_eq(svc->net, net))
+				continue;
+
+			if (copied >= get->num_services)
+				return 0;
+
+			memset(&entry, 0, sizeof(entry));
+			ip_vs_copy_service(&entry, svc);
+			if (copy_to_user(&uptr->entrytable[copied],
+					 &entry, sizeof(entry)))
+				return -EFAULT;
+			copied++;
+		}
+	}
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (svc->af != AF_INET || !net_eq(svc->net, net))
+				continue;
+
+			if (copied >= get->num_services)
+				return 0;
+
+			memset(&entry, 0, sizeof(entry));
+			ip_vs_copy_service(&entry, svc);
+			if (copy_to_user(&uptr->entrytable[copied],
+					 &entry, sizeof(entry)))
+				return -EFAULT;
+			copied++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Copy at most get->num_dests destinations of the service described
+ * by @get into the entrytable of the user buffer @uptr.
+ *
+ * The service is looked up by fwmark when one is given, otherwise by
+ * protocol, address and port.  Returns 0 on success, -ESRCH when no
+ * such service exists and -EFAULT when a copy to user space fails.
+ */
+static inline int
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
+			 struct ip_vs_get_dests __user *uptr)
+{
+	union nf_inet_addr addr = { .ip = get->addr };
+	struct ip_vs_dest_entry entry;
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	int copied = 0;
+
+	rcu_read_lock();
+	svc = get->fwmark ?
+		__ip_vs_svc_fwm_find(net, AF_INET, get->fwmark) :
+		__ip_vs_service_find(net, AF_INET, get->protocol, &addr,
+				     get->port);
+	rcu_read_unlock();
+
+	if (!svc)
+		return -ESRCH;
+
+	/* cleared once; every iteration overwrites the same fields */
+	memset(&entry, 0, sizeof(entry));
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (copied >= get->num_dests)
+			break;
+
+		entry.addr = dest->addr.ip;
+		entry.port = dest->port;
+		entry.conn_flags = atomic_read(&dest->conn_flags);
+		entry.weight = atomic_read(&dest->weight);
+		entry.u_threshold = dest->u_threshold;
+		entry.l_threshold = dest->l_threshold;
+		entry.activeconns = atomic_read(&dest->activeconns);
+		entry.inactconns = atomic_read(&dest->inactconns);
+		entry.persistconns = atomic_read(&dest->persistconns);
+		ip_vs_copy_stats(&entry.stats, &dest->stats);
+
+		if (copy_to_user(&uptr->entrytable[copied],
+				 &entry, sizeof(entry)))
+			return -EFAULT;
+		copied++;
+	}
+
+	return 0;
+}
+
+/*
+ * Report the current tcp, tcpfin and udp timeouts of @net in @u,
+ * divided by HZ.  Fields of protocols that are not compiled in are
+ * left at zero.
+ */
+static inline void
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+	struct ip_vs_proto_data *proto;
+#endif
+
+	memset(u, 0, sizeof(*u));
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	proto = ip_vs_proto_data_get(net, IPPROTO_TCP);
+	u->tcp_timeout =
+		proto->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+	u->tcp_fin_timeout =
+		proto->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	proto = ip_vs_proto_data_get(net, IPPROTO_UDP);
+	u->udp_timeout = proto->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+/* Index of a IP_VS_SO_GET_* command inside get_arglen[] */
+#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
+#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
+
+/*
+ * Minimum argument length for every get command.  Entries are unsigned
+ * char, so every length listed here has to stay below 256.
+ */
+static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
+	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
+};
+/*
+ * Handler of the legacy getsockopt() interface (installed as .get in
+ * ip_vs_sockopts).
+ *
+ * Checks CAP_NET_ADMIN and the command range, requires *len to be at
+ * least get_arglen[] for @cmd, copies that many bytes of the request
+ * from @user into a local buffer and then answers the command by
+ * writing the result back to @user.
+ *
+ * Returns 0 on success or a negative errno (-EPERM, -EINVAL, -EFAULT,
+ * -ERESTARTSYS, -ESRCH, or the result of the entry-table helpers).
+ * Only IP_VS_SO_GET_VERSION stores a new value in *len.
+ */
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ unsigned char arg[128]; /* local copy of the fixed-size request */
+ int ret = 0;
+ unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ BUG_ON(!net);
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+ return -EINVAL;
+
+ if (*len < get_arglen[GET_CMDID(cmd)]) {
+ pr_err("get_ctl: len %u < %u\n",
+ *len, get_arglen[GET_CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ copylen = get_arglen[GET_CMDID(cmd)];
+ if (copylen > 128) /* 128 is the size of arg[] declared above */
+ return -EINVAL;
+
+ if (copy_from_user(arg, user, copylen) != 0)
+ return -EFAULT;
+ /*
+ * Handle daemons first since it has its own locking
+ * (ipvs->sync_mutex instead of __ip_vs_mutex)
+ */
+ if (cmd == IP_VS_SO_GET_DAEMON) {
+ struct ip_vs_daemon_user d[2]; /* [0] master, [1] backup */
+
+ memset(&d, 0, sizeof(d));
+ if (mutex_lock_interruptible(&ipvs->sync_mutex))
+ return -ERESTARTSYS;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
+ }
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
+ }
+
+ if (mutex_lock_interruptible(&__ip_vs_mutex))
+ return -ERESTARTSYS;
+
+ switch (cmd) {
+ case IP_VS_SO_GET_VERSION:
+ {
+ char buf[64]; /* same size as the get_arglen[] entry for this command */
+
+ sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ *len = strlen(buf)+1; /* report the string length including the NUL */
+ }
+ break;
+
+ case IP_VS_SO_GET_INFO:
+ {
+ struct ip_vs_getinfo info;
+ info.version = IP_VS_VERSION_CODE;
+ info.size = ip_vs_conn_tab_size;
+ info.num_services = ipvs->num_services;
+ if (copy_to_user(user, &info, sizeof(info)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICES:
+ {
+ struct ip_vs_get_services *get;
+ int size; /* NOTE(review): int, fed by user-supplied num_services; wrap-around looks unchecked - confirm */
+
+ get = (struct ip_vs_get_services *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_service_entry) * get->num_services;
+ if (*len != size) { /* the buffer must hold exactly header + num_services entries */
+ pr_err("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_service_entries(net, get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICE:
+ {
+ struct ip_vs_service_entry *entry;
+ struct ip_vs_service *svc;
+ union nf_inet_addr addr;
+
+ entry = (struct ip_vs_service_entry *)arg; /* request and reply share this buffer */
+ addr.ip = entry->addr;
+ rcu_read_lock();
+ if (entry->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
+ else
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
+ rcu_read_unlock();
+ if (svc) {
+ ip_vs_copy_service(entry, svc);
+ if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+ ret = -EFAULT;
+ } else
+ ret = -ESRCH;
+ }
+ break;
+
+ case IP_VS_SO_GET_DESTS:
+ {
+ struct ip_vs_get_dests *get;
+ int size; /* NOTE(review): int, fed by user-supplied num_dests; wrap-around looks unchecked - confirm */
+
+ get = (struct ip_vs_get_dests *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_dest_entry) * get->num_dests;
+ if (*len != size) { /* the buffer must hold exactly header + num_dests entries */
+ pr_err("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_dest_entries(net, get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_TIMEOUT:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+ if (copy_to_user(user, &t, sizeof(t)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&__ip_vs_mutex);
+ return ret;
+}
+
+
+/*
+ * Socket option registration for PF_INET: the set and the get handler
+ * each cover the options from IP_VS_BASE_CTL up to their own maximum
+ * command number.
+ */
+static struct nf_sockopt_ops ip_vs_sockopts = {
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	/* setsockopt() side */
+	.set		= do_ip_vs_set_ctl,
+	.set_optmin	= IP_VS_BASE_CTL,
+	.set_optmax	= IP_VS_SO_SET_MAX+1,
+	/* getsockopt() side */
+	.get		= do_ip_vs_get_ctl,
+	.get_optmin	= IP_VS_BASE_CTL,
+	.get_optmax	= IP_VS_SO_GET_MAX+1,
+};
+
+/*
+ * Generic Netlink interface
+ */
+
+/*
+ * IPVS genetlink family.
+ *
+ * maxattr is the highest first-level attribute number; it has to match
+ * the size of ip_vs_cmd_policy[], which is IPVS_CMD_ATTR_MAX + 1, and
+ * not the number of commands.
+ */
+static struct genl_family ip_vs_genl_family = {
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= IPVS_GENL_NAME,
+	.version	= IPVS_GENL_VERSION,
+	.maxattr	= IPVS_CMD_ATTR_MAX,
+	.netnsok	= true,		/* Make ipvsadm work on netns */
+};
+
+/* Validation policy for the top-level attributes of an IPVS command */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+	/* nested containers */
+	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
+	/* timeouts */
+	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
+};
+
+/* Validation policy for the attributes nested in IPVS_CMD_ATTR_DAEMON */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_MCAST_IFN]	= {
+		.type	= NLA_NUL_STRING,
+		.len	= IP_VS_IFNAME_MAXLEN,
+	},
+	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
+};
+
+/* Validation policy for the attributes nested in IPVS_CMD_ATTR_SERVICE */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_ADDR]		= {
+		.type	= NLA_BINARY,
+		.len	= sizeof(union nf_inet_addr),
+	},
+	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_SCHED_NAME]	= {
+		.type	= NLA_NUL_STRING,
+		.len	= IP_VS_SCHEDNAME_MAXLEN,
+	},
+	[IPVS_SVC_ATTR_PE_NAME]		= {
+		.type	= NLA_NUL_STRING,
+		.len	= IP_VS_PENAME_MAXLEN,
+	},
+	[IPVS_SVC_ATTR_FLAGS]		= {
+		.type	= NLA_BINARY,
+		.len	= sizeof(struct ip_vs_flags),
+	},
+	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/* Validation policy for the attributes nested in IPVS_CMD_ATTR_DEST */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+	[IPVS_DEST_ATTR_ADDR]		= {
+		.type	= NLA_BINARY,
+		.len	= sizeof(union nf_inet_addr),
+	},
+	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/*
+ * Append a nested attribute of type @container_type holding a snapshot
+ * of @stats to @skb.
+ *
+ * Returns 0 on success.  If the nest cannot be started or an attribute
+ * does not fit, the nest is cancelled and -EMSGSIZE is returned.
+ */
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+				 struct ip_vs_stats *stats)
+{
+	struct ip_vs_stats_user snap;
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, container_type);
+	if (!nest)
+		return -EMSGSIZE;
+
+	ip_vs_copy_stats(&snap, stats);
+
+	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, snap.conns) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, snap.inpkts) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, snap.outpkts) ||
+	    nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, snap.inbytes) ||
+	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, snap.outbytes) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, snap.cps) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, snap.inpps) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, snap.outpps) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, snap.inbps) ||
+	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, snap.outbps)) {
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/*
+ * Append a nested IPVS_CMD_ATTR_SERVICE attribute describing @svc to
+ * @skb.
+ *
+ * A service with a fwmark is identified by that mark only; otherwise
+ * protocol, address and port are written.  Returns 0 on success; on
+ * any failure the nest is cancelled and -EMSGSIZE is returned.
+ */
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct ip_vs_flags flags = { .flags = svc->flags,
+				     .mask = ~0 };
+	struct ip_vs_scheduler *sched;
+	struct nlattr *nest;
+	struct ip_vs_pe *pe;
+
+	nest = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
+		goto cancel;
+
+	if (!svc->fwmark) {
+		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
+		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
+		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
+			goto cancel;
+	} else if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) {
+		goto cancel;
+	}
+
+	sched = rcu_dereference_protected(svc->scheduler, 1);
+	pe = rcu_dereference_protected(svc->pe, 1);
+
+	/* the persistence engine name is optional */
+	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
+	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
+	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
+	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
+		goto cancel;
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
+		goto cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Add one IPVS_CMD_NEW_SERVICE multipart message describing @svc to
+ * the dump @skb.
+ *
+ * Returns the result of genlmsg_end() on success; -EMSGSIZE when the
+ * header or the service attributes do not fit (a started message is
+ * cancelled).
+ */
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc,
+				   struct netlink_callback *cb)
+{
+	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				    cb->nlh->nlmsg_seq,
+				    &ip_vs_genl_family, NLM_F_MULTI,
+				    IPVS_CMD_NEW_SERVICE);
+
+	if (!msg_hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_service(skb, svc) >= 0)
+		return genlmsg_end(skb, msg_hdr);
+
+	genlmsg_cancel(skb, msg_hdr);
+	return -EMSGSIZE;
+}
+/*
+ * Dump routine that adds one IPVS_CMD_NEW_SERVICE message per service
+ * of the netns returned by skb_sknet(skb), walking the address/port
+ * table first and the fwmark table second, all under __ip_vs_mutex.
+ *
+ * cb->args[0] holds the number of table entries already passed, so a
+ * later call skips them and resumes where the previous one stopped.
+ * Returns skb->len.
+ */
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int idx = 0, i;
+ int start = cb->args[0]; /* entries handled by earlier calls */
+ struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
+
+ mutex_lock(&__ip_vs_mutex);
+ for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ if (++idx <= start || !net_eq(svc->net, net)) /* idx counts every entry, foreign netns included */
+ continue;
+ if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+ idx--; /* not dumped: revisit this entry on the next call */
+ goto nla_put_failure;
+ }
+ }
+ }
+
+ for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ if (++idx <= start || !net_eq(svc->net, net)) /* same numbering continues across both tables */
+ continue;
+ if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+ idx--; /* not dumped: revisit this entry on the next call */
+ goto nla_put_failure;
+ }
+ }
+ }
+
+nla_put_failure: /* also reached by falling through after a complete walk */
+ mutex_unlock(&__ip_vs_mutex);
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+/*
+ * Parse the nested service attribute @nla into @usvc and look the
+ * service up in @net.
+ *
+ * The address family plus either a fwmark or the protocol/address/port
+ * triple are mandatory.  With @full_entry set, scheduler name, flags,
+ * timeout and netmask are required as well.  *ret_svc receives the
+ * lookup result (may be NULL) once the identifying fields are valid.
+ *
+ * Returns 0 on success, -EINVAL for missing or malformed attributes and
+ * -EAFNOSUPPORT for an unsupported address family.
+ */
+static int ip_vs_genl_parse_service(struct net *net,
+				    struct ip_vs_service_user_kern *usvc,
+				    struct nlattr *nla, int full_entry,
+				    struct ip_vs_service **ret_svc)
+{
+	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+	struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
+		      *nla_netmask;
+	struct ip_vs_service *svc;
+	struct ip_vs_flags flags;
+
+	/* Parse mandatory identifying service fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+		return -EINVAL;
+
+	nla_af		= attrs[IPVS_SVC_ATTR_AF];
+	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];
+	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
+	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
+	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
+
+	if (!nla_af)
+		return -EINVAL;
+	if (!nla_fwmark && !(nla_port && nla_protocol && nla_addr))
+		return -EINVAL;
+
+	memset(usvc, 0, sizeof(*usvc));
+
+	usvc->af = nla_get_u16(nla_af);
+	switch (usvc->af) {
+	case AF_INET:
+#ifdef CONFIG_IP_VS_IPV6
+	case AF_INET6:
+#endif
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	if (!nla_fwmark) {
+		usvc->protocol = nla_get_u16(nla_protocol);
+		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+		usvc->port = nla_get_be16(nla_port);
+		usvc->fwmark = 0;
+	} else {
+		usvc->protocol = IPPROTO_TCP;
+		usvc->fwmark = nla_get_u32(nla_fwmark);
+	}
+
+	rcu_read_lock();
+	if (usvc->fwmark)
+		svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
+	else
+		svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
+					   &usvc->addr, usvc->port);
+	rcu_read_unlock();
+	*ret_svc = svc;
+
+	if (!full_entry)
+		return 0;
+
+	/* A full entry was requested, check for the additional fields */
+	nla_sched	= attrs[IPVS_SVC_ATTR_SCHED_NAME];
+	nla_pe		= attrs[IPVS_SVC_ATTR_PE_NAME];
+	nla_flags	= attrs[IPVS_SVC_ATTR_FLAGS];
+	nla_timeout	= attrs[IPVS_SVC_ATTR_TIMEOUT];
+	nla_netmask	= attrs[IPVS_SVC_ATTR_NETMASK];
+
+	if (!nla_sched || !nla_flags || !nla_timeout || !nla_netmask)
+		return -EINVAL;
+
+	nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+	/* prefill flags from service if it already exists */
+	if (svc)
+		usvc->flags = svc->flags;
+
+	/* set new flags from userland: only bits selected by the mask */
+	usvc->flags = (usvc->flags & ~flags.mask) |
+		      (flags.flags & flags.mask);
+	usvc->sched_name = nla_data(nla_sched);
+	if (nla_pe)
+		usvc->pe_name = nla_data(nla_pe);
+	else
+		usvc->pe_name = NULL;
+	usvc->timeout = nla_get_u32(nla_timeout);
+	usvc->netmask = nla_get_be32(nla_netmask);
+
+	return 0;
+}
+
+/*
+ * Look up the service identified by the nested attribute @nla.
+ *
+ * Returns the service (NULL when none matches) or an ERR_PTR() carrying
+ * the error reported by ip_vs_genl_parse_service().
+ */
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+						     struct nlattr *nla)
+{
+	struct ip_vs_service_user_kern ident;
+	struct ip_vs_service *found;
+	int err = ip_vs_genl_parse_service(net, &ident, nla, 0, &found);
+
+	if (err)
+		return ERR_PTR(err);
+
+	return found;
+}
+
+/*
+ * Append a nested IPVS_CMD_ATTR_DEST attribute describing @dest to
+ * @skb.
+ *
+ * Returns 0 on success; on any failure the nest is cancelled and
+ * -EMSGSIZE is returned.
+ */
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+	struct nlattr *nest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+
+	if (!nest)
+		return -EMSGSIZE;
+
+	/* only the forwarding-method bits of conn_flags are reported */
+	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
+	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+			(atomic_read(&dest->conn_flags) &
+			 IP_VS_CONN_F_FWD_MASK)) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
+			atomic_read(&dest->weight)) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+			atomic_read(&dest->activeconns)) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+			atomic_read(&dest->inactconns)) ||
+	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+			atomic_read(&dest->persistconns)))
+		goto cancel;
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
+		goto cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Add one IPVS_CMD_NEW_DEST multipart message describing @dest to the
+ * dump @skb.
+ *
+ * Returns the result of genlmsg_end() on success; -EMSGSIZE when the
+ * header or the destination attributes do not fit (a started message
+ * is cancelled).
+ */
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+				struct netlink_callback *cb)
+{
+	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				    cb->nlh->nlmsg_seq,
+				    &ip_vs_genl_family, NLM_F_MULTI,
+				    IPVS_CMD_NEW_DEST);
+
+	if (!msg_hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_dest(skb, dest) >= 0)
+		return genlmsg_end(skb, msg_hdr);
+
+	genlmsg_cancel(skb, msg_hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump routine that adds one IPVS_CMD_NEW_DEST message per destination
+ * of the service named in the request, all under __ip_vs_mutex.
+ *
+ * cb->args[0] holds the number of destinations already dumped so that
+ * a later call resumes behind them.  It is left untouched when the
+ * request cannot be parsed or the service is not found.  Returns
+ * skb->len.
+ */
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+	struct net *net = skb_sknet(skb);
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	int start = cb->args[0];
+	int idx = 0;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	/* Try to find the service for which to dump destinations */
+	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
+		goto unlock;
+
+	svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
+	if (IS_ERR(svc) || svc == NULL)
+		goto unlock;
+
+	/* Dump the destinations */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++idx <= start)
+			continue;
+		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+			/* did not fit: retry this one on the next call */
+			idx--;
+			break;
+		}
+	}
+
+	cb->args[0] = idx;
+
+unlock:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+/*
+ * Parse the nested destination attribute @nla into @udest.
+ *
+ * Address and port are always required.  With @full_entry set, the
+ * forwarding method, weight and both thresholds are required as well.
+ * Returns 0 on success, -EINVAL for missing or malformed attributes.
+ */
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
+				 struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+	struct nlattr *nla_addr, *nla_port;
+	struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, *nla_l_thresh;
+
+	/* Parse mandatory identifying destination fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+		return -EINVAL;
+
+	nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
+	nla_port = attrs[IPVS_DEST_ATTR_PORT];
+	if (!nla_addr || !nla_port)
+		return -EINVAL;
+
+	memset(udest, 0, sizeof(*udest));
+
+	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+	udest->port = nla_get_be16(nla_port);
+
+	if (!full_entry)
+		return 0;
+
+	/* A full entry was requested, check for the additional fields */
+	nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
+	nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
+	nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
+	nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
+
+	if (!nla_fwd || !nla_weight || !nla_u_thresh || !nla_l_thresh)
+		return -EINVAL;
+
+	udest->conn_flags = nla_get_u32(nla_fwd) & IP_VS_CONN_F_FWD_MASK;
+	udest->weight = nla_get_u32(nla_weight);
+	udest->u_threshold = nla_get_u32(nla_u_thresh);
+	udest->l_threshold = nla_get_u32(nla_l_thresh);
+
+	return 0;
+}
+
+/*
+ * Append a nested IPVS_CMD_ATTR_DAEMON attribute carrying @state,
+ * @mcast_ifn and @syncid to @skb.
+ *
+ * Returns 0 on success; on any failure the nest is cancelled and
+ * -EMSGSIZE is returned.
+ */
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+				  const char *mcast_ifn, __u32 syncid)
+{
+	struct nlattr *nest = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
+	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
+	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) {
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/*
+ * Add one IPVS_CMD_NEW_DAEMON multipart message describing a sync
+ * daemon to the dump @skb.
+ *
+ * Returns the result of genlmsg_end() on success; -EMSGSIZE when the
+ * header or the daemon attributes do not fit (a started message is
+ * cancelled).
+ */
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+				  const char *mcast_ifn, __u32 syncid,
+				  struct netlink_callback *cb)
+{
+	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				    cb->nlh->nlmsg_seq,
+				    &ip_vs_genl_family, NLM_F_MULTI,
+				    IPVS_CMD_NEW_DAEMON);
+
+	if (!msg_hdr)
+		return -EMSGSIZE;
+
+	if (!ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+		return genlmsg_end(skb, msg_hdr);
+
+	genlmsg_cancel(skb, msg_hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump routine reporting the master and/or backup sync daemon of the
+ * netns, under ipvs->sync_mutex.
+ *
+ * cb->args[0] and cb->args[1] are set once the master respectively the
+ * backup daemon has been written, so neither is reported twice.
+ * Returns skb->len.
+ */
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct net *net = skb_sknet(skb);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	mutex_lock(&ipvs->sync_mutex);
+
+	/* master daemon */
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+					   ipvs->master_mcast_ifn,
+					   ipvs->master_syncid, cb) < 0)
+			goto unlock;
+
+		cb->args[0] = 1;
+	}
+
+	/* backup daemon */
+	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+					   ipvs->backup_mcast_ifn,
+					   ipvs->backup_syncid, cb) < 0)
+			goto unlock;
+
+		cb->args[1] = 1;
+	}
+
+unlock:
+	mutex_unlock(&ipvs->sync_mutex);
+
+	return skb->len;
+}
+
+/*
+ * Start a sync daemon from the parsed daemon attributes @attrs.
+ *
+ * State, multicast interface name and sync id are all required.
+ * Returns -EINVAL when one is missing, otherwise the result of
+ * start_sync_thread().
+ */
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
+{
+	struct nlattr *state = attrs[IPVS_DAEMON_ATTR_STATE];
+	struct nlattr *ifn = attrs[IPVS_DAEMON_ATTR_MCAST_IFN];
+	struct nlattr *syncid = attrs[IPVS_DAEMON_ATTR_SYNC_ID];
+
+	if (!state || !ifn || !syncid)
+		return -EINVAL;
+
+	return start_sync_thread(net, nla_get_u32(state), nla_data(ifn),
+				 nla_get_u32(syncid));
+}
+
+/*
+ * Stop the sync daemon whose state is given in the parsed daemon
+ * attributes @attrs.  Returns -EINVAL when the state attribute is
+ * missing, otherwise the result of stop_sync_thread().
+ */
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
+{
+	struct nlattr *state = attrs[IPVS_DAEMON_ATTR_STATE];
+
+	if (!state)
+		return -EINVAL;
+
+	return stop_sync_thread(net, nla_get_u32(state));
+}
+
+/*
+ * Apply the timeout attributes found in @attrs.
+ *
+ * The current timeouts are read first and only those present in the
+ * message are replaced before the set is handed to ip_vs_set_timeout(),
+ * whose result is returned.
+ */
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
+{
+	struct nlattr *tcp = attrs[IPVS_CMD_ATTR_TIMEOUT_TCP];
+	struct nlattr *tcp_fin = attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN];
+	struct nlattr *udp = attrs[IPVS_CMD_ATTR_TIMEOUT_UDP];
+	struct ip_vs_timeout_user t;
+
+	__ip_vs_get_timeouts(net, &t);
+
+	if (tcp)
+		t.tcp_timeout = nla_get_u32(tcp);
+	if (tcp_fin)
+		t.tcp_fin_timeout = nla_get_u32(tcp_fin);
+	if (udp)
+		t.udp_timeout = nla_get_u32(udp);
+
+	return ip_vs_set_timeout(net, &t);
+}
+
+/*
+ * Generic Netlink doit handler for IPVS_CMD_NEW_DAEMON and
+ * IPVS_CMD_DEL_DAEMON.
+ *
+ * The nested IPVS_CMD_ATTR_DAEMON attribute is parsed and the request is
+ * dispatched while holding ipvs->sync_mutex.  Returns -EINVAL when the
+ * nested attribute is missing or does not parse, otherwise the result of
+ * the new/del helper.  Any other command is ignored and yields 0.
+ */
+static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+	struct net *net = skb_sknet(skb);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	int cmd = info->genlhdr->cmd;
+	int ret;
+
+	if (cmd != IPVS_CMD_NEW_DAEMON && cmd != IPVS_CMD_DEL_DAEMON)
+		return 0;
+
+	mutex_lock(&ipvs->sync_mutex);
+	if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+	    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+			     info->attrs[IPVS_CMD_ATTR_DAEMON],
+			     ip_vs_daemon_policy))
+		ret = -EINVAL;
+	else if (cmd == IPVS_CMD_NEW_DAEMON)
+		ret = ip_vs_genl_new_daemon(net, daemon_attrs);
+	else
+		ret = ip_vs_genl_del_daemon(net, daemon_attrs);
+	mutex_unlock(&ipvs->sync_mutex);
+
+	return ret;
+}
+/*
+ * Generic Netlink doit handler for the commands that modify state:
+ * FLUSH, SET_CONFIG, ZERO and the NEW/SET/DEL variants of SERVICE and DEST.
+ *
+ * The whole request is processed under __ip_vs_mutex.  FLUSH, SET_CONFIG
+ * and a ZERO without a service attribute are handled first and return
+ * early.  Every other command needs a service attribute; NEW_SERVICE and
+ * SET_SERVICE ask the parser for a full service specification, and the
+ * NEW_DEST/SET_DEST commands ask for a full destination specification.
+ *
+ * Returns 0 or a negative errno: the error of the parser or of the called
+ * ip_vs_* operation, -ESRCH when a command other than NEW_SERVICE names a
+ * service that was not found, -EEXIST when NEW_SERVICE names an existing
+ * service, and -EINVAL for a command not handled by the switch.
+ */
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ip_vs_service *svc = NULL;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_dest_user_kern udest;
+ int ret = 0, cmd;
+ int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ if (cmd == IPVS_CMD_FLUSH) {
+ ret = ip_vs_flush(net, false);
+ goto out;
+ } else if (cmd == IPVS_CMD_SET_CONFIG) {
+ ret = ip_vs_genl_set_config(net, info->attrs);
+ goto out;
+ } else if (cmd == IPVS_CMD_ZERO &&
+ !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+ ret = ip_vs_zero_all(net);
+ goto out;
+ }
+
+ /* All following commands require a service argument, so check if we
+ * received a valid one. We need a full service specification when
+ * adding / editing a service. Only identifying members otherwise. */
+ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+ need_full_svc = 1;
+
+ ret = ip_vs_genl_parse_service(net, &usvc,
+ info->attrs[IPVS_CMD_ATTR_SERVICE],
+ need_full_svc, &svc);
+ if (ret)
+ goto out;
+
+ /* Unless we're adding a new service, the service must already exist */
+ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ /* Destination commands require a valid destination argument. For
+ * adding / editing a destination, we need a full destination
+ * specification. */
+ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+ cmd == IPVS_CMD_DEL_DEST) {
+ if (cmd != IPVS_CMD_DEL_DEST)
+ need_full_dest = 1;
+
+ ret = ip_vs_genl_parse_dest(&udest,
+ info->attrs[IPVS_CMD_ATTR_DEST],
+ need_full_dest);
+ if (ret)
+ goto out;
+ }
+
+ switch (cmd) {
+ case IPVS_CMD_NEW_SERVICE:
+ if (svc == NULL)
+ ret = ip_vs_add_service(net, &usvc, &svc);
+ else
+ ret = -EEXIST;
+ break;
+ case IPVS_CMD_SET_SERVICE:
+ ret = ip_vs_edit_service(svc, &usvc);
+ break;
+ case IPVS_CMD_DEL_SERVICE:
+ ret = ip_vs_del_service(svc);
+ /* do not use svc, it can be freed */
+ break;
+ case IPVS_CMD_NEW_DEST:
+ ret = ip_vs_add_dest(svc, &udest);
+ break;
+ case IPVS_CMD_SET_DEST:
+ ret = ip_vs_edit_dest(svc, &udest);
+ break;
+ case IPVS_CMD_DEL_DEST:
+ ret = ip_vs_del_dest(svc, &udest);
+ break;
+ case IPVS_CMD_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return ret;
+}
+/*
+ * Generic Netlink doit handler for the single-reply queries GET_SERVICE,
+ * GET_INFO and GET_CONFIG.
+ *
+ * A reply message is allocated and filled under __ip_vs_mutex; its command
+ * is NEW_SERVICE, SET_INFO or SET_CONFIG respectively.  On success the
+ * message is handed to genlmsg_reply() and that call's result is returned.
+ *
+ * Errors: -EINVAL for any other command, -ENOMEM when the reply cannot be
+ * allocated, -EMSGSIZE when the header or an attribute does not fit,
+ * -ESRCH when the requested service does not exist, or the error encoded
+ * in the pointer returned by ip_vs_genl_find_service().  On every error
+ * path after allocation the reply message is freed here.
+ */
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *reply;
+ int ret, cmd, reply_cmd;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ if (cmd == IPVS_CMD_GET_SERVICE)
+ reply_cmd = IPVS_CMD_NEW_SERVICE;
+ else if (cmd == IPVS_CMD_GET_INFO)
+ reply_cmd = IPVS_CMD_SET_INFO;
+ else if (cmd == IPVS_CMD_GET_CONFIG)
+ reply_cmd = IPVS_CMD_SET_CONFIG;
+ else {
+ pr_err("unknown Generic Netlink command\n");
+ return -EINVAL;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+ if (reply == NULL)
+ goto nla_put_failure;
+
+ switch (cmd) {
+ case IPVS_CMD_GET_SERVICE:
+ {
+ struct ip_vs_service *svc;
+
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ if (IS_ERR(svc)) {
+ ret = PTR_ERR(svc);
+ goto out_err;
+ } else if (svc) {
+ ret = ip_vs_genl_fill_service(msg, svc);
+ if (ret)
+ goto nla_put_failure;
+ } else {
+ ret = -ESRCH;
+ goto out_err;
+ }
+
+ break;
+ }
+
+ case IPVS_CMD_GET_CONFIG:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
+ t.tcp_timeout) ||
+ nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+ t.tcp_fin_timeout))
+ goto nla_put_failure;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
+ goto nla_put_failure;
+#endif
+
+ break;
+ }
+
+ case IPVS_CMD_GET_INFO:
+ if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
+ IP_VS_VERSION_CODE) ||
+ nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+ ip_vs_conn_tab_size))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ goto out; /* msg is not freed here on this path */
+
+nla_put_failure:
+ pr_err("not enough space in Netlink message\n");
+ ret = -EMSGSIZE; /* overrides any error set before the jump */
+
+out_err:
+ nlmsg_free(msg);
+out:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return ret;
+}
+
+/*
+ * Generic Netlink command table of the IPVS family.
+ *
+ * Every command requires GENL_ADMIN_PERM.  State-changing service and
+ * destination commands, SET_CONFIG, ZERO and FLUSH go to
+ * ip_vs_genl_set_cmd(); the daemon start/stop commands go to
+ * ip_vs_genl_set_daemon(); single-reply queries go to ip_vs_genl_get_cmd().
+ * GET_SERVICE offers both a doit and a dumpit handler, while GET_DEST and
+ * GET_DAEMON are dump-only.  GET_DAEMON, GET_CONFIG, GET_INFO and FLUSH
+ * carry no attribute policy.
+ */
+static const struct genl_ops ip_vs_genl_ops[] = {
+ {
+ .cmd = IPVS_CMD_NEW_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_SET_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ .dumpit = ip_vs_genl_dump_services,
+ .policy = ip_vs_cmd_policy,
+ },
+ {
+ .cmd = IPVS_CMD_NEW_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_SET_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .dumpit = ip_vs_genl_dump_dests,
+ },
+ {
+ .cmd = IPVS_CMD_NEW_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_daemon,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_daemon,
+ },
+ {
+ .cmd = IPVS_CMD_GET_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .dumpit = ip_vs_genl_dump_daemons,
+ },
+ {
+ .cmd = IPVS_CMD_SET_CONFIG,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_CONFIG,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_INFO,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_ZERO,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_FLUSH,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_set_cmd,
+ },
+};
+/*
+ * Register the IPVS Generic Netlink family together with ip_vs_genl_ops.
+ * Returns the result of genl_register_family_with_ops().
+ */
+static int __init ip_vs_genl_register(void)
+{
+ return genl_register_family_with_ops(&ip_vs_genl_family,
+ ip_vs_genl_ops);
+}
+/*
+ * Unregister the IPVS Generic Netlink family; any result of the
+ * unregistration call is not checked.
+ */
+static void ip_vs_genl_unregister(void)
+{
+ genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+/*
+ * per netns init/exit func.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+{ /*
+   * Per-netns sysctl setup.  init_net uses the vs_vars table directly,
+   * every other netns gets its own kmemdup() copy.  Each table entry's
+   * .data pointer is then redirected to the matching field of this
+   * netns' struct netns_ipvs, and some fields receive their defaults.
+   * NOTE(review): the idx++ sequence presumably has to follow the entry
+   * order of vs_vars, which is defined outside this chunk -- confirm
+   * before reordering anything here.
+   *
+   * After registering the table under "net/ipv4/vs" the estimator for
+   * tot_stats is started and the defense work is scheduled.
+   *
+   * Returns 0 on success, -ENOMEM when the table copy or the sysctl
+   * registration fails (the copy is freed again in the latter case).
+   */
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ ipvs->sysctl_sync_ports = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ports;
+ tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+ ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+ ipvs->sysctl_sync_sock_size = 0;
+ tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+ ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold; /* same entry as next line */
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+ tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+ ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+ tbl[idx++].data = &ipvs->sysctl_sync_retries;
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+ ipvs->sysctl_pmtu_disc = 1;
+ tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+ tbl[idx++].data = &ipvs->sysctl_backup_only;
+
+
+ ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return -ENOMEM;
+ }
+ ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+
+ return 0;
+}
+
+/*
+ * Per-netns teardown of what ip_vs_control_net_init_sysctl() set up:
+ * stop the defense work, unregister the sysctl table, stop the estimator
+ * for tot_stats and release the per-netns copy of the sysctl table.
+ */
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	cancel_delayed_work_sync(&ipvs->defense_work);
+	cancel_work_sync(&ipvs->defense_work.work);
+	unregister_net_sysctl_table(ipvs->sysctl_hdr);
+	ip_vs_stop_estimator(net, &ipvs->tot_stats);
+
+	/*
+	 * init_net registered the static vs_vars table; every other netns
+	 * owns a kmemdup() copy that must be freed once it is unregistered,
+	 * otherwise it leaks on each netns exit.
+	 */
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->sysctl_tbl);
+}
+
+#else
+/*
+ * Built without CONFIG_SYSCTL: there is nothing to set up or tear down,
+ * so the init stub reports success and the cleanup stub does nothing.
+ */
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+
+#endif
+/*
+ * Netdevice notifier routing events to ip_vs_dst_event(); registered in
+ * ip_vs_control_init() and unregistered in ip_vs_control_cleanup().
+ */
+static struct notifier_block ip_vs_dst_notifier = {
+ .notifier_call = ip_vs_dst_event,
+};
+
+/*
+ * Per-netns initialisation of the IPVS control state: real-server hash
+ * table, destination trash list and timer, service counters, per-CPU total
+ * statistics, the three procfs entries and finally the sysctl state.
+ *
+ * Returns 0 on success or -ENOMEM when the per-CPU statistics cannot be
+ * allocated or the sysctl initialisation fails.  On the latter failure
+ * everything created here is undone again, including the procfs entries,
+ * so that no entry stays behind after the statistics have been freed.
+ */
+int __net_init ip_vs_control_net_init(struct net *net)
+{
+	int i, idx;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	/* Initialize rs_table */
+	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
+
+	INIT_LIST_HEAD(&ipvs->dest_trash);
+	spin_lock_init(&ipvs->dest_trash_lock);
+	setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+		    (unsigned long) net);
+	atomic_set(&ipvs->ftpsvc_counter, 0);
+	atomic_set(&ipvs->nullsvc_counter, 0);
+
+	/* procfs stats */
+	ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+	if (!ipvs->tot_stats.cpustats)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		struct ip_vs_cpu_stats *ipvs_tot_stats;
+		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
+		u64_stats_init(&ipvs_tot_stats->syncp);
+	}
+
+	spin_lock_init(&ipvs->tot_stats.lock);
+
+	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
+	proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
+	proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+		    &ip_vs_stats_percpu_fops);
+
+	if (ip_vs_control_net_init_sysctl(net))
+		goto err;
+
+	return 0;
+
+err:
+	/*
+	 * Mirror ip_vs_control_net_cleanup(): take the proc entries down
+	 * before the per-CPU statistics go away.
+	 */
+	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
+	remove_proc_entry("ip_vs_stats", net->proc_net);
+	remove_proc_entry("ip_vs", net->proc_net);
+	free_percpu(ipvs->tot_stats.cpustats);
+	return -ENOMEM;
+}
+/*
+ * Per-netns teardown of the control state: empty the destination trash,
+ * clean up the sysctl state, remove the three procfs entries and only then
+ * free the per-CPU total statistics.
+ */
+void __net_exit ip_vs_control_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_control_net_cleanup_sysctl(net);
+ remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
+ remove_proc_entry("ip_vs_stats", net->proc_net);
+ remove_proc_entry("ip_vs", net->proc_net);
+ free_percpu(ipvs->tot_stats.cpustats);
+}
+
+/*
+ * Register the two user-space control interfaces: the sockopt interface
+ * first, then the Generic Netlink family.  When the netlink registration
+ * fails the sockopt registration is rolled back.
+ *
+ * Returns 0 on success or the error of the registration that failed.
+ */
+int __init ip_vs_register_nl_ioctl(void)
+{
+	int err = nf_register_sockopt(&ip_vs_sockopts);
+
+	if (err) {
+		pr_err("cannot register sockopt.\n");
+		return err;
+	}
+
+	err = ip_vs_genl_register();
+	if (err) {
+		pr_err("cannot register Generic Netlink interface.\n");
+		nf_unregister_sockopt(&ip_vs_sockopts);
+		return err;
+	}
+
+	return 0;
+}
+/*
+ * Undo ip_vs_register_nl_ioctl() in reverse order: Generic Netlink family
+ * first, then the sockopt interface.
+ */
+void ip_vs_unregister_nl_ioctl(void)
+{
+ ip_vs_genl_unregister();
+ nf_unregister_sockopt(&ip_vs_sockopts);
+}
+
+/*
+ * Module-wide (not per-netns) initialisation of the control code: set up
+ * the two global service hash tables and register the netdevice notifier.
+ *
+ * Returns 0 on success or the negative error of
+ * register_netdevice_notifier().
+ */
+int __init ip_vs_control_init(void)
+{
+	int bucket;
+	int err;
+
+	EnterFunction(2);
+
+	/* Initialize svc_table, ip_vs_svc_fwm_table */
+	bucket = 0;
+	while (bucket < IP_VS_SVC_TAB_SIZE) {
+		INIT_HLIST_HEAD(&ip_vs_svc_table[bucket]);
+		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[bucket]);
+		bucket++;
+	}
+
+	smp_wmb();	/* Do we really need it now ? */
+
+	err = register_netdevice_notifier(&ip_vs_dst_notifier);
+	if (err >= 0) {
+		LeaveFunction(2);
+		err = 0;
+	}
+
+	return err;
+}
+
+/*
+ * Module-wide cleanup: unregister the netdevice notifier that
+ * ip_vs_control_init() registered.
+ */
+void ip_vs_control_cleanup(void)
+{
+ EnterFunction(2);
+ unregister_netdevice_notifier(&ip_vs_dst_notifier);
+ LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
new file mode 100644
index 00000000000..c3b84546ea9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,276 @@
+/*
+ * IPVS: Destination Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * Inspired by the consistent hashing scheduler patch from
+ * Thomas Proell <proellt@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[dest_ip];
+ * if (n is dead) OR
+ * (n is overloaded) OR (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS DH bucket: one slot of the per-service destination hash table,
+ * holding an RCU-protected pointer to the destination assigned to it.
+ */
+struct ip_vs_dh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * Geometry of the IPVS DH entry hash table: 2^IP_VS_DH_TAB_BITS buckets
+ * (8 bits, i.e. 256 buckets, unless CONFIG_IP_VS_DH_TAB_BITS is already
+ * defined).  IP_VS_DH_TAB_MASK reduces a hash value to a bucket index.
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS 8
+#endif
+#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+/*
+ * Per-service scheduler data of the DH scheduler: the bucket array plus
+ * the rcu_head used by kfree_rcu() when the table is released.
+ */
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
+
+/*
+ * Map an address to a bucket index of the DH table.
+ *
+ * The IPv4 address is used as is; with CONFIG_IP_VS_IPV6 an AF_INET6
+ * address is first folded by XOR-ing its four 32-bit words.  The value is
+ * converted to host order, multiplied by 2654435761 and masked down to the
+ * table size.
+ */
+static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 folded;
+	unsigned long product;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	product = ntohl(folded) * 2654435761UL;
+	return product & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ * Get the ip_vs_dest stored in the bucket that the given address hashes
+ * to, or NULL if that bucket is empty.  The pointer is fetched with
+ * rcu_dereference().
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
+{
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ *
+ * For every bucket the reference on the previously stored destination (if
+ * any) is dropped first.  If the service has no destinations the bucket is
+ * cleared; otherwise the destinations of svc->destinations are handed out
+ * in list order, wrapping around (and skipping the list head) when the end
+ * of the list is reached, and a reference is taken on each one stored.
+ *
+ * Always returns 0.
+ */
+static int
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+ bool empty;
+
+ b = &s->buckets[0];
+ p = &svc->destinations;
+ empty = list_empty(p);
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
+ if (p == &svc->destinations) /* list head is not an entry */
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Empty the specified table: for every bucket that still points at a
+ * destination, drop the reference held on it and clear the bucket.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
+{
+	int slot;
+
+	for (slot = 0; slot < IP_VS_DH_TAB_SIZE; slot++) {
+		struct ip_vs_dh_bucket *bucket = &s->buckets[slot];
+		struct ip_vs_dest *held;
+
+		held = rcu_dereference_protected(bucket->dest, 1);
+		if (!held)
+			continue;
+
+		ip_vs_dest_put(held);
+		RCU_INIT_POINTER(bucket->dest, NULL);
+	}
+}
+
+
+/*
+ * Scheduler init_service callback: allocate a zeroed DH table, attach it
+ * to svc->sched_data and fill its buckets from the destinations the
+ * service currently has.
+ *
+ * Returns 0 on success or -ENOMEM when the table cannot be allocated.
+ */
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_state *table = kzalloc(sizeof(*table), GFP_KERNEL);
+
+	if (!table)
+		return -ENOMEM;
+
+	svc->sched_data = table;
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	/* hand the service's current destinations out to the buckets */
+	ip_vs_dh_reassign(table, svc);
+
+	return 0;
+}
+
+/*
+ * Scheduler done_service callback: drop the destination references held
+ * by the buckets, then release the table through kfree_rcu().
+ */
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_state *s = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(s);
+
+ /* release the table itself */
+ kfree_rcu(s, rcu_head);
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+}
+
+/*
+ * Scheduler add_dest/del_dest callback: rebuild the whole bucket
+ * assignment from the service's destination list.  The dest argument is
+ * not used.  Always returns 0.
+ */
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_dh_state *s = svc->sched_data;
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_reassign(s, svc);
+
+ return 0;
+}
+
+
+/*
+ * A destination counts as overloaded when IP_VS_DEST_F_OVERLOAD is set in
+ * its flags.  Returns the masked flag value, i.e. non-zero when
+ * overloaded and 0 otherwise.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Destination hashing scheduling.
+ *
+ * Looks up the destination stored in the bucket selected by the packet's
+ * destination address.  It is returned only if it exists, is flagged
+ * IP_VS_DEST_F_AVAILABLE, has a positive weight and is not overloaded;
+ * otherwise a scheduler error is reported and NULL is returned.
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_dh_state *table;
+	struct ip_vs_dest *candidate;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	table = (struct ip_vs_dh_state *) svc->sched_data;
+	candidate = ip_vs_dh_get(svc->af, table, &iph->daddr);
+
+	if (candidate
+	    && (candidate->flags & IP_VS_DEST_F_AVAILABLE)
+	    && atomic_read(&candidate->weight) > 0
+	    && !is_overloaded(candidate)) {
+		IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
+			      IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+			      IP_VS_DBG_ADDR(svc->af, &candidate->addr),
+			      ntohs(candidate->port));
+		return candidate;
+	}
+
+	ip_vs_scheduler_err(svc, "no destination available");
+	return NULL;
+}
+
+
+/*
+ * IPVS DH Scheduler structure: registers the scheduler under the name
+ * "dh".  Adding and deleting a destination share one callback, which
+ * rebuilds the bucket assignment.
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+ .name = "dh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+ .init_service = ip_vs_dh_init_svc,
+ .done_service = ip_vs_dh_done_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
+ .schedule = ip_vs_dh_schedule,
+};
+
+/*
+ * Module init: register the DH scheduler and return the result of
+ * register_ip_vs_scheduler().
+ */
+static int __init ip_vs_dh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+/*
+ * Module exit: unregister the DH scheduler, then wait for an RCU grace
+ * period with synchronize_rcu() before the module text goes away.
+ */
+static void __exit ip_vs_dh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
new file mode 100644
index 00000000000..1425e9a924c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,211 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats() does the per cpu summing.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+ This code is to estimate rate in a shorter interval (such as 8
+ seconds) for virtual services and real servers. To measure the rate over
+ a long interval, it is easy to implement a user level daemon which
+ periodically reads those statistical counters and measures the rate.
+
+ Currently, the measurement is activated by slow timer handler. Hope
+ this measurement will not introduce too much load.
+
+ We measure rate during the last 8 seconds every 2 seconds:
+
+ avgrate = avgrate*(1-W) + rate*W
+
+ where W = 2^(-2)
+
+ NOTES.
+
+ * The stored value for average bps is scaled by 2^5, so that maximal
+ rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
+
+ * A lot of code is taken from net/sched/estimator.c
+ */
+
+
+/*
+ * Make a summary from each cpu
+ *
+ * Sums the per-CPU counters of @stats into @sum.  The first CPU visited
+ * overwrites the fields of @sum, all following CPUs are added on top, so
+ * @sum needs no prior zeroing.  The byte counters are read inside a
+ * u64_stats_fetch_begin()/u64_stats_fetch_retry() loop; the connection and
+ * packet counters are read outside of it.
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+ struct ip_vs_cpu_stats __percpu *stats)
+{
+ int i;
+ bool add = false;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+ if (add) {
+ sum->conns += s->ustats.conns;
+ sum->inpkts += s->ustats.inpkts;
+ sum->outpkts += s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ inbytes = s->ustats.inbytes;
+ outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ sum->inbytes += inbytes; /* add only the consistent snapshot */
+ sum->outbytes += outbytes;
+ } else {
+ add = true;
+ sum->conns = s->ustats.conns;
+ sum->inpkts = s->ustats.inpkts;
+ sum->outpkts = s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ sum->inbytes = s->ustats.inbytes;
+ sum->outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ }
+ }
+}
+
+/*
+ * Per-netns estimator timer callback; @arg is the struct net pointer that
+ * was passed to setup_timer().
+ *
+ * For every estimator on ipvs->est_list the owning stats' user counters
+ * are refreshed from the per-CPU counters and the five rates are updated
+ * as avg += (rate - avg) >> 2, i.e. with the weight W = 2^(-2) described
+ * at the top of this file.  The list is walked under ipvs->est_lock and
+ * each entry is updated under its stats lock.  The timer re-arms itself
+ * to fire again 2*HZ later.
+ */
+static void estimation_timer(unsigned long arg)
+{
+ struct ip_vs_estimator *e;
+ struct ip_vs_stats *s;
+ u32 n_conns;
+ u32 n_inpkts, n_outpkts;
+ u64 n_inbytes, n_outbytes;
+ u32 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_ipvs(net);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
+ s = container_of(e, struct ip_vs_stats, est);
+
+ spin_lock(&s->lock);
+ ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
+ n_conns = s->ustats.conns;
+ n_inpkts = s->ustats.inpkts;
+ n_outpkts = s->ustats.outpkts;
+ n_inbytes = s->ustats.inbytes;
+ n_outbytes = s->ustats.outbytes;
+
+ /* scaled by 2^10, but divided 2 seconds */
+ rate = (n_conns - e->last_conns) << 9;
+ e->last_conns = n_conns;
+ e->cps += ((long)rate - (long)e->cps) >> 2;
+
+ rate = (n_inpkts - e->last_inpkts) << 9;
+ e->last_inpkts = n_inpkts;
+ e->inpps += ((long)rate - (long)e->inpps) >> 2;
+
+ rate = (n_outpkts - e->last_outpkts) << 9;
+ e->last_outpkts = n_outpkts;
+ e->outpps += ((long)rate - (long)e->outpps) >> 2;
+
+ rate = (n_inbytes - e->last_inbytes) << 4; /* scaled by 2^5, divided 2 seconds */
+ e->last_inbytes = n_inbytes;
+ e->inbps += ((long)rate - (long)e->inbps) >> 2;
+
+ rate = (n_outbytes - e->last_outbytes) << 4;
+ e->last_outbytes = n_outbytes;
+ e->outbps += ((long)rate - (long)e->outbps) >> 2;
+ spin_unlock(&s->lock);
+ }
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
+}
+/*
+ * Put the estimator embedded in @stats on this netns' estimator list so
+ * that estimation_timer() starts updating its rates.  The list is modified
+ * under ipvs->est_lock with bottom halves disabled.
+ */
+void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_estimator *est = &stats->est;
+
+ INIT_LIST_HEAD(&est->list);
+
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
+}
+/*
+ * Take the estimator embedded in @stats off this netns' estimator list.
+ * The list is modified under ipvs->est_lock with bottom halves disabled.
+ */
+void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_estimator *est = &stats->est;
+
+ spin_lock_bh(&ipvs->est_lock);
+ list_del(&est->list);
+ spin_unlock_bh(&ipvs->est_lock);
+}
+
+/*
+ * Reset the estimator of @stats: all averaged rates drop to zero and the
+ * "last seen" counters are set to the current user counters, so the next
+ * timer run computes its deltas from this point on.
+ *
+ * The caller must hold the stats->lock lock.
+ */
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_stats_user *cur = &stats->ustats;
+	struct ip_vs_estimator *e = &stats->est;
+
+	/* forget the averaged rates */
+	e->cps = 0;
+	e->inpps = 0;
+	e->outpps = 0;
+	e->inbps = 0;
+	e->outbps = 0;
+
+	/* restart delta computation from the current counters */
+	e->last_conns = cur->conns;
+	e->last_inpkts = cur->inpkts;
+	e->last_outpkts = cur->outpkts;
+	e->last_inbytes = cur->inbytes;
+	e->last_outbytes = cur->outbytes;
+}
+
+/*
+ * Get decoded rates: convert the scaled averages kept by the estimator of
+ * @stats into plain per-second values in @dst.  Connection and packet
+ * rates are shifted down by 10 bits after adding 0x1FF, byte rates by
+ * 5 bits after adding 0xF.
+ */
+void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
+			  struct ip_vs_stats *stats)
+{
+	const struct ip_vs_estimator *est = &stats->est;
+
+	/* byte rates */
+	dst->inbps = (est->inbps + 0xF) >> 5;
+	dst->outbps = (est->outbps + 0xF) >> 5;
+
+	/* connection and packet rates */
+	dst->cps = (est->cps + 0x1FF) >> 10;
+	dst->inpps = (est->inpps + 0x1FF) >> 10;
+	dst->outpps = (est->outpps + 0x1FF) >> 10;
+}
+/*
+ * Per-netns estimator setup: initialise the estimator list and its lock,
+ * bind estimation_timer() to this netns and arm it to fire for the first
+ * time after 2*HZ.  Always returns 0.
+ */
+int __net_init ip_vs_estimator_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+ return 0;
+}
+/*
+ * Per-netns estimator teardown: stop the estimation timer with
+ * del_timer_sync().
+ */
+void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
+{
+ del_timer_sync(&net_ipvs(net)->est_timer);
+}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
new file mode 100644
index 00000000000..77c173282f3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -0,0 +1,501 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ * IP_MASQ_FTP ftp masquerading module
+ *
+ * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
+ *
+ * Author: Wouter Gadeyne
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/gfp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 "
+#define CLIENT_STRING "PORT"
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
+ * First port is set to the default port.
+ *
+ * ports_count is handed to module_param_array() as the element-count
+ * variable and bounds the registration loop in __ip_vs_ftp_init();
+ * entries equal to 0 are skipped there.
+ */
+static unsigned int ports_count = 1;
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, &ports_count, 0444);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/*
+ * Dummy variable: its value is never used. Its address is stored in
+ * cp->app_data by ip_vs_ftp_in() when a PASV command is seen, and
+ * ip_vs_ftp_out() compares cp->app_data against it.
+ */
+static int ip_vs_ftp_pasv;
+
+
+/*
+ * init_conn hook of the ftp helper: set IP_VS_CONN_F_NFCT on @cp.
+ * @app is not used. Always returns 0.
+ */
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	/* We use connection tracking for the command connection */
+	cp->flags = cp->flags | IP_VS_CONN_F_NFCT;
+	return 0;
+}
+
+
+/*
+ * done_conn hook of the ftp helper: nothing to undo, both arguments
+ * are unused. Always returns 0.
+ */
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", started
+ * with the "pattern", ignoring before "skip" and terminated with
+ * the "term" character.
+ * <addr,port> is in network order.
+ *
+ * @data, @data_limit: first byte of the payload and one past its last byte
+ * @pattern, @plen:    prefix the payload must start with (compared
+ *                     case-insensitively) and its length
+ * @skip:              if non-zero, advance past everything up to and
+ *                     including the first run of this character
+ * @term:              character that ends the six-field list
+ * @addr, @port:       outputs, written only on success
+ * @start:             output, first byte of the field list (success only)
+ * @end:               output, position of @term; written as soon as @term
+ *                     is found, even if parsing fails afterwards
+ *
+ * Returns 1 on success, 0 if the payload does not start with @pattern,
+ * and -1 if only a partial pattern is present, @skip or @term is not
+ * found before @data_limit, or the field list is malformed (unexpected
+ * character, not exactly six fields, or a field larger than 255).
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+				  const char *pattern, size_t plen,
+				  char skip, char term,
+				  __be32 *addr, __be16 *port,
+				  char **start, char **end)
+{
+	char *s, c;
+	unsigned char p[6];
+	unsigned int val;
+	int i = 0;
+
+	/*
+	 * If data lies beyond data_limit the pointer difference below is
+	 * negative and would be compared as a huge size_t, so the length
+	 * check would not catch it. Treat that as "no match".
+	 */
+	if (data > data_limit)
+		return 0;
+
+	if (data_limit - data < plen) {
+		/* check if there is partial match */
+		if (strnicmp(data, pattern, data_limit - data) == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	if (strnicmp(data, pattern, plen) != 0) {
+		return 0;
+	}
+	s = data + plen;
+	if (skip) {
+		int found = 0;
+
+		/* move s to the first byte after the first run of "skip" */
+		for (;; s++) {
+			if (s == data_limit)
+				return -1;
+			if (!found) {
+				if (*s == skip)
+					found = 1;
+			} else if (*s != skip) {
+				break;
+			}
+		}
+	}
+
+	/* make sure "term" exists before data_limit, so parsing stays in bounds */
+	for (data = s; ; data++) {
+		if (data == data_limit)
+			return -1;
+		if (*data == term)
+			break;
+	}
+	*end = data;
+
+	memset(p, 0, sizeof(p));
+	for (data = s; ; data++) {
+		c = *data;
+		if (c == term)
+			break;
+		if (c >= '0' && c <= '9') {
+			/* every field is one byte: reject instead of wrapping */
+			val = p[i] * 10 + (c - '0');
+			if (val > 255)
+				return -1;
+			p[i] = val;
+		} else if (c == ',' && i < 5) {
+			i++;
+		} else {
+			/* unexpected character */
+			return -1;
+		}
+	}
+
+	if (i != 5)
+		return -1;
+
+	*start = s;
+	/* p[0..3] is the address, p[4..5] the port, both already in wire order */
+	*addr = get_unaligned((__be32 *) p);
+	*port = get_unaligned((__be16 *) (p + 4));
+	return 1;
+}
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port. Mark the current connection entry as a control channel
+ * of the new entry. All this work is done so that the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ *
+ * *diff is always set to 0 for non-IPv6 connections.
+ *
+ * Returns 1 when the packet needs no handling here (IPv6, connection not
+ * established, no PASV pending, no parsable "227 " reply) or when the
+ * payload was rewritten; returns 0 when skb_make_writable() or
+ * ip_vs_conn_new() fails, or when the payload was not mangled (ret keeps
+ * its initial value 0).
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff *skb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ char *start, *end;
+ union nf_inet_addr from;
+ __be16 port;
+ struct ip_vs_conn *n_cp;
+ char buf[24]; /* xxx,xxx,xxx,xxx,ppp,ppp\000 */
+ unsigned int buf_len;
+ int ret = 0;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+ /* This application helper doesn't work with IPv6 yet,
+ * so turn this into a no-op for IPv6 packets
+ */
+ if (cp->af == AF_INET6)
+ return 1;
+#endif
+
+ *diff = 0;
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ /* app_data points at ip_vs_ftp_pasv once ip_vs_ftp_in() has seen PASV */
+ if (cp->app_data == &ip_vs_ftp_pasv) {
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)th + (th->doff << 2);
+ data_limit = skb_tail_pointer(skb);
+
+ /* address/port are the text between '(' and ')' after "227 " */
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING,
+ sizeof(SERVER_STRING)-1,
+ '(', ')',
+ &from.ip, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+ &from.ip, ntohs(port), &cp->caddr.ip, 0);
+
+ /*
+ * Now update or create a connection entry for it
+ */
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
+ if (!n_cp) {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ n_cp = ip_vs_conn_new(&p, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
+ cp->dest, skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+ * Replace the old passive address with the new one
+ */
+ from.ip = n_cp->vaddr.ip;
+ port = n_cp->vport;
+ /* six decimal bytes, comma separated: at most 23 chars + NUL */
+ snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
+ ((unsigned char *)&from.ip)[0],
+ ((unsigned char *)&from.ip)[1],
+ ((unsigned char *)&from.ip)[2],
+ ((unsigned char *)&from.ip)[3],
+ ntohs(port) >> 8,
+ ntohs(port) & 0xFF);
+
+ buf_len = strlen(buf);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
+ */
+ rcu_read_lock();
+ ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ iph->ihl * 4,
+ start-data, end-start,
+ buf, buf_len);
+ rcu_read_unlock();
+ if (ret) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ IPPROTO_TCP, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
+ }
+
+ /*
+ * Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
+ */
+
+ net = skb_net(skb);
+ /* this PASV reply has been processed: drop the passive-mode marker */
+ cp->app_data = NULL;
+ ip_vs_tcp_conn_listen(net, n_cp);
+ /* release the reference on n_cp taken by the lookup/creation above */
+ ip_vs_conn_put(n_cp);
+ return ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ *
+ * *diff is always set to 0 for non-IPv6 connections.
+ *
+ * Returns 0 only when skb_make_writable() or ip_vs_conn_new() fails;
+ * every other path returns 1.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff *skb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_start, *data_limit;
+ char *start, *end;
+ union nf_inet_addr to;
+ __be16 port;
+ struct ip_vs_conn *n_cp;
+ struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+ /* This application helper doesn't work with IPv6 yet,
+ * so turn this into a no-op for IPv6 packets
+ */
+ if (cp->af == AF_INET6)
+ return 1;
+#endif
+
+ /* no diff required for incoming packets */
+ *diff = 0;
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ /*
+ * Detecting whether it is passive
+ */
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /* Since there may be OPTIONS in the TCP packet and the HLEN is
+ the length of the header in 32-bit multiples, it is accurate
+ to calculate data address by th+HLEN*4 */
+ data = data_start = (char *)th + (th->doff << 2);
+ data_limit = skb_tail_pointer(skb);
+
+ /* search every offset that still leaves room for the 6-byte "PASV\r\n" */
+ while (data <= data_limit - 6) {
+ if (strnicmp(data, "PASV\r\n", 6) == 0) {
+ /* Passive mode on */
+ IP_VS_DBG(7, "got PASV at %td of %td\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = &ip_vs_ftp_pasv;
+ return 1;
+ }
+ data++;
+ }
+
+ /*
+ * To support virtual FTP server, the scenario is as follows:
+ * FTP client ----> Load Balancer ----> FTP server
+ * First detect the port number in the application data,
+ * then create a new connection entry for the coming data
+ * connection.
+ */
+ /* unlike PASV, PORT is only matched at the start of the payload */
+ if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+ ' ', '\r', &to.ip, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+
+ /* Passive mode off */
+ cp->app_data = NULL;
+
+ /*
+ * Now update or create a connection entry for it
+ */
+ IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
+ ip_vs_proto_name(iph->protocol),
+ &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+
+ {
+ struct ip_vs_conn_param p;
+ /* data connection is keyed on the control connection's vport - 1 */
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
+ n_cp = ip_vs_conn_in_get(&p);
+ if (!n_cp) {
+ /* likewise the real server port is the control dport - 1 */
+ n_cp = ip_vs_conn_new(&p, &cp->daddr,
+ htons(ntohs(cp->dport)-1),
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+ }
+
+ /*
+ * Move tunnel to listen state
+ */
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_conn_put(n_cp);
+
+ return 1;
+}
+
+
+/*
+ * Descriptor of the ftp application helper. It is registered for each
+ * network namespace by __ip_vs_ftp_init() and unregistered by
+ * __ip_vs_ftp_exit(); no bind/unbind hooks are provided.
+ */
+static struct ip_vs_app ip_vs_ftp = {
+ .name = "ftp",
+ .type = IP_VS_APP_TYPE_FTP,
+ .protocol = IPPROTO_TCP,
+ .module = THIS_MODULE,
+ .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+ .init_conn = ip_vs_ftp_init_conn,
+ .done_conn = ip_vs_ftp_done_conn,
+ .bind_conn = NULL,
+ .unbind_conn = NULL,
+ .pkt_out = ip_vs_ftp_out,
+ .pkt_in = ip_vs_ftp_in,
+};
+
+/*
+ * per netns ip_vs_ftp initialization
+ *
+ * Registers the ftp helper for @net and then one incarnation per
+ * non-zero entry of ports[]. Returns -ENOENT if @net has no IPVS
+ * state, the error from register_ip_vs_app(), or the error from
+ * register_ip_vs_app_inc() (after unregistering the helper again);
+ * 0 on success.
+ */
+static int __net_init __ip_vs_ftp_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_app *app;
+	int i, ret;
+
+	if (!ipvs)
+		return -ENOENT;
+
+	app = register_ip_vs_app(net, &ip_vs_ftp);
+	if (IS_ERR(app))
+		return PTR_ERR(app);
+
+	for (i = 0; i < ports_count; i++) {
+		/* unset slots are simply ignored */
+		if (!ports[i])
+			continue;
+
+		ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
+		if (ret) {
+			/* undo the helper registration before bailing out */
+			unregister_ip_vs_app(net, &ip_vs_ftp);
+			return ret;
+		}
+
+		pr_info("%s: loaded support on port[%d] = %d\n",
+			app->name, i, ports[i]);
+	}
+
+	return 0;
+}
+/*
+ * netns exit: unregister the ftp helper from @net.
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+}
+
+/* Per-netns init/exit hooks, registered by ip_vs_ftp_init() */
+static struct pernet_operations ip_vs_ftp_ops = {
+ .init = __ip_vs_ftp_init,
+ .exit = __ip_vs_ftp_exit,
+};
+
+/*
+ * Module init: register the per-netns operations of the ftp helper
+ * and hand back the result of that registration.
+ */
+static int __init ip_vs_ftp_init(void)
+{
+	/* rcu_barrier() is called by netns on error */
+	return register_pernet_subsys(&ip_vs_ftp_ops);
+}
+
+/*
+ * ip_vs_ftp finish: unregister the per-netns operations registered by
+ * ip_vs_ftp_init().
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+ unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
new file mode 100644
index 00000000000..547ff33c1ef
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -0,0 +1,633 @@
+/*
+ * IPVS: Locality-Based Least-Connection scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Martin Hamilton : fixed the terrible locking bugs
+ * *lock(tbl->lock) ==> *lock(&tbl->lock)
+ * Wensong Zhang : fixed the uninitialized tbl->lock bug
+ * Wensong Zhang : added doing full expiration check to
+ * collect stale entries of 24+ hours when
+ * no partial expire check in a half hour
+ * Julian Anastasov : replaced del_timer call with del_timer_sync
+ * to avoid the possible race between timer
+ * handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ * if cachenode[dest_ip] is null then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- cachenode[dest_ip];
+ * if (n is dead) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ * return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblc entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+
+
+/*
+ * for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS 10
+#endif
+#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ * IPVS lblc entry represents an association between destination
+ * IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+ struct hlist_node list; /* link in a table bucket chain */
+ int af; /* address family */
+ union nf_inet_addr addr; /* destination IP address */
+ struct ip_vs_dest *dest; /* real server (cache) */
+ unsigned long lastuse; /* last used time (jiffies) */
+ struct rcu_head rcu_head; /* for call_rcu() in ip_vs_lblc_del() */
+};
+
+
+/*
+ * IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+ struct rcu_head rcu_head; /* for kfree_rcu() in done_svc */
+ struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+ bool dead; /* set by flush; blocks new entries */
+};
+
+
+/*
+ * IPVS LBLC sysctl table
+ *
+ * Template with a single "lblc_expiration" entry handled by
+ * proc_dointvec_jiffies. .data is left NULL here and is pointed at the
+ * per-netns value in __ip_vs_lblc_init().
+ */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vs_vars_table[] = {
+ {
+ .procname = "lblc_expiration",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif
+
+/*
+ * RCU callback queued by ip_vs_lblc_del(): drop the entry's reference
+ * on its destination and free the entry itself.
+ */
+static void ip_vs_lblc_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_lblc_entry *entry;
+
+	entry = container_of(head, struct ip_vs_lblc_entry, rcu_head);
+	ip_vs_dest_put_and_free(entry->dest);
+	kfree(entry);
+}
+
+/*
+ * Unlink @en from its hash chain and queue ip_vs_lblc_rcu_free() for it
+ * via call_rcu(). This does not touch the table's entries counter; the
+ * caller has to adjust it.
+ */
+static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
+}
+
+/*
+ * Returns hash value for IPVS LBLC entry
+ *
+ * The key is addr->ip, or with CONFIG_IP_VS_IPV6 and af == AF_INET6 the
+ * XOR of the four ip6 words. It is converted to host order, multiplied
+ * by 2654435761 and masked with IP_VS_LBLC_TAB_MASK.
+ */
+static inline unsigned int
+ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 folded;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	return (ntohl(folded) * 2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblc_table: link @en at the head of the
+ * bucket selected by its address and count it in tbl->entries.
+ * Nothing is returned.
+ */
+static void
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+	struct hlist_head *head;
+
+	head = &tbl->bucket[ip_vs_lblc_hashkey(en->af, &en->addr)];
+	hlist_add_head_rcu(&en->list, head);
+	atomic_inc(&tbl->entries);
+}
+
+
+/*
+ * Get ip_vs_lblc_entry associated with supplied parameters: walk the
+ * bucket selected by (@af, @addr) with the RCU list iterator and return
+ * the first entry whose address equals @addr, or NULL if there is none.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
+	       const union nf_inet_addr *addr)
+{
+	struct hlist_head *head = &tbl->bucket[ip_vs_lblc_hashkey(af, addr)];
+	struct ip_vs_lblc_entry *cur;
+
+	hlist_for_each_entry_rcu(cur, head, list) {
+		if (ip_vs_addr_equal(af, &cur->addr, addr))
+			return cur;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under spin lock.
+ *
+ * If @daddr is already mapped to @dest the existing entry is returned
+ * untouched. If it is mapped to a different server, the old entry is
+ * unlinked and removed from the tbl->entries count before a replacement
+ * is allocated, so the counter keeps matching the number of hashed
+ * entries (ip_vs_lblc_hash() counts the replacement again).
+ *
+ * Returns the entry for @daddr, or NULL if the allocation fails; in the
+ * latter case a previously existing mapping for @daddr is already gone.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblc_entry *en;
+
+	en = ip_vs_lblc_get(dest->af, tbl, daddr);
+	if (en) {
+		if (en->dest == dest)
+			return en;
+		/* stale mapping: drop it together with its accounting */
+		ip_vs_lblc_del(en);
+		atomic_dec(&tbl->entries);
+	}
+	/* GFP_ATOMIC: the caller holds a spin lock */
+	en = kmalloc(sizeof(*en), GFP_ATOMIC);
+	if (!en)
+		return NULL;
+
+	en->af = dest->af;
+	ip_vs_addr_copy(dest->af, &en->addr, daddr);
+	en->lastuse = jiffies;
+
+	/* the entry keeps its own reference on the destination */
+	ip_vs_dest_hold(dest);
+	en->dest = dest;
+
+	ip_vs_lblc_hash(tbl, en);
+
+	return en;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ *
+ * Under svc->sched_lock (BHs disabled) the table is marked dead and
+ * every entry of every bucket is deleted and uncounted.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct hlist_head *head;
+	struct hlist_head *last = tbl->bucket + IP_VS_LBLC_TAB_SIZE;
+	struct ip_vs_lblc_entry *entry;
+	struct hlist_node *tmp;
+
+	spin_lock_bh(&svc->sched_lock);
+	tbl->dead = 1;
+	for (head = tbl->bucket; head < last; head++) {
+		hlist_for_each_entry_safe(entry, tmp, head, list) {
+			ip_vs_lblc_del(entry);
+			atomic_dec(&tbl->entries);
+		}
+	}
+	spin_unlock_bh(&svc->sched_lock);
+}
+
+/*
+ * Expiration time used by the full expiration check: the per-netns
+ * sysctl value when CONFIG_SYSCTL is set, DEFAULT_EXPIRATION otherwise.
+ */
+static int sysctl_lblc_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+	return net_ipvs(svc->net)->sysctl_lblc_expiration;
+#else
+	return DEFAULT_EXPIRATION;
+#endif
+}
+
+/*
+ * Full expiration check: visit all buckets once, starting after
+ * tbl->rover, and delete every entry whose lastuse plus the configured
+ * expiration is not after the time sampled on entry. svc->sched_lock is
+ * taken per bucket. tbl->rover is left at the last bucket visited.
+ */
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	struct ip_vs_lblc_entry *entry;
+	struct hlist_node *tmp;
+	int visited, slot;
+
+	slot = tbl->rover;
+	for (visited = 0; visited < IP_VS_LBLC_TAB_SIZE; visited++) {
+		slot = (slot + 1) & IP_VS_LBLC_TAB_MASK;
+
+		spin_lock(&svc->sched_lock);
+		hlist_for_each_entry_safe(entry, tmp, &tbl->bucket[slot], list) {
+			if (!time_before(now,
+					 entry->lastuse +
+					 sysctl_lblc_expiration(svc))) {
+				ip_vs_lblc_del(entry);
+				atomic_dec(&tbl->entries);
+			}
+		}
+		spin_unlock(&svc->sched_lock);
+	}
+	tbl->rover = slot;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblc table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ *
+ * @data is the struct ip_vs_service pointer passed to setup_timer().
+ * The handler always re-arms itself CHECK_EXPIRE_INTERVAL jiffies ahead.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+
+ /* counter only advances on runs that needed no partial collection */
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblc_full_check(svc);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ /* table not over its limit: nothing to collect this round */
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ /* try to evict 4/3 of the excess, but at most half of max_size */
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ spin_unlock(&svc->sched_lock);
+ /* goal is only tested between buckets, so a bucket is always finished */
+ if (goal <= 0)
+ break;
+ }
+ /* remember where to resume on the next run */
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+}
+
+
+/*
+ * init_service hook: allocate and initialise the LBLC table of @svc,
+ * store it in svc->sched_data and start the periodic garbage-collection
+ * timer. Returns 0 on success or -ENOMEM if the table cannot be
+ * allocated.
+ */
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblc_table *tbl;
+
+	/*
+	 *    Allocate the ip_vs_lblc_table for this service
+	 */
+	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+		  "current service\n", sizeof(*tbl));
+
+	/*
+	 *    Initialize the hash buckets
+	 */
+	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+		INIT_HLIST_HEAD(&tbl->bucket[i]);
+	}
+	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+	tbl->dead = 0;
+	/*
+	 * kmalloc() does not zero the table, so the entry counter must be
+	 * set explicitly before the timer handler or ip_vs_lblc_hash()
+	 * read or modify it.
+	 */
+	atomic_set(&tbl->entries, 0);
+
+	/*
+	 *    Hook periodic timer for garbage collection
+	 */
+	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+	return 0;
+}
+
+
+/*
+ * done_service hook: stop the periodic timer, flush all entries (which
+ * also marks the table dead) and free the table with kfree_rcu().
+ */
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblc_flush(svc);
+
+ /* release the table itself */
+ kfree_rcu(tbl, rcu_head);
+ /* only sizeof(*tbl) is evaluated below, tbl is not dereferenced */
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+ sizeof(*tbl));
+}
+
+
+/*
+ * Weighted least-connection pick over svc->destinations.
+ *
+ * The first loop finds the first destination that is not flagged
+ * IP_VS_DEST_F_OVERLOAD and has weight > 0; the second loop continues
+ * from there and switches to any later non-overloaded destination with
+ * a strictly lower overhead/weight ratio. Returns NULL if the first
+ * loop finds no candidate.
+ */
+static inline struct ip_vs_dest *
+__ip_vs_lblc_schedule(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We use the following formula to estimate the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ /* resumes the walk right after the candidate found above */
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* products are computed in 64 bits before comparing */
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ *
+ * "Overloaded" means more active connections than weight; "less loaded"
+ * means some destination of @svc has twice its active connections still
+ * below its weight.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *d;
+
+	/* not above its weight: no need to look at the others */
+	if (atomic_read(&dest->activeconns) <= atomic_read(&dest->weight))
+		return 0;
+
+	list_for_each_entry_rcu(d, &svc->destinations, n_list) {
+		if (atomic_read(&d->activeconns) * 2 < atomic_read(&d->weight))
+			return 1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ *
+ * Looks up the packet's destination address in the table of @svc. A
+ * cached server is used if it is available, has weight > 0 and is not
+ * overloaded; otherwise a server is picked by __ip_vs_lblc_schedule()
+ * and cached, unless the table is marked dead. Returns the chosen
+ * destination, or NULL if none is available.
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_dest *dest = NULL;
+ struct ip_vs_lblc_entry *en;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* First look in our cache */
+ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
+ if (en) {
+ /* We only hold a read lock, but this is atomic */
+ /* NOTE(review): no lock is taken in this function before this
+ * point and the lookup uses the RCU list iterator -- confirm
+ * what the caller actually holds.
+ */
+ en->lastuse = jiffies;
+
+ /*
+ * If the destination is not available, i.e. it's in the trash,
+ * we must ignore it, as it may be removed from under our feet,
+ * if someone drops our reference count. Our caller only makes
+ * sure that destinations, that are not in the trash, are not
+ * moved to the trash, while we are scheduling. But anyone can
+ * free up entries from the trash at any time.
+ */
+
+ dest = en->dest;
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
+ }
+
+ /* No cache entry or it is invalid, time to schedule */
+ dest = __ip_vs_lblc_schedule(svc);
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ spin_lock_bh(&svc->sched_lock);
+ /* a dead table has been flushed; do not add entries to it */
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph->daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
+
+out:
+ IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLC Scheduler structure
+ *
+ * Registered under the name "lblc" by ip_vs_lblc_init() and
+ * unregistered by ip_vs_lblc_cleanup().
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
+ .name = "lblc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+ .init_service = ip_vs_lblc_init_svc,
+ .done_service = ip_vs_lblc_done_svc,
+ .schedule = ip_vs_lblc_schedule,
+};
+
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+/*
+ * Set up the "lblc_expiration" sysctl for @net. init_net uses
+ * vs_vars_table directly, every other namespace gets its own kmemdup()
+ * copy. Returns -ENOENT if @net has no IPVS state, -ENOMEM if the copy
+ * or the sysctl registration fails, 0 on success.
+ */
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblc_ctl_table[0].procname = NULL;
+
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
+ /* the table entry reads and writes this namespace's own value */
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+ ipvs->lblc_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ /* only the kmemdup() copy is ours to free */
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Undo __ip_vs_lblc_init() for @net: unregister the sysctl header and,
+ * for every namespace except init_net, free the duplicated table.
+ */
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+
+	/* init_net uses the static table, nothing to free */
+	if (net_eq(net, &init_net))
+		return;
+
+	kfree(ipvs->lblc_ctl_table);
+}
+
+#else
+
+/* Without CONFIG_SYSCTL there is no per-netns state to set up or tear down */
+static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
+
+#endif
+
+/* Per-netns init/exit hooks, registered by ip_vs_lblc_init() */
+static struct pernet_operations ip_vs_lblc_ops = {
+ .init = __ip_vs_lblc_init,
+ .exit = __ip_vs_lblc_exit,
+};
+
+/*
+ * Module init: register the per-netns operations, then the scheduler.
+ * If the scheduler cannot be registered the per-netns operations are
+ * unregistered again. Returns 0 or the error of the failing step.
+ */
+static int __init ip_vs_lblc_init(void)
+{
+	int err = register_pernet_subsys(&ip_vs_lblc_ops);
+
+	if (err)
+		return err;
+
+	err = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+	if (!err)
+		return 0;
+
+	unregister_pernet_subsys(&ip_vs_lblc_ops);
+	return err;
+}
+
+/*
+ * Module exit: unregister in the reverse order of ip_vs_lblc_init(),
+ * then call rcu_barrier().
+ * NOTE(review): the barrier presumably waits for the call_rcu() and
+ * kfree_rcu() callbacks queued by this module to finish before its code
+ * goes away -- confirm.
+ */
+static void __exit ip_vs_lblc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
+ rcu_barrier();
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
new file mode 100644
index 00000000000..3f21a2f47de
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,818 @@
+/*
+ * IPVS: Locality-Based Least-Connection with Replication scheduler
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Julian Anastasov : Added the missing (dest->weight>0)
+ * condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ * if serverSet[dest_ip] is null then
+ * n, serverSet[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- {least-conn (alive) node in serverSet[dest_ip]};
+ * if (n is null) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n <- {weighted least-conn node};
+ * add n to serverSet[dest_ip];
+ * if |serverSet[dest_ip]| > 1 AND
+ * now - serverSet[dest_ip].lastMod > T then
+ * m <- {most conn node in serverSet[dest_ip]};
+ * remove m from serverSet[dest_ip];
+ * if serverSet[dest_ip] changed then
+ * serverSet[dest_ip].lastMod <- now;
+ *
+ * return n;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblcr entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+
+/*
+ * for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
+#endif
+#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ * IPVS destination set structure and operations
+ */
+/* One member of a destination set; holds a reference on its dest */
+struct ip_vs_dest_set_elem {
+ struct list_head list; /* list link */
+ struct ip_vs_dest *dest; /* destination server */
+ struct rcu_head rcu_head; /* for call_rcu() when the element is removed */
+};
+
+/* Set of destination servers attached to one lblcr entry */
+struct ip_vs_dest_set {
+ atomic_t size; /* set size */
+ unsigned long lastmod; /* last modified time */
+ struct list_head list; /* destination list */
+};
+
+
+/*
+ * Add @dest to @set.
+ *
+ * When @check is true the set is scanned first and nothing is done if
+ * @dest is already a member.  The new element takes a reference on @dest
+ * (ip_vs_dest_hold) and the set's size and lastmod are updated.
+ *
+ * An allocation failure is silent: the set is simply left unchanged.
+ */
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest)
+ return;
+ }
+ }
+
+ e = kmalloc(sizeof(*e), GFP_ATOMIC);
+ if (e == NULL)
+ return;
+
+ ip_vs_dest_hold(dest);
+ e->dest = dest;
+
+ list_add_rcu(&e->list, &set->list);
+ atomic_inc(&set->size);
+
+ set->lastmod = jiffies;
+}
+
+/*
+ * RCU callback for a removed set element: drop the reference the element
+ * held on its dest, then free the element itself.
+ */
+static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
+ ip_vs_dest_put_and_free(e->dest);
+ kfree(e);
+}
+
+/*
+ * Remove @dest from @set if it is a member: the first matching element is
+ * unlinked, handed to call_rcu() for freeing, and the set's size and
+ * lastmod are updated.  Does nothing when @dest is not in the set.
+ */
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest) {
+ /* HIT */
+ atomic_dec(&set->size);
+ set->lastmod = jiffies;
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
+ break;
+ }
+ }
+}
+
+/*
+ * Unlink every element of @set and queue each one for freeing via
+ * call_rcu().  Note that set->size and set->lastmod are not touched here.
+ */
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+ struct ip_vs_dest_set_elem *e, *ep;
+
+ list_for_each_entry_safe(e, ep, &set->list, list) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
+ }
+}
+
+/*
+ * Get the weighted least-connection node in the destination set.
+ *
+ * Members flagged IP_VS_DEST_F_OVERLOAD are skipped.  The starting
+ * candidate is the first member with weight > 0 that is also flagged
+ * IP_VS_DEST_F_AVAILABLE; the rest of the list is then compared against
+ * it using overhead/weight, cross-multiplied in 64 bits to avoid division.
+ *
+ * Returns NULL when no starting candidate exists.
+ */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /* select the first destination server, whose weight > 0 */
+ list_for_each_entry_rcu(e, &set->list, list) {
+ least = e->dest;
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if ((atomic_read(&least->weight) > 0)
+ && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /* find the destination with the weighted least load */
+ nextstage:
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
+ dest = e->dest;
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* loh/lw > doh/dw  <==>  loh*dw > doh*lw */
+ if (((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))
+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ __func__,
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+ return least;
+}
+
+
+/*
+ * Get the weighted most-connection node in the destination set.
+ *
+ * The starting candidate is the first member with weight > 0; the rest of
+ * the list is compared against it using overhead/weight, cross-multiplied
+ * in 64 bits, and only members with weight > 0 can replace it.
+ *
+ * The list is walked with the plain (non-RCU) iterators; the caller in
+ * this file invokes it with svc->sched_lock held.
+ *
+ * Returns NULL when @set is NULL or no member has weight > 0.
+ */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest *dest, *most;
+ int moh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ /* select the first destination server, whose weight > 0 */
+ list_for_each_entry(e, &set->list, list) {
+ most = e->dest;
+ if (atomic_read(&most->weight) > 0) {
+ moh = ip_vs_dest_conn_overhead(most);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /* find the destination with the weighted most load */
+ nextstage:
+ list_for_each_entry_continue(e, &set->list, list) {
+ dest = e->dest;
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+ if (((__s64)moh * atomic_read(&dest->weight) <
+ (__s64)doh * atomic_read(&most->weight))
+ && (atomic_read(&dest->weight) > 0)) {
+ most = dest;
+ moh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ __func__,
+ IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
+ atomic_read(&most->activeconns),
+ atomic_read(&most->refcnt),
+ atomic_read(&most->weight), moh);
+ return most;
+}
+
+
+/*
+ * IPVS lblcr entry represents an association between destination
+ * IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+ struct hlist_node list; /* link in the table's hash bucket */
+ int af; /* address family */
+ union nf_inet_addr addr; /* destination IP address */
+ struct ip_vs_dest_set set; /* destination server set */
+ unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head; /* for kfree_rcu() of the entry */
+};
+
+
+/*
+ * IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+ struct rcu_head rcu_head; /* for kfree_rcu() of the table */
+ struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+ bool dead; /* set by flush; blocks further inserts */
+};
+
+
+#ifdef CONFIG_SYSCTL
+/*
+ * IPVS LBLCR sysctl table
+ */
+
+/*
+ * Template sysctl table with the single "lblcr_expiration" entry.
+ * .data is left NULL here and filled in per netns by __ip_vs_lblcr_init().
+ */
+static struct ctl_table vs_vars_table[] = {
+ {
+ .procname = "lblcr_expiration",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif
+
+/*
+ * Unhash @en, release every element of its destination set and free the
+ * entry itself via kfree_rcu().  The table's entry counter is not touched
+ * here; callers adjust it themselves where they need to.
+ */
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ ip_vs_dest_set_eraseall(&en->set);
+ kfree_rcu(en, rcu_head);
+}
+
+
+/*
+ *	Compute the bucket index for an LBLCR entry.
+ *
+ *	The IPv4 address (or, for AF_INET6 when CONFIG_IP_VS_IPV6 is set, the
+ *	XOR of the four 32-bit words of the address) is converted to host
+ *	order, multiplied by 2654435761 and masked to the table size.
+ */
+static inline unsigned int
+ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 folded;
+
+	folded = addr->ip;
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		folded = addr->ip6[0] ^ addr->ip6[1];
+		folded ^= addr->ip6[2] ^ addr->ip6[3];
+	}
+#endif
+	return (ntohl(folded) * 2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblcr_table: link it at the head of its
+ * bucket and bump the table's entry counter.  Returns nothing.
+ */
+static void
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+ unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
+
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+}
+
+
+/*
+ * Look up the entry for @addr in @tbl.
+ * Returns the first entry in the matching bucket whose address equals
+ * @addr for family @af, or NULL if there is none.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
+		const union nf_inet_addr *addr)
+{
+	struct hlist_head *head = &tbl->bucket[ip_vs_lblcr_hashkey(af, addr)];
+	struct ip_vs_lblcr_entry *en;
+
+	hlist_for_each_entry_rcu(en, head, list) {
+		if (!ip_vs_addr_equal(af, &en->addr, addr))
+			continue;
+		return en;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under spin lock.
+ *
+ * If no entry exists for @daddr a new one is allocated (GFP_ATOMIC), seeded
+ * with @dest and hashed into @tbl; otherwise @dest is added to the existing
+ * entry's set unless it is already a member.
+ *
+ * Returns the entry, or NULL if a new entry could not be allocated.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = ip_vs_lblcr_get(dest->af, tbl, daddr);
+ if (!en) {
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->af = dest->af;
+ ip_vs_addr_copy(dest->af, &en->addr, daddr);
+ en->lastuse = jiffies;
+
+ /* initialize its dest set */
+ atomic_set(&(en->set.size), 0);
+ INIT_LIST_HEAD(&en->set.list);
+
+ /* set is empty, so the duplicate check can be skipped */
+ ip_vs_dest_set_insert(&en->set, dest, false);
+
+ ip_vs_lblcr_hash(tbl, en);
+ return en;
+ }
+
+ ip_vs_dest_set_insert(&en->set, dest, true);
+
+ return en;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ *
+ * Runs under svc->sched_lock (BH disabled).  The table is marked dead
+ * first, which the scheduler checks before inserting new entries.
+ * tbl->entries is not decremented here.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ int i;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblcr_free(en);
+ }
+ }
+ spin_unlock_bh(&svc->sched_lock);
+}
+
+/*
+ * Expiration period for the service's netns: the per-netns sysctl value
+ * when CONFIG_SYSCTL is enabled, otherwise the DEFAULT_EXPIRATION constant.
+ */
+static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+	return net_ipvs(svc->net)->sysctl_lblcr_expiration;
+#else
+	return DEFAULT_EXPIRATION;
+#endif
+}
+
+/*
+ * Full expiration check: visit every bucket once, starting after the
+ * rover, and free each entry whose lastuse plus the lblcr expiration
+ * period is not after the time sampled on entry.  svc->sched_lock is taken
+ * per bucket.  The rover is left at the last bucket visited.
+ */
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int i, j;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ /* not expired yet */
+ if (time_after(en->lastuse +
+ sysctl_lblcr_expiration(svc), now))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ spin_unlock(&svc->sched_lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblcr table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+ /* @data is the service pointer passed to setup_timer() */
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ /*
+ * counter only advances on runs where the table is within max_size;
+ * when it reaches a multiple of COUNT_FOR_FULL_EXPIRATION a full
+ * check is done instead and counting restarts at 1.
+ */
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblcr_full_check(svc);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ /* try to drop 4/3 of the excess, but at most half of max_size */
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ /* keep entries used within the last ENTRY_TIMEOUT */
+ if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ spin_unlock(&svc->sched_lock);
+ /* goal is only tested between buckets */
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ /* re-arm for the next interval */
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+/*
+ * Scheduler init_service hook: allocate and initialise the per-service
+ * LBLCR table, publish it in svc->sched_data and start the periodic
+ * garbage-collection timer.
+ *
+ * Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblcr_table *tbl;
+
+	/*
+	 *    Allocate the ip_vs_lblcr_table for this service
+	 */
+	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+		  "current service\n", sizeof(*tbl));
+
+	/*
+	 *    Initialize the hash buckets
+	 */
+	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+		INIT_HLIST_HEAD(&tbl->bucket[i]);
+	}
+	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+	tbl->dead = false;
+	/*
+	 * kmalloc() does not zero the table, so the entry counter must be
+	 * set explicitly: ip_vs_lblcr_hash() increments it and the expiry
+	 * timer compares it against max_size.
+	 */
+	atomic_set(&tbl->entries, 0);
+
+	/*
+	 *    Hook periodic timer for garbage collection
+	 */
+	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+	return 0;
+}
+
+
+/*
+ * Scheduler done_service hook: stop the periodic timer, flush all table
+ * entries and release the table itself via kfree_rcu().
+ */
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblcr_flush(svc);
+
+ /* release the table itself */
+ kfree_rcu(tbl, rcu_head);
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+ sizeof(*tbl));
+}
+
+
+/*
+ * Pick the weighted least-loaded destination among all destinations of
+ * @svc, skipping those flagged IP_VS_DEST_F_OVERLOAD.
+ *
+ * Returns NULL when no non-overloaded destination with weight > 0 exists.
+ */
+static inline struct ip_vs_dest *
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We use the following formula to estimate the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* 64-bit products so the cross-multiplication cannot overflow int */
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ *   Return 1 when @dest has more active connections than its weight and
+ *   the service has some destination whose active connections, doubled,
+ *   are still below that destination's weight.  Return 0 otherwise.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *d;
+
+	/* Not above its own weight: cannot be overloaded */
+	if (atomic_read(&dest->activeconns) <= atomic_read(&dest->weight))
+		return 0;
+
+	list_for_each_entry_rcu(d, &svc->destinations, n_list) {
+		if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight))
+			return 1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection with Replication scheduling
+ *
+ * Looks up the cache entry for the packet's destination address.  On a hit
+ * the least-loaded member of the entry's server set is used unless it is
+ * missing or overloaded, in which case a server is picked from the whole
+ * service and added to the set.  On a miss a server is picked from the
+ * whole service and a new cache entry is created for it.
+ *
+ * Returns the chosen destination, or NULL if none is available.
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblcr_entry *en;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* First look in our cache */
+ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
+ if (en) {
+ en->lastuse = jiffies;
+
+ /* Get the least loaded destination */
+ dest = ip_vs_dest_set_min(&en->set);
+
+ /* More than one destination + enough time passed by, cleanup */
+ if (atomic_read(&en->set.size) > 1 &&
+ time_after(jiffies, en->set.lastmod +
+ sysctl_lblcr_expiration(svc))) {
+ spin_lock_bh(&svc->sched_lock);
+ /* re-check the size now that the lock is held */
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
+
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
+ }
+
+ /* If the destination is not overloaded, use it */
+ if (dest && !is_overloaded(dest, svc))
+ goto out;
+
+ /* The cache entry is invalid, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc);
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Update our cache entry */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
+ goto out;
+ }
+
+ /* No cache entry, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc);
+ if (!dest) {
+ /*
+ * NOTE(review): this path only emits a debug message while
+ * the cache-hit path above uses ip_vs_scheduler_err() --
+ * confirm the difference is intended.
+ */
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph->daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
+
+out:
+ IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLCR Scheduler structure: registers this scheduler under the
+ * name "lblcr" and wires up its init/done/schedule hooks.
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+ .name = "lblcr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+ .init_service = ip_vs_lblcr_init_svc,
+ .done_service = ip_vs_lblcr_done_svc,
+ .schedule = ip_vs_lblcr_schedule,
+};
+
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+/*
+ * Picks the ctl_table to use for this namespace, points its first entry at
+ * ipvs->sysctl_lblcr_expiration (preset to DEFAULT_EXPIRATION) and
+ * registers it under "net/ipv4/vs".
+ *
+ * Returns 0 on success, -ENOENT if net_ipvs(net) is NULL, or -ENOMEM if
+ * kmemdup() or register_net_sysctl() fails.
+ */
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ /*
+ * Every namespace except init_net works on its own kmemdup()'ed copy
+ * of vs_vars_table; init_net uses the static table itself.
+ */
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblcr_ctl_table[0].procname = NULL;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ /* Only the duplicated table was allocated here */
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Per-netns exit: unregister the LBLCR sysctl header and, for every
+ * namespace other than init_net, free the duplicated ctl_table.
+ */
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+
+	/* init_net uses the static table, nothing to free */
+	if (net_eq(net, &init_net))
+		return;
+
+	kfree(ipvs->lblcr_ctl_table);
+}
+
+#else
+
+/* Without CONFIG_SYSCTL the per-netns hooks are no-ops */
+static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
+
+#endif
+
+/* Per-netns init/exit hooks, registered from ip_vs_lblcr_init() */
+static struct pernet_operations ip_vs_lblcr_ops = {
+ .init = __ip_vs_lblcr_init,
+ .exit = __ip_vs_lblcr_exit,
+};
+
+/*
+ * Module init: register the per-netns operations first, then the
+ * scheduler.  If the scheduler cannot be registered the per-netns
+ * registration is rolled back.  Returns 0 or the failing call's error.
+ */
+static int __init ip_vs_lblcr_init(void)
+{
+	int err = register_pernet_subsys(&ip_vs_lblcr_ops);
+
+	if (err)
+		return err;
+
+	err = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+	if (!err)
+		return 0;
+
+	unregister_pernet_subsys(&ip_vs_lblcr_ops);
+	return err;
+}
+
+/*
+ * Module exit: undo ip_vs_lblcr_init() in reverse order, then call
+ * rcu_barrier() so that callbacks queued with call_rcu() by this file
+ * (e.g. ip_vs_lblcr_elem_rcu_free) have run before the module goes away.
+ */
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ rcu_barrier();
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
new file mode 100644
index 00000000000..2bdcb1cf212
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -0,0 +1,93 @@
+/*
+ * IPVS: Least-Connection Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : added the ip_vs_lc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *	Least Connection scheduling
+ *
+ *	Returns the destination of @svc with the smallest connection
+ *	overhead, or NULL (after reporting a scheduler error) when every
+ *	destination is overloaded or has weight 0.
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_dest *cur, *best = NULL;
+	unsigned int best_oh = 0, cur_oh;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * Simply select the server with the least number of
+	 *        (activeconns<<5) + inactconns
+	 * Except whose weight is equal to zero.
+	 * If the weight is equal to zero, it means that the server is
+	 * quiesced, the existing connections to the server still get
+	 * served, but no new connection is assigned to the server.
+	 */
+	list_for_each_entry_rcu(cur, &svc->destinations, n_list) {
+		if (cur->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		if (atomic_read(&cur->weight) == 0)
+			continue;
+
+		cur_oh = ip_vs_dest_conn_overhead(cur);
+		/* keep the current best unless this one is strictly lighter */
+		if (best && cur_oh >= best_oh)
+			continue;
+
+		best = cur;
+		best_oh = cur_oh;
+	}
+
+	if (!best) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
+		      "inactconns %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &best->addr),
+		      ntohs(best->port),
+		      atomic_read(&best->activeconns),
+		      atomic_read(&best->inactconns));
+
+	return best;
+}
+
+
+/*
+ * IPVS LC scheduler structure: registered under the name "lc"; only the
+ * schedule hook is provided (no init_service/done_service).
+ */
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+ .name = "lc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+ .schedule = ip_vs_lc_schedule,
+};
+
+
+/* Module init: register the LC scheduler and return the result */
+static int __init ip_vs_lc_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
+}
+
+/*
+ * Module exit: unregister the LC scheduler, then call synchronize_rcu()
+ * before returning.
+ */
+static void __exit ip_vs_lc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 00000000000..5882bbfd198
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,299 @@
+/*
+ * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+/*
+ * printk helpers for the debug messages in this file.
+ * FMT_TUPLE/ARG_TUPLE print a conntrack tuple as
+ * src:port->dst:port/protonum (IPv4 %pI4 formatting).
+ */
+#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+ &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+ (T)->dst.protonum
+
+/*
+ * FMT_CONN/ARG_CONN print an IPVS connection as
+ * caddr:cport->vaddr:vport->daddr:dport/protocol:state.
+ */
+#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
+ &((C)->vaddr.ip), ntohs((C)->vport), \
+ &((C)->daddr.ip), ntohs((C)->dport), \
+ (C)->protocol, (C)->state
+
+/*
+ * Rewrite the reply tuple of the skb's (still unconfirmed) conntrack so
+ * that it matches the IPVS NAT mapping of connection @cp.
+ *
+ * With @outin set the reply source becomes cp->daddr/cp->dport, otherwise
+ * the reply destination becomes cp->vaddr/cp->vport.  Ports are left
+ * alone for ICMP and ICMPv6.
+ *
+ * Nothing is done when there is no conntrack, when it is already
+ * confirmed, untracked or dying, when @cp is not a masquerading (NAT)
+ * connection, or when the packet is not in the original direction.
+ */
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+ nf_ct_is_dying(ct))
+ return;
+
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return;
+
+ /* Alter reply only in original direction */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return;
+
+ /*
+ * Applications may adjust TCP seqs: give up if the conntrack has no
+ * seqadj extension and one cannot be added.
+ */
+ if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP &&
+ !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct))
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ if (outin) {
+ new_tuple.src.u3 = cp->daddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.src.u.tcp.port = cp->dport;
+ } else {
+ new_tuple.dst.u3 = cp->vaddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.dst.u.tcp.port = cp->vport;
+ }
+ IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE
+ ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+ ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+/* Thin wrapper: returns the result of nf_conntrack_confirm() for @skb */
+int ip_vs_confirm_conntrack(struct sk_buff *skb)
+{
+ return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ *
+ * Only PF_INET expectations are handled.  The original-direction tuple of
+ * @ct is used to look up an IPVS connection, first as an outgoing
+ * (RS->CLIENT) and then as an incoming (CLIENT->VS) match.  When one is
+ * found and it is a masquerading (NAT) connection, the reply tuple of @ct
+ * is altered accordingly.  The connection reference is dropped before
+ * returning.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_tuple *orig, new_reply;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
+
+ if (exp->tuple.src.l3num != PF_INET)
+ return;
+
+ /*
+ * We assume that no NF locks are held before this callback.
+ * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+ * expectations even if they use wildcard values, now we provide the
+ * actual values from the newly created original conntrack direction.
+ * The conntrack is confirmed when packet reaches IPVS hooks.
+ */
+
+ /* RS->CLIENT */
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (cp) {
+ /* Change reply CLIENT->RS to CLIENT->VS */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.dst.u3 = cp->vaddr;
+ new_reply.dst.u.tcp.port = cp->vport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+ ", inout cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ /* CLIENT->VS */
+ cp = ip_vs_conn_in_get(&p);
+ if (cp) {
+ /* Change reply VS->CLIENT to RS->CLIENT */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.src.u3 = cp->daddr;
+ new_reply.src.u.tcp.port = cp->dport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
+ return;
+
+alter:
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+ nf_conntrack_alter_reply(ct, &new_reply);
+ /* release the reference taken by the lookup above */
+ ip_vs_conn_put(cp);
+ return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ *
+ * With @from_rs set the expected connection is daddr -> caddr:cport,
+ * otherwise caddr -> vaddr:vport.  Does nothing when @ct is NULL or
+ * untracked, or when the expectation cannot be allocated.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs)
+{
+ struct nf_conntrack_expect *exp;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return;
+
+ /* a zero @port is passed on as a NULL (wildcard) source port */
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ from_rs ? &cp->daddr : &cp->caddr,
+ from_rs ? &cp->caddr : &cp->vaddr,
+ proto, port ? &port : NULL,
+ from_rs ? &cp->cport : &cp->vport);
+
+ exp->expectfn = ip_vs_nfct_expect_callback;
+
+ IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ *
+ * Builds the original-direction tuple caddr:cport -> vaddr:vport for @cp
+ * and looks it up in the default zone.  If a conntrack is found and its
+ * timer could be deleted, the timer's handler is invoked directly.
+ * Connections with cport == 0 are ignored.
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conntrack_tuple tuple;
+
+ if (!cp->cport)
+ return;
+
+ /* compound literal zeroes every field that is not named */
+ tuple = (struct nf_conntrack_tuple) {
+ .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+ tuple.src.u3 = cp->caddr;
+ tuple.src.u.all = cp->cport;
+ tuple.src.l3num = cp->af;
+ tuple.dst.u3 = cp->vaddr;
+ tuple.dst.u.all = cp->vport;
+
+ IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+ " for conn " FMT_CONN "\n",
+ __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Show what happens instead of calling nf_ct_kill() */
+ if (del_timer(&ct->timeout)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ if (ct->timeout.function)
+ ct->timeout.function(ct->timeout.data);
+ } else {
+ IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ }
+ /* drop the reference taken by nf_conntrack_find_get() */
+ nf_ct_put(ct);
+ } else {
+ IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
+ }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
new file mode 100644
index 00000000000..961a6de9bb2
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -0,0 +1,142 @@
+/*
+ * IPVS: Never Queue scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimizes its expected delay (the Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Overhead of a destination as used by the NQ cost comparison: the
+ * number of active connections plus one.  Only the active connection
+ * counter is taken into account.
+ */
+static inline int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+	int active = atomic_read(&dest->activeconns);
+
+	return active + 1;
+}
+
+
+/*
+ * Never Queue scheduling
+ *
+ * Walks the destinations of @svc and returns:
+ * - the first usable destination with no active connections, or
+ * - otherwise the usable destination with the smallest
+ *   overhead/weight ratio, or
+ * - NULL (after reporting a scheduler error) if none is usable.
+ *
+ * Destinations flagged IP_VS_DEST_F_OVERLOAD or with a zero weight are
+ * skipped. @skb and @iph are not used by this scheduler.
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ int loh = 0, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+ !atomic_read(&dest->weight))
+ continue;
+
+ doh = ip_vs_nq_dest_overhead(dest);
+
+ /* return the server directly if it is idle */
+ if (atomic_read(&dest->activeconns) == 0) {
+ least = dest;
+ loh = doh;
+ goto out;
+ }
+
+ /*
+ * Keep the destination with the lowest overhead/weight so far;
+ * the products are widened to 64 bit before multiplying.
+ */
+ if (!least ||
+ ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (!least) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Reached with a non-NULL 'least', either via goto or by fall-through */
+ out:
+ IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/* Scheduler descriptor registered under the name "nq" */
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+ .name = "nq",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+ .schedule = ip_vs_nq_schedule,
+};
+
+
+/* Module init: register the NQ scheduler and return the registration result */
+static int __init ip_vs_nq_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+/*
+ * Module exit: unregister the NQ scheduler, then wait for an RCU grace
+ * period before returning.
+ */
+static void __exit ip_vs_nq_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 00000000000..1a82b29ce8e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,111 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* Mutex serializing modifications of the IPVS pe list. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
+
+/*
+ * Get pe in the pe list by name.
+ *
+ * Walks the list under rcu_read_lock(). On a hit the pe is returned
+ * with the module reference taken by try_module_get() still held (when
+ * pe->module is set); on a miss NULL is returned and no reference is
+ * kept.
+ */
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
+{
+ struct ip_vs_pe *pe;
+
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
+ pe_name);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
+ /* Test and get the modules atomically */
+ if (pe->module &&
+ !try_module_get(pe->module)) {
+ /* This pe is just deleted */
+ continue;
+ }
+ if (strcmp(pe_name, pe->name)==0) {
+ /* HIT */
+ rcu_read_unlock();
+ return pe;
+ }
+ /* Not the one we want: drop the reference taken above */
+ if (pe->module)
+ module_put(pe->module);
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+/*
+ * Find a pe by name.  When the first lookup fails, ask for the module
+ * "ip_vs_pe_<name>" to be loaded and look the name up once more.
+ */
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
+{
+	struct ip_vs_pe *found = __ip_vs_pe_getbyname(name);
+
+	if (found)
+		return found;
+
+	/* Not registered yet: try to load the module, then retry */
+	request_module("ip_vs_pe_%s", name);
+	return __ip_vs_pe_getbyname(name);
+}
+
+/*
+ * Register a pe in the pe list.
+ *
+ * Returns 0 on success, or -EINVAL when a pe with the same name is
+ * already on the list (the use count taken at entry is dropped again in
+ * that case).
+ */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ struct ip_vs_pe *tmp;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ mutex_lock(&ip_vs_pe_mutex);
+ /* Make sure that the pe with this name doesn't exist
+ * in the pe list.
+ */
+ list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+ if (strcmp(tmp->name, pe->name) == 0) {
+ mutex_unlock(&ip_vs_pe_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] pe already existed "
+ "in the system\n", __func__, pe->name);
+ return -EINVAL;
+ }
+ }
+ /* Add it into the d-linked pe list */
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
+
+ pr_info("[%s] pe registered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/*
+ * Unregister a pe from the pe list.
+ *
+ * Removes @pe with list_del_rcu() under ip_vs_pe_mutex and drops the
+ * use count. Always returns 0. This function does not wait for an RCU
+ * grace period itself.
+ */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ mutex_lock(&ip_vs_pe_mutex);
+ /* Remove it from the d-linked pe list */
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] pe unregistered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 00000000000..bed5f704252
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,171 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+/*
+ * Copy a Call-ID (not NUL-terminated, @callid_len bytes) into the debug
+ * buffer @buf at offset *@idx, truncated to 64 bytes and to the space
+ * left in the buffer, and NUL-terminate it.
+ *
+ * *@idx is advanced past the copied string and its terminator.
+ * Returns a pointer to the start of the copied string, or an empty
+ * string when the buffer has no room left at all.
+ */
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	size_t max_len = 64;
+	size_t start = *idx;
+	size_t len;
+
+	/* Avoid underflow of "buf_len - start - 1" when the buffer is full */
+	if (start >= buf_len)
+		return "";
+
+	len = min3(max_len, callid_len, buf_len - start - 1);
+	memcpy(buf + start, callid, len);
+	buf[start + len] = '\0';
+	*idx += len + 1;
+	/* The string begins where the copy started, not one byte after it */
+	return buf + start;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len) \
+ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
+ callid, len, &ip_vs_dbg_idx)
+#endif
+
+/*
+ * Locate the SIP Call-ID header value in the buffer @dptr of @datalen
+ * bytes, starting the search at @dataoff.
+ *
+ * On success returns 0 with *@matchoff / *@matchlen describing the
+ * value (used below as an offset from @dptr and a length). Returns
+ * -EINVAL when the header is not found, is longer than
+ * IP_VS_PEDATA_MAXLEN, ends exactly at the end of the data, or is not
+ * followed by CR or LF.
+ */
+static int get_callid(const char *dptr, unsigned int dataoff,
+ unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ /* Find callid */
+ while (1) {
+ int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+ SIP_HDR_CALL_ID, matchoff,
+ matchlen);
+ if (ret > 0)
+ break;
+ if (!ret)
+ return -EINVAL;
+ /* ret < 0: skip ahead and search again */
+ dataoff += *matchoff;
+ }
+
+ /* Too large is useless */
+ if (*matchlen > IP_VS_PEDATA_MAXLEN)
+ return -EINVAL;
+
+ /* SIP headers are always followed by a line terminator */
+ if (*matchoff + *matchlen == datalen)
+ return -EINVAL;
+
+ /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+ * RFC 3261 allows only CRLF, we support both. */
+ if (*(dptr + *matchoff + *matchlen) != '\r' &&
+ *(dptr + *matchoff + *matchlen) != '\n')
+ return -EINVAL;
+
+ IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+ IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+ *matchlen);
+ return 0;
+}
+
+/*
+ * Fill the persistence-engine data of @p with the SIP Call-ID found in
+ * the UDP payload of @skb.
+ *
+ * Returns 0 on success with p->pe_data pointing to a kmemdup()'ed copy
+ * of the Call-ID and p->pe_data_len set to its length.  Returns -EINVAL
+ * if the packet is not UDP, carries no payload or has no usable
+ * Call-ID, the skb_linearize() error if linearizing fails, or -ENOMEM
+ * if the copy cannot be allocated.
+ */
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen, matchoff, matchlen;
+	const char *dptr;
+	int retc;
+
+	ip_vs_fill_iph_skb(p->af, skb, &iph);
+
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+	/* todo: IPv6 fragments:
+	 *       I think this only should be done for the first fragment. /HS
+	 */
+	dataoff = iph.len + sizeof(struct udphdr);
+
+	if (dataoff >= skb->len)
+		return -EINVAL;
+	retc = skb_linearize(skb);
+	if (retc < 0)
+		return retc;
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+
+	/*
+	 * dptr already points at the start of the SIP message and datalen
+	 * is its length, so the header search must start at offset 0.
+	 * Passing dataoff here would skip the first bytes of the payload
+	 * and miss a Call-ID header placed early in the message.
+	 */
+	if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure
+	 */
+	p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+
+	p->pe_data_len = matchlen;
+
+	return 0;
+}
+
+/*
+ * Check whether connection template @ct matches the parameters @p.
+ *
+ * A match requires the same address family, client address, virtual
+ * address, virtual port and protocol, the IP_VS_CONN_F_TEMPLATE flag on
+ * @ct, and byte-identical pe_data of equal length.
+ * Returns true on a match, false otherwise.
+ */
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+ struct ip_vs_conn *ct)
+
+{
+ bool ret = false;
+
+ if (ct->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * d_addr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ p->vaddr, &ct->vaddr) &&
+ ct->vport == p->vport &&
+ ct->flags & IP_VS_CONN_F_TEMPLATE &&
+ ct->protocol == p->protocol &&
+ ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+ !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+ ret = true;
+
+ IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+/*
+ * Hash key for a connection parameter set: jhash over the pe_data bytes,
+ * seeded with @initval.  @inverse is not used.
+ */
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval, bool inverse)
+{
+	const void *key = p->pe_data;
+	u32 key_len = p->pe_data_len;
+
+	return jhash(key, key_len, initval);
+}
+
+/*
+ * Copy the pe_data of @cp into @buf (no terminator is added) and return
+ * the number of bytes copied.
+ */
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	int copied = cp->pe_data_len;
+
+	memcpy(buf, cp->pe_data, copied);
+	return copied;
+}
+
+/* Persistence engine descriptor registered under the name "sip" */
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+ .name = "sip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+ .fill_param = ip_vs_sip_fill_param,
+ .ct_match = ip_vs_sip_ct_match,
+ .hashkey_raw = ip_vs_sip_hashkey_raw,
+ .show_pe_data = ip_vs_sip_show_pe_data,
+};
+
+/* Module init: register the SIP pe and return the registration result */
+static int __init ip_vs_sip_init(void)
+{
+ return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+/*
+ * Module exit: unregister the SIP pe, then wait for an RCU grace period
+ * before returning.
+ */
+static void __exit ip_vs_sip_cleanup(void)
+{
+ unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
new file mode 100644
index 00000000000..939f7fbe9b4
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -0,0 +1,409 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ *	Put an ipvs protocol at the head of its hash chain and run its
+ *	init hook, if it has one.  Always returns 0.
+ */
+static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	struct ip_vs_protocol **bucket =
+		&ip_vs_proto_table[IP_VS_PROTO_HASH(pp->protocol)];
+
+	pp->next = *bucket;
+	*bucket = pp;
+
+	if (pp->init)
+		pp->init(pp);
+
+	return 0;
+}
+
+/*
+ *	Allocate the per-netns data of an ipvs protocol, link it at the head
+ *	of its hash chain and run the protocol's init_netns hook, if any.
+ *
+ *	Returns 0 on success, -ENOMEM if the allocation fails, or the error
+ *	returned by init_netns (the data is unlinked and freed in that case).
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data **bucket;
+	struct ip_vs_proto_data *pd;
+	int err;
+
+	pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
+	if (!pd)
+		return -ENOMEM;
+
+	bucket = &ipvs->proto_data_table[IP_VS_PROTO_HASH(pp->protocol)];
+
+	pd->pp = pp;	/* For speed issues */
+	pd->next = *bucket;
+	*bucket = pd;
+	atomic_set(&pd->appcnt, 0);	/* Init app counter */
+
+	if (!pp->init_netns)
+		return 0;
+
+	err = pp->init_netns(net, pd);
+	if (!err)
+		return 0;
+
+	/* init failed: unlink and free the proto data again */
+	*bucket = pd->next;
+	kfree(pd);
+	return err;
+}
+
+/*
+ *	Unlink an ipvs protocol from its hash chain and run its exit hook,
+ *	if it has one.  Returns 0, or -ESRCH when the protocol is not on
+ *	the chain.
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	struct ip_vs_protocol **link =
+		&ip_vs_proto_table[IP_VS_PROTO_HASH(pp->protocol)];
+
+	/* Find the link that points at pp */
+	while (*link && *link != pp)
+		link = &(*link)->next;
+
+	if (!*link)
+		return -ESRCH;
+
+	*link = pp->next;
+	if (pp->exit)
+		pp->exit(pp);
+	return 0;
+}
+
+/*
+ *	Unlink the per-netns data of an ipvs protocol from its hash chain,
+ *	run the protocol's exit_netns hook, if any, and free the data.
+ *	Returns 0, or -ESRCH when the data is not on the chain.
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data **link =
+		&ipvs->proto_data_table[IP_VS_PROTO_HASH(pd->pp->protocol)];
+
+	/* Find the link that points at pd */
+	while (*link && *link != pd)
+		link = &(*link)->next;
+
+	if (!*link)
+		return -ESRCH;
+
+	*link = pd->next;
+	if (pd->pp->exit_netns)
+		pd->pp->exit_netns(net, pd);
+	kfree(pd);
+	return 0;
+}
+
+/*
+ *	Look up the ip_vs_protocol registered for protocol number @proto.
+ *	Returns NULL when there is none.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+	struct ip_vs_protocol *cur =
+		ip_vs_proto_table[IP_VS_PROTO_HASH(proto)];
+
+	while (cur && cur->protocol != proto)
+		cur = cur->next;
+
+	return cur;
+}
+EXPORT_SYMBOL(ip_vs_proto_get);
+
+/*
+ *	Look up the per-netns protocol data for protocol number @proto in
+ *	@ipvs.  Returns NULL when there is none.
+ */
+static struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+	struct ip_vs_proto_data *cur =
+		ipvs->proto_data_table[IP_VS_PROTO_HASH(proto)];
+
+	while (cur && cur->pp->protocol != proto)
+		cur = cur->next;
+
+	return cur;
+}
+
+/*
+ *	Same lookup as __ipvs_proto_data_get(), starting from a struct net.
+ */
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+	return __ipvs_proto_data_get(net_ipvs(net), proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
+
+/*
+ *	Call the timeout_change hook, where present, of every protocol that
+ *	has data registered in @ipvs, passing @flags through.
+ */
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
+{
+	int bucket = 0;
+
+	while (bucket < IP_VS_PROTO_TAB_SIZE) {
+		struct ip_vs_proto_data *pd = ipvs->proto_data_table[bucket];
+
+		for (; pd; pd = pd->next) {
+			if (!pd->pp->timeout_change)
+				continue;
+			pd->pp->timeout_change(pd, flags);
+		}
+		bucket++;
+	}
+}
+
+
+/*
+ *	Return a kmemdup()'ed copy of the first @size bytes of @table
+ *	(allocated with GFP_KERNEL).
+ */
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+	int *copy = kmemdup(table, size, GFP_KERNEL);
+
+	return copy;
+}
+
+
+/*
+ *	Set the timeout of the state called @name to @to seconds.
+ *
+ *	@names holds the @num state names, in the same order as the entries
+ *	of @table.  Returns 0 on success, -EINVAL if @table or @name is NULL
+ *	or @to is zero, and -ENOENT if no state has that name.
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, const char *const *names,
+			const char *name, int to)
+{
+	int idx = 0;
+
+	if (table == NULL || name == NULL || to == 0)
+		return -EINVAL;
+
+	while (idx < num) {
+		if (strcmp(names[idx], name) == 0) {
+			table[idx] = to * HZ;
+			return 0;
+		}
+		idx++;
+	}
+	return -ENOENT;
+}
+
+
+/*
+ *	Name of @state for protocol @proto.  Protocols that are unknown or
+ *	have no state_name hook yield "NONE" for IPPROTO_IP and "ERR!" for
+ *	anything else.
+ */
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+	struct ip_vs_protocol *handler = ip_vs_proto_get(proto);
+
+	if (handler != NULL && handler->state_name != NULL)
+		return handler->state_name(state);
+
+	if (proto == IPPROTO_IP)
+		return "NONE";
+	return "ERR!";
+}
+
+
+/*
+ * Log a one-line description of the IPv4 packet at @offset in @skb via
+ * pr_debug(), prefixed by @msg and the protocol name.
+ *
+ * Prints "TRUNCATED" when the IP header cannot be read, only the
+ * addresses for non-first fragments, and the addresses plus the first
+ * two 16-bit words after the IP header (printed as ports) otherwise.
+ */
+static void
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[128];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "TRUNCATED");
+ else if (ih->frag_off & htons(IP_OFFSET))
+ sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
+ else {
+ __be16 _ports[2], *pptr;
+
+ /* The ports follow the IP header, whose length is ihl*4 bytes */
+ pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "TRUNCATED %pI4->%pI4",
+ &ih->saddr, &ih->daddr);
+ else
+ sprintf(buf, "%pI4:%u->%pI4:%u",
+ &ih->saddr, ntohs(pptr[0]),
+ &ih->daddr, ntohs(pptr[1]));
+ }
+
+ pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 counterpart of ip_vs_tcpudp_debug_packet_v4().
+ *
+ * Prints "TRUNCATED" when the IPv6 header cannot be read, only the
+ * addresses when the next header is IPPROTO_FRAGMENT, and the addresses
+ * plus ports otherwise.
+ */
+static void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[192];
+ struct ipv6hdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "TRUNCATED");
+ else if (ih->nexthdr == IPPROTO_FRAGMENT)
+ sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr);
+ else {
+ __be16 _ports[2], *pptr;
+
+ /* The ports are read right after the fixed-size IPv6 header */
+ pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "TRUNCATED %pI6c->%pI6c",
+ &ih->saddr, &ih->daddr);
+ else
+ sprintf(buf, "%pI6c:%u->%pI6c:%u",
+ &ih->saddr, ntohs(pptr[0]),
+ &ih->daddr, ntohs(pptr[1]));
+ }
+
+ pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+#endif
+
+
+/*
+ * Debug-print a packet: dispatch to the IPv6 helper when @af is
+ * AF_INET6 (and CONFIG_IP_VS_IPV6 is set), to the IPv4 helper otherwise.
+ */
+void
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+ else
+#endif
+ ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+/*
+ * per network name-space init
+ *
+ * Registers the per-netns data of every protocol compiled in. Returns 0
+ * on success; on the first failure everything registered so far is
+ * removed via ip_vs_protocol_net_cleanup() and the error is returned.
+ */
+int __net_init ip_vs_protocol_net_init(struct net *net)
+{
+ int i, ret;
+ static struct ip_vs_protocol *protos[] = {
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ &ip_vs_protocol_tcp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ &ip_vs_protocol_udp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ &ip_vs_protocol_sctp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ &ip_vs_protocol_ah,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ &ip_vs_protocol_esp,
+#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(protos); i++) {
+ ret = register_ip_vs_proto_netns(net, protos[i]);
+ if (ret < 0)
+ goto cleanup;
+ }
+ return 0;
+
+cleanup:
+ ip_vs_protocol_net_cleanup(net);
+ return ret;
+}
+
+/*
+ * per network name-space cleanup: unregister every protocol data entry
+ * still linked in this netns' hash table.
+ */
+void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	int bucket;
+
+	for (bucket = 0; bucket < IP_VS_PROTO_TAB_SIZE; bucket++) {
+		/* Each unregister unlinks the head, so re-read it each time */
+		for (;;) {
+			struct ip_vs_proto_data *head =
+				ipvs->proto_data_table[bucket];
+
+			if (head == NULL)
+				break;
+			unregister_ip_vs_proto_netns(net, head);
+		}
+	}
+}
+
+/*
+ * Register every protocol compiled in and log their names.
+ *
+ * Each REGISTER_PROTOCOL() appends ", <name>" to 'protocols'; the list
+ * is printed starting at &protocols[2] to skip the leading ", ".
+ * Always returns 0.
+ */
+int __init ip_vs_protocol_init(void)
+{
+ char protocols[64];
+#define REGISTER_PROTOCOL(p) \
+ do { \
+ register_ip_vs_protocol(p); \
+ strcat(protocols, ", "); \
+ strcat(protocols, (p)->name); \
+ } while (0)
+
+ protocols[0] = '\0';
+ /* Keeps &protocols[2] an empty string if nothing gets registered */
+ protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+ pr_info("Registered protocols (%s)\n", &protocols[2]);
+
+ return 0;
+}
+
+
+/*
+ * Unregister every ipvs protocol still linked in the protocol table.
+ */
+void ip_vs_protocol_cleanup(void)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < IP_VS_PROTO_TAB_SIZE; bucket++) {
+		/* Each unregister unlinks the head, so re-read it each time */
+		for (;;) {
+			struct ip_vs_protocol *head = ip_vs_proto_table[bucket];
+
+			if (head == NULL)
+				break;
+			unregister_ip_vs_protocol(head);
+		}
+	}
+}
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 00000000000..5de3dd312c0
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,165 @@
+/*
+ * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
+/*
+ * Fill @p with the lookup key used for AH/ESP packets: a UDP connection
+ * between the packet's addresses with PORT_ISAKMP on both sides.  With
+ * @inverse set, source and destination address are swapped.
+ */
+static void
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+			     const struct ip_vs_iphdr *iph, int inverse,
+			     struct ip_vs_conn_param *p)
+{
+	const union nf_inet_addr *first = &iph->saddr;
+	const union nf_inet_addr *second = &iph->daddr;
+
+	if (unlikely(inverse)) {
+		first = &iph->daddr;
+		second = &iph->saddr;
+	}
+
+	ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+			      first, htons(PORT_ISAKMP),
+			      second, htons(PORT_ISAKMP), p);
+}
+
+/*
+ * Find the connection an incoming AH/ESP packet belongs to, using the
+ * ISAKMP lookup key.  Returns NULL (after a debug message) when no such
+ * connection exists.
+ */
+static struct ip_vs_conn *
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
+		   const struct ip_vs_iphdr *iph,
+		   int inverse)
+{
+	struct ip_vs_conn_param key;
+	struct ip_vs_conn *conn;
+
+	ah_esp_conn_fill_param_proto(skb_net(skb), af, iph, inverse, &key);
+	conn = ip_vs_conn_in_get(&key);
+	if (conn)
+		return conn;
+
+	/*
+	 * We are not sure if the packet is from our
+	 * service, so our conn_schedule hook should return NF_ACCEPT
+	 */
+	IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+		      "%s%s %s->%s\n",
+		      inverse ? "ICMP+" : "",
+		      ip_vs_proto_get(iph->protocol)->name,
+		      IP_VS_DBG_ADDR(af, &iph->saddr),
+		      IP_VS_DBG_ADDR(af, &iph->daddr));
+
+	return NULL;
+}
+
+
+/*
+ * Find the connection an outgoing AH/ESP packet belongs to, using the
+ * ISAKMP lookup key.  Returns NULL (after a debug message) when no such
+ * connection exists.
+ */
+static struct ip_vs_conn *
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+		    const struct ip_vs_iphdr *iph, int inverse)
+{
+	struct ip_vs_conn_param key;
+	struct ip_vs_conn *conn;
+
+	ah_esp_conn_fill_param_proto(skb_net(skb), af, iph, inverse, &key);
+	conn = ip_vs_conn_out_get(&key);
+	if (conn)
+		return conn;
+
+	IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+		      "%s%s %s->%s\n",
+		      inverse ? "ICMP+" : "",
+		      ip_vs_proto_get(iph->protocol)->name,
+		      IP_VS_DBG_ADDR(af, &iph->saddr),
+		      IP_VS_DBG_ADDR(af, &iph->daddr));
+
+	return NULL;
+}
+
+
+/*
+ * conn_schedule hook for AH/ESP: never schedules a connection. Sets
+ * *@verdict to NF_ACCEPT and returns 0; all other arguments are unused.
+ */
+static int
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ /*
+ * AH/ESP is only related traffic. Pass the packet to IP stack.
+ */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+/*
+ * Protocol descriptor for AH. Only connection lookup, scheduling and
+ * packet debugging hooks are provided; all other hooks are NULL.
+ */
+struct ip_vs_protocol ip_vs_protocol_ah = {
+ .name = "AH",
+ .protocol = IPPROTO_AH,
+ .num_states = 1,
+ .dont_defrag = 1,
+ .init = NULL,
+ .exit = NULL,
+ .conn_schedule = ah_esp_conn_schedule,
+ .conn_in_get = ah_esp_conn_in_get,
+ .conn_out_get = ah_esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+/*
+ * Protocol descriptor for ESP. Only connection lookup, scheduling and
+ * packet debugging hooks are provided; all other hooks are NULL.
+ */
+struct ip_vs_protocol ip_vs_protocol_esp = {
+ .name = "ESP",
+ .protocol = IPPROTO_ESP,
+ .num_states = 1,
+ .dont_defrag = 1,
+ .init = NULL,
+ .exit = NULL,
+ .conn_schedule = ah_esp_conn_schedule,
+ .conn_in_get = ah_esp_conn_in_get,
+ .conn_out_get = ah_esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
+#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
new file mode 100644
index 00000000000..2f7ea756404
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -0,0 +1,591 @@
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/sctp/checksum.h>
+#include <net/ip_vs.h>
+
+/*
+ * conn_schedule hook for SCTP.
+ *
+ * Returns 0 with *@verdict set when the packet must not be processed
+ * further (headers unreadable, system overloaded, or scheduling failed),
+ * and 1 otherwise; *@cpp receives the new connection when one was
+ * scheduled.
+ *
+ * A connection is only scheduled when the first chunk is INIT (or
+ * sysctl_sloppy_sctp() is enabled) and a matching virtual service is
+ * found.
+ */
+static int
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs;
+ sctp_chunkhdr_t _schunkh, *sch;
+ sctp_sctphdr_t *sh, _sctph;
+
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /* The first chunk header directly follows the SCTP common header */
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(_schunkh), &_schunkh);
+ if (sch == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ /* The read lock is dropped on every exit path below */
+ rcu_read_lock();
+ if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
+ int ignored;
+
+ if (ip_vs_todrop(ipvs)) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ /* ignored == 0: let ip_vs_leave() decide; negative: drop */
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+/*
+ * Recompute the SCTP checksum of @skb (SCTP header at @sctphoff), store
+ * it in @sctph and mark the skb as CHECKSUM_UNNECESSARY.
+ */
+static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
+			  unsigned int sctphoff)
+{
+	__le32 crc = sctp_compute_cksum(skb, sctphoff);
+
+	sctph->checksum = crc;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
+/*
+ * Source NAT handler for SCTP: rewrite the source port to cp->vport and
+ * refresh the checksum when needed.
+ *
+ * Returns 1 on success (also for IPv6 packets with a fragment offset,
+ * which are left untouched) and 0 when the skb cannot be made writable,
+ * the checksum check fails or the application helper returns 0.
+ */
+static int
+sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ sctp_sctphdr_t *sctph;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
+ }
+
+ /* Computed only now, after the calls above that may change the skb */
+ sctph = (void *) skb_network_header(skb) + sctphoff;
+
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ return 1;
+}
+
+/*
+ * Destination NAT handler for SCTP: rewrite the destination port to
+ * cp->dport and refresh the checksum when needed.
+ *
+ * Returns 1 on success (also for IPv6 packets with a fragment offset,
+ * which are left untouched) and 0 when the skb cannot be made writable,
+ * the checksum check fails or the application helper returns 0.
+ */
+static int
+sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ sctp_sctphdr_t *sctph;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
+ }
+
+ /* Computed only now, after the calls above that may change the skb */
+ sctph = (void *) skb_network_header(skb) + sctphoff;
+
+ /*
+ * Only update csum if we really have to. A CHECKSUM_PARTIAL skb is
+ * left alone when the output device advertises NETIF_F_SCTP_CSUM.
+ */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ return 1;
+}
+
+/*
+ * Verify the SCTP checksum of @skb.
+ *
+ * Returns 1 when the checksum stored in the SCTP header equals the one
+ * computed over the packet, 0 when it differs or the header cannot be
+ * read.
+ */
+static int
+sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int sctphoff;
+ struct sctphdr *sh, _sctph;
+ __le32 cmp, val;
+
+ /* For IPv6 the SCTP header is taken to follow the fixed header */
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ sctphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ sctphoff = ip_hdrlen(skb);
+
+ sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return 0;
+
+ cmp = sh->checksum;
+ val = sctp_compute_cksum(skb, sctphoff);
+
+ if (val != cmp) {
+ /* CRC failure, dump it. */
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Events driving the IPVS SCTP state table. IP_VS_SCTP_EVENT_LAST is
+ * the number of events and sizes the event dimension of sctp_states.
+ */
+enum ipvs_sctp_event_t {
+ IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */
+ IP_VS_SCTP_INIT,
+ IP_VS_SCTP_INIT_ACK,
+ IP_VS_SCTP_COOKIE_ECHO,
+ IP_VS_SCTP_COOKIE_ACK,
+ IP_VS_SCTP_SHUTDOWN,
+ IP_VS_SCTP_SHUTDOWN_ACK,
+ IP_VS_SCTP_SHUTDOWN_COMPLETE,
+ IP_VS_SCTP_ERROR,
+ IP_VS_SCTP_ABORT,
+ IP_VS_SCTP_EVENT_LAST
+};
+
+/*
+ * RFC 2960, 3.2 Chunk Field Descriptions
+ *
+ * Maps an SCTP chunk type (SCTP_CID_*) to an IPVS SCTP event. Chunk
+ * types inside the array bounds that are not listed are zero, which is
+ * IP_VS_SCTP_DATA.
+ */
+static __u8 sctp_events[] = {
+ [SCTP_CID_DATA] = IP_VS_SCTP_DATA,
+ [SCTP_CID_INIT] = IP_VS_SCTP_INIT,
+ [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK,
+ [SCTP_CID_SACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT,
+ [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN,
+ [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK,
+ [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR,
+ [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO,
+ [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK,
+ [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA,
+ [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE,
+};
+
+/* SCTP States:
+ * See RFC 2960, 4. SCTP Association State Diagram
+ *
+ * New states (not in diagram):
+ * - INIT1 state: use shorter timeout for dropped INIT packets
+ * - REJECTED state: use shorter timeout if INIT is rejected with ABORT
+ * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging
+ *
+ * The states are as seen in real server. In the diagram, INIT1, INIT,
+ * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state.
+ *
+ * States as per packets from client (C) and server (S):
+ *
+ * Setup of client connection:
+ * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK
+ *
+ * Setup of server connection:
+ * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK
+ */
+
+/* Two-letter aliases for the IP_VS_SCTP_S_* states; they keep the rows of sctp_states short. */
+#define sNO IP_VS_SCTP_S_NONE
+#define sI1 IP_VS_SCTP_S_INIT1
+#define sIN IP_VS_SCTP_S_INIT
+#define sCS IP_VS_SCTP_S_COOKIE_SENT
+#define sCR IP_VS_SCTP_S_COOKIE_REPLIED
+#define sCW IP_VS_SCTP_S_COOKIE_WAIT
+#define sCO IP_VS_SCTP_S_COOKIE
+#define sCE IP_VS_SCTP_S_COOKIE_ECHOED
+#define sES IP_VS_SCTP_S_ESTABLISHED
+#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT
+#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED
+#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
+#define sRJ IP_VS_SCTP_S_REJECTED
+#define sCL IP_VS_SCTP_S_CLOSED
+
+/*
+ * SCTP state transition table.
+ *
+ * Looked up as sctp_states[direction][event][current state]; the stored
+ * value is the next state. Each row is one event and each column one
+ * current state, in the order given by the column header comments.
+ * NOTE(review): the row labels appear to abbreviate the IP_VS_SCTP_* events
+ * (d=DATA, i=INIT, i_a=INIT_ACK, c_e=COOKIE_ECHO, c_a=COOKIE_ACK,
+ * s=SHUTDOWN, s_a=SHUTDOWN_ACK, s_c=SHUTDOWN_COMPLETE, err=ERROR,
+ * ab=ABORT) - confirm against the event enum.
+ */
+static const __u8 sctp_states
+ [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = {
+ { /* INPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* OUTPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW},
+/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL},
+/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* INPUT-ONLY */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+};
+
+/* (60 + 1) seconds expressed in jiffies; used below for most non-established states */
+#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ)
+
+/*
+ * Timeout table[state]
+ *
+ * Default timeout, in jiffies, for each SCTP connection state. A copy is
+ * created for each netns in __ip_vs_sctp_init(); set_sctp_state() reads
+ * this table directly only when it has no per-netns protocol data.
+ */
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+ [IP_VS_SCTP_S_NONE] = 2 * HZ,
+ [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_LAST] = 2 * HZ,
+};
+
+/*
+ * Printable name for each SCTP connection state, indexed by IP_VS_SCTP_S_*.
+ * Both the strings and the pointer array itself are read-only, matching
+ * the declaration of tcp_state_name_table.
+ */
+static const char *const sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = {
+	[IP_VS_SCTP_S_NONE]		= "NONE",
+	[IP_VS_SCTP_S_INIT1]		= "INIT1",
+	[IP_VS_SCTP_S_INIT]		= "INIT",
+	[IP_VS_SCTP_S_COOKIE_SENT]	= "C-SENT",
+	[IP_VS_SCTP_S_COOKIE_REPLIED]	= "C-REPLIED",
+	[IP_VS_SCTP_S_COOKIE_WAIT]	= "C-WAIT",
+	[IP_VS_SCTP_S_COOKIE]		= "COOKIE",
+	[IP_VS_SCTP_S_COOKIE_ECHOED]	= "C-ECHOED",
+	[IP_VS_SCTP_S_ESTABLISHED]	= "ESTABLISHED",
+	[IP_VS_SCTP_S_SHUTDOWN_SENT]	= "S-SENT",
+	[IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED",
+	[IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT",
+	[IP_VS_SCTP_S_REJECTED]		= "REJECTED",
+	[IP_VS_SCTP_S_CLOSED]		= "CLOSED",
+	[IP_VS_SCTP_S_LAST]		= "BUG!",
+};
+
+
+/*
+ * Return a printable name for an SCTP connection state.
+ *
+ * @state: IP_VS_SCTP_S_* value
+ *
+ * Returns "ERR!" for values outside [0, IP_VS_SCTP_S_LAST), "?" when the
+ * table has no entry for the state, and the table entry otherwise.
+ */
+static const char *sctp_state_name(int state)
+{
+	/* state is a signed int: reject negative indexes as well */
+	if (state < 0 || state >= IP_VS_SCTP_S_LAST)
+		return "ERR!";
+	if (sctp_state_name_table[state])
+		return sctp_state_name_table[state];
+	return "?";
+}
+
+/*
+ * Advance the IPVS connection state for one SCTP packet.
+ *
+ * @pd:        per-netns protocol data; supplies the timeout table
+ * @cp:        connection whose state, timeout and flags are updated
+ * @direction: IP_VS_DIR_* value for the packet
+ * @skb:       the packet; only the first chunk header (and, after a
+ *             COOKIE ECHO / COOKIE ACK, the chunk header that follows it)
+ *             is examined
+ *
+ * Returns without touching @cp if the first chunk header cannot be read.
+ * sctp_state_transition() calls this with cp->lock held.
+ *
+ * NOTE(review): the debug message dereferences @pd before the likely(pd)
+ * test at the end of the function, so a NULL @pd is not actually safe
+ * when that message is evaluated.
+ */
+static inline void
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+		int direction, const struct sk_buff *skb)
+{
+	sctp_chunkhdr_t _sctpch, *sch;
+	unsigned char chunk_type;
+	int event, next_state;
+	int ihl, cofs;
+
+#ifdef CONFIG_IP_VS_IPV6
+	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+	ihl = ip_hdrlen(skb);
+#endif
+
+	/* The first chunk header follows the common SCTP header */
+	cofs = ihl + sizeof(sctp_sctphdr_t);
+	sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
+	if (sch == NULL)
+		return;
+
+	chunk_type = sch->type;
+	/*
+	 * Section 3: Multiple chunks can be bundled into one SCTP packet
+	 * up to the MTU size, except for the INIT, INIT ACK, and
+	 * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with
+	 * any other chunk in a packet.
+	 *
+	 * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control
+	 * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be
+	 * bundled with an ABORT, but they MUST be placed before the ABORT
+	 * in the SCTP packet or they will be ignored by the receiver.
+	 */
+	if ((chunk_type == SCTP_CID_COOKIE_ECHO) ||
+	    (chunk_type == SCTP_CID_COOKIE_ACK)) {
+		int clen = ntohs(sch->length);
+
+		/* Peek at the next chunk: a bundled ABORT takes precedence */
+		if (clen >= sizeof(sctp_chunkhdr_t)) {
+			sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+						 sizeof(_sctpch), &_sctpch);
+			if (sch && sch->type == SCTP_CID_ABORT)
+				chunk_type = sch->type;
+		}
+	}
+
+	/*
+	 * Bound the index by the number of elements, not by the size in
+	 * bytes, so the check stays correct whatever the element type is.
+	 * Chunk types beyond the table are treated as DATA.
+	 */
+	event = (chunk_type < sizeof(sctp_events) / sizeof(sctp_events[0])) ?
+		sctp_events[chunk_type] : IP_VS_SCTP_DATA;
+
+	/* Update direction to INPUT_ONLY if necessary
+	 * or delete NO_OUTPUT flag if output packet detected
+	 */
+	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+		if (direction == IP_VS_DIR_OUTPUT)
+			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+		else
+			direction = IP_VS_DIR_INPUT_ONLY;
+	}
+
+	next_state = sctp_states[direction][event][cp->state];
+
+	if (next_state != cp->state) {
+		struct ip_vs_dest *dest = cp->dest;
+
+		IP_VS_DBG_BUF(8, "%s %s  %s:%d->"
+				"%s:%d state: %s->%s conn->refcnt:%d\n",
+				pd->pp->name,
+				((direction == IP_VS_DIR_OUTPUT) ?
+				 "output " : "input "),
+				IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+				ntohs(cp->dport),
+				IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				ntohs(cp->cport),
+				sctp_state_name(cp->state),
+				sctp_state_name(next_state),
+				atomic_read(&cp->refcnt));
+		if (dest) {
+			/* Keep the destination's active/inactive counters in
+			 * step with whether the connection is ESTABLISHED.
+			 */
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				(next_state != IP_VS_SCTP_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				   (next_state == IP_VS_SCTP_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+	}
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = next_state];
+	else	/* What to do ? */
+		cp->timeout = sctp_timeouts[cp->state = next_state];
+}
+
+/*
+ * state_transition hook for SCTP: runs set_sctp_state() for this packet
+ * while holding cp->lock (taken with spin_lock_bh).
+ */
+static void
+sctp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
+{
+ spin_lock_bh(&cp->lock);
+ set_sctp_state(pd, cp, direction, skb);
+ spin_unlock_bh(&cp->lock);
+}
+
+/*
+ * Map a port (network byte order, used as-is) to a bucket index of the
+ * per-netns SCTP application table: the port is XORed with itself shifted
+ * right by SCTP_APP_TAB_BITS and masked with SCTP_APP_TAB_MASK.
+ */
+static inline __u16 sctp_app_hashkey(__be16 port)
+{
+	const u16 raw = (__force u16)port;
+
+	return (raw ^ (raw >> SCTP_APP_TAB_BITS)) & SCTP_APP_TAB_MASK;
+}
+
+/*
+ * Add an application incarnation to the per-netns SCTP application table.
+ *
+ * Returns 0 on success, or -EEXIST if an incarnation with the same port
+ * is already present in the bucket. On success the protocol's appcnt is
+ * incremented.
+ */
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+	const __be16 port = inc->port;
+	const __u16 bucket = sctp_app_hashkey(port);
+	struct ip_vs_app *cur;
+
+	/* Refuse a second incarnation on the same port */
+	list_for_each_entry(cur, &ipvs->sctp_apps[bucket], p_list) {
+		if (cur->port == port)
+			return -EEXIST;
+	}
+
+	list_add_rcu(&inc->p_list, &ipvs->sctp_apps[bucket]);
+	atomic_inc(&pd->appcnt);
+	return 0;
+}
+
+/*
+ * Undo sctp_register_app(): decrement the SCTP protocol's appcnt and unlink
+ * the incarnation from its application list (RCU-safe removal).
+ */
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+/*
+ * Bind a connection to the application incarnation registered for its
+ * virtual port, if any.
+ *
+ * Only connections using the masquerading (NAT) forwarding method are
+ * bound. Returns 0 when nothing is bound or the incarnation has no
+ * init_conn hook; otherwise returns the result of init_conn().
+ */
+static int sctp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	struct ip_vs_app *inc;
+	int hash;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = sctp_app_hashkey(cp->vport);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
+		if (inc->port != cp->vport)
+			continue;
+
+		/* Could not take a reference: give up without binding */
+		if (unlikely(!ip_vs_app_inc_get(inc)))
+			break;
+		rcu_read_unlock();
+
+		IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+				"%s:%u to app %s on port %u\n",
+				__func__,
+				IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				ntohs(cp->cport),
+				IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+				ntohs(cp->vport),
+				inc->name, ntohs(inc->port));
+		cp->app = inc;
+		if (!inc->init_conn)
+			return 0;
+		return inc->init_conn(inc, cp);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/* ---------------------------------------------
+ * Timeouts are per network namespace (netns) now.
+ * ---------------------------------------------
+ */
+/*
+ * Per-netns setup for SCTP: initialise the application hash table and give
+ * this netns its own timeout table built from the sctp_timeouts defaults.
+ *
+ * Returns 0 on success, -ENOMEM if the timeout table could not be created.
+ */
+static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+
+	pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+						       sizeof(sctp_timeouts));
+	return pd->timeout_table ? 0 : -ENOMEM;
+}
+
+/* Per-netns teardown for SCTP: free the timeout table set up by __ip_vs_sctp_init(). */
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+/*
+ * IPVS protocol descriptor for SCTP. It wires the SCTP handlers defined in
+ * this file into the generic protocol interface; there are no global
+ * init/exit hooks and no timeout_change hook.
+ */
+struct ip_vs_protocol ip_vs_protocol_sctp = {
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
+ .unregister_app = sctp_unregister_app,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
+ .state_transition = sctp_state_transition,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 00000000000..e3a697234a9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,712 @@
+/*
+ * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * tcp_timeouts table has copy per netns in a hash table per
+ * protocol ip_vs_proto_data and is handled by netns
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Decide whether an incoming TCP packet should start a new IPVS connection.
+ *
+ * A service lookup is attempted only for packets without RST that either
+ * carry SYN or arrive while sysctl_sloppy_tcp() is enabled. When a service
+ * matches, ip_vs_schedule() is asked to create the connection in *cpp.
+ *
+ * Returns 0 with *verdict set when the caller should stop processing the
+ * packet (header unreadable, overload drop, or scheduling did not yield a
+ * connection); returns 1 otherwise.
+ */
+static int
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct tcphdr _tcph, *th;
+ struct netns_ipvs *ipvs;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+ rcu_read_lock();
+ if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
+ int ignored;
+
+ if (ip_vs_todrop(ipvs)) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ /* No connection: ignored == 0 hands the packet to ip_vs_leave(), ignored < 0 drops it */
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+
+/*
+ * Incrementally update a complete TCP checksum after only an address and a
+ * port were rewritten: the port difference is folded in first, then the
+ * address difference (16 bytes for IPv6, 4 bytes for IPv4).
+ */
+static inline void
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldport, __be16 newport)
+{
+	__wsum sum = ip_vs_check_diff2(oldport, newport,
+				       ~csum_unfold(tcph->check));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		sum = ip_vs_check_diff16(oldip->ip6, newip->ip6, sum);
+		tcph->check = csum_fold(sum);
+		return;
+	}
+#endif
+	sum = ip_vs_check_diff4(oldip->ip, newip->ip, sum);
+	tcph->check = csum_fold(sum);
+}
+
+
+/*
+ * Incrementally update the checksum field of a CHECKSUM_PARTIAL packet
+ * after the address and the TCP length changed. Unlike the fast update,
+ * the stored value is used uncomplemented and the folded result is
+ * complemented before being written back.
+ */
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+			const union nf_inet_addr *oldip,
+			const union nf_inet_addr *newip,
+			__be16 oldlen, __be16 newlen)
+{
+	__wsum sum = ip_vs_check_diff2(oldlen, newlen,
+				       csum_unfold(tcph->check));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		sum = ip_vs_check_diff16(oldip->ip6, newip->ip6, sum);
+		tcph->check = ~csum_fold(sum);
+		return;
+	}
+#endif
+	sum = ip_vs_check_diff4(oldip->ip, newip->ip, sum);
+	tcph->check = ~csum_fold(sum);
+}
+
+
+/*
+ * Source-NAT handler for TCP: rewrites the TCP source port to cp->vport and
+ * brings the TCP checksum up to date.
+ *
+ * Returns 1 on success (IPv6 packets with a fragment offset are returned
+ * untouched with 1); returns 0 if the skb cannot be made writable, the
+ * checksum check fails, or the application helper returns 0.
+ */
+static int
+tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - tcphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
+ }
+
+ /* Take the header pointer only now, after the helper may have changed the skb */
+ tcph = (void *)skb_network_header(skb) + tcphoff;
+ tcph->source = cp->vport;
+
+ /* Adjust TCP checksums */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ htons(oldlen),
+ htons(skb->len - tcphoff));
+ } else if (!payload_csum) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+ &cp->caddr.in6,
+ skb->len - tcphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+ cp->caddr.ip,
+ skb->len - tcphoff,
+ cp->protocol,
+ skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, tcph->check,
+ (char*)&(tcph->check) - (char*)tcph);
+ }
+ return 1;
+}
+
+
+/*
+ * Destination-NAT handler for TCP: rewrites the TCP destination port to
+ * cp->dport and brings the TCP checksum up to date.
+ *
+ * Returns 1 on success (IPv6 packets with a fragment offset are returned
+ * untouched with 1); returns 0 if the skb cannot be made writable, the
+ * checksum check fails, or the application helper returns 0.
+ */
+static int
+tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - tcphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn and iph ack_seq stuff
+ */
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
+ }
+
+ /* Take the header pointer only now, after the helper may have changed the skb */
+ tcph = (void *)skb_network_header(skb) + tcphoff;
+ tcph->dest = cp->dport;
+
+ /*
+ * Adjust TCP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+ htons(oldlen),
+ htons(skb->len - tcphoff));
+ } else if (!payload_csum) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+ &cp->daddr.in6,
+ skb->len - tcphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+ cp->daddr.ip,
+ skb->len - tcphoff,
+ cp->protocol,
+ skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+/*
+ * Verify the TCP checksum of a packet.
+ *
+ * For CHECKSUM_NONE the checksum over the TCP segment is computed first and
+ * then verified like CHECKSUM_COMPLETE; any other ip_summed value is
+ * accepted without verification.
+ *
+ * Returns 1 if the checksum is good or need not be checked, 0 on failure.
+ * The IPv6 transport offset is taken as sizeof(struct ipv6hdr).
+ */
+static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ tcphoff = ip_hdrlen(skb);
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+ /* fall through: verify the checksum that was just computed */
+ case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - tcphoff,
+ ipv6_hdr(skb)->nexthdr,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ } else
+#endif
+ if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - tcphoff,
+ ip_hdr(skb)->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* No need to checksum. */
+ break;
+ }
+
+ return 1;
+}
+
+
+/*
+ * Row offsets into the TCP state tables: each direction owns four
+ * consecutive rows (one per value returned by tcp_state_idx()).
+ */
+#define TCP_DIR_INPUT 0
+#define TCP_DIR_OUTPUT 4
+#define TCP_DIR_INPUT_ONLY 8
+
+/* Maps an IP_VS_DIR_* direction to its row offset in the state tables */
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
+ [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
+ [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
+ [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ * Timeout table[state]
+ *
+ * Default timeout, in jiffies, for each TCP connection state. A copy is
+ * created for each netns in __ip_vs_tcp_init(); this table is read directly
+ * only when no per-netns protocol data is available.
+ */
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 120*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+/* Printable name for each TCP connection state, indexed by IP_VS_TCP_S_* */
+static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = "NONE",
+ [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
+ [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
+ [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
+ [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
+ [IP_VS_TCP_S_CLOSE] = "CLOSE",
+ [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
+ [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
+ [IP_VS_TCP_S_LISTEN] = "LISTEN",
+ [IP_VS_TCP_S_SYNACK] = "SYNACK",
+ [IP_VS_TCP_S_LAST] = "BUG!",
+};
+
+/* Two-letter aliases for the IP_VS_TCP_S_* states; they keep the state table rows short. */
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+/* One row of a TCP state table: the next state for every current state */
+struct tcp_states_t {
+ int next_state[IP_VS_TCP_S_LAST];
+};
+
+/*
+ * Return a printable name for a TCP connection state.
+ *
+ * @state: IP_VS_TCP_S_* value
+ *
+ * Returns "ERR!" for values outside [0, IP_VS_TCP_S_LAST), "?" when the
+ * table has no entry for the state, and the table entry otherwise.
+ */
+static const char * tcp_state_name(int state)
+{
+	/* state is a signed int: reject negative indexes as well */
+	if (state < 0 || state >= IP_VS_TCP_S_LAST)
+		return "ERR!";
+	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
+/*
+ * Default TCP state transition table.
+ *
+ * set_tcp_state() selects a row with (direction offset + flag index), where
+ * the flag index comes from tcp_state_idx() (syn=0, fin=1, ack=2, rst=3),
+ * and then indexes next_state[] with the current state.
+ */
+static struct tcp_states_t tcp_states [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+/*
+ * Alternative TCP state transition table with the same layout as
+ * tcp_states. tcp_timeout_change() installs it when bit 0 of its flags
+ * argument (secure_tcp) is set.
+ */
+static struct tcp_states_t tcp_states_dos [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+/*
+ * timeout_change hook for TCP: pick the state table for this netns.
+ * Bit 0 of @flags (secure_tcp) selects tcp_states_dos; otherwise the
+ * default tcp_states table is used.
+ */
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
+{
+	/*
+	 * FIXME: change secure_tcp to independent sysctl var
+	 * or make it per-service or per-app because it is valid
+	 * for most if not for all of the applications. Something
+	 * like "capabilities" (flags) for each object.
+	 */
+	if (flags & 1)	/* secure_tcp */
+		pd->tcp_state_table = tcp_states_dos;
+	else
+		pd->tcp_state_table = tcp_states;
+}
+
+/*
+ * Classify a TCP header by its flags for the state table lookup.
+ * Precedence is RST, SYN, FIN, ACK; the result is the row index within a
+ * direction (syn=0, fin=1, ack=2, rst=3), or -1 if none of them is set.
+ */
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+	int idx = -1;
+
+	if (th->rst)
+		idx = 3;
+	else if (th->syn)
+		idx = 0;
+	else if (th->fin)
+		idx = 1;
+	else if (th->ack)
+		idx = 2;
+
+	return idx;
+}
+
+/*
+ * Advance the IPVS connection state for one TCP header.
+ *
+ * The next state is read from pd->tcp_state_table using the direction
+ * offset, the flag index from tcp_state_idx() and the current state. If no
+ * relevant flag is set, the new state defaults to IP_VS_TCP_S_CLOSE.
+ * On a state change the destination's active/inactive connection counters
+ * and IP_VS_CONN_F_INACTIVE are kept in step with ESTABLISHED. Finally the
+ * connection's state and timeout are updated.
+ *
+ * tcp_state_transition() calls this with cp->lock held.
+ *
+ * NOTE(review): pd is dereferenced for the table lookup (and in the debug
+ * message) before the likely(pd) test at the end, so a NULL pd is not
+ * actually handled here.
+ */
+static inline void
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+ int direction, struct tcphdr *th)
+{
+ int state_idx;
+ int new_state = IP_VS_TCP_S_CLOSE;
+ int state_off = tcp_state_off[direction];
+
+ /*
+ * Update state offset to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (state_off == TCP_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ state_off = TCP_DIR_INPUT_ONLY;
+ }
+
+ if ((state_idx = tcp_state_idx(th)) < 0) {
+ IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+ goto tcp_state_out;
+ }
+
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+ tcp_state_out:
+ if (new_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+ "%s:%d state: %s->%s conn->refcnt:%d\n",
+ pd->pp->name,
+ ((state_off == TCP_DIR_OUTPUT) ?
+ "output " : "input "),
+ th->syn ? 'S' : '.',
+ th->fin ? 'F' : '.',
+ th->ack ? 'A' : '.',
+ th->rst ? 'R' : '.',
+ IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+ ntohs(cp->dport),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ tcp_state_name(cp->state),
+ tcp_state_name(new_state),
+ atomic_read(&cp->refcnt));
+
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
+}
+
+/*
+ * Handle state transitions
+ *
+ * state_transition hook for TCP: reads the TCP header that follows the IP
+ * header and runs set_tcp_state() on it while holding cp->lock. Does
+ * nothing if the TCP header cannot be read. For IPv6 the TCP header
+ * offset is taken as sizeof(struct ipv6hdr).
+ */
+static void
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ struct tcphdr _tcph, *th;
+
+#ifdef CONFIG_IP_VS_IPV6
+ int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+ int ihl = ip_hdrlen(skb);
+#endif
+
+ th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return;
+
+ spin_lock_bh(&cp->lock);
+ set_tcp_state(pd, cp, direction, th);
+ spin_unlock_bh(&cp->lock);
+}
+
+/*
+ * Map a port (network byte order, used as-is) to a bucket index of the
+ * per-netns TCP application table: the port is XORed with itself shifted
+ * right by TCP_APP_TAB_BITS and masked with TCP_APP_TAB_MASK.
+ */
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+	const u16 raw = (__force u16)port;
+
+	return (raw ^ (raw >> TCP_APP_TAB_BITS)) & TCP_APP_TAB_MASK;
+}
+
+
+/*
+ * Add an application incarnation to the per-netns TCP application table.
+ *
+ * Returns 0 on success, or -EEXIST if an incarnation with the same port
+ * is already present in the bucket. On success the protocol's appcnt is
+ * incremented.
+ */
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+	const __be16 port = inc->port;
+	const __u16 bucket = tcp_app_hashkey(port);
+	struct ip_vs_app *cur;
+
+	/* Refuse a second incarnation on the same port */
+	list_for_each_entry(cur, &ipvs->tcp_apps[bucket], p_list) {
+		if (cur->port == port)
+			return -EEXIST;
+	}
+
+	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[bucket]);
+	atomic_inc(&pd->appcnt);
+	return 0;
+}
+
+
+/*
+ * Undo tcp_register_app(): decrement the TCP protocol's appcnt and unlink
+ * the incarnation from its application list (RCU-safe removal).
+ */
+static void
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+/*
+ * Bind a connection to the application incarnation registered for its
+ * virtual port, if any.
+ *
+ * Only connections using the masquerading (NAT) forwarding method are
+ * bound. Returns 0 when nothing is bound or the incarnation has no
+ * init_conn hook; otherwise returns the result of init_conn().
+ */
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	struct ip_vs_app *inc;
+	int hash;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = tcp_app_hashkey(cp->vport);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
+		if (inc->port != cp->vport)
+			continue;
+
+		/* Could not take a reference: give up without binding */
+		if (unlikely(!ip_vs_app_inc_get(inc)))
+			break;
+		rcu_read_unlock();
+
+		IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+			      "%s:%u to app %s on port %u\n",
+			      __func__,
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+			      ntohs(cp->cport),
+			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+			      ntohs(cp->vport),
+			      inc->name, ntohs(inc->port));
+
+		cp->app = inc;
+		if (!inc->init_conn)
+			return 0;
+		return inc->init_conn(inc, cp);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+
+/*
+ * Put a connection into the LISTEN state and give it the LISTEN timeout
+ * (ip_vs_conn_put will setup timer). The timeout comes from the per-netns
+ * table when protocol data exists, otherwise from the tcp_timeouts
+ * defaults. The update is done under cp->lock.
+ */
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+{
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+	spin_lock_bh(&cp->lock);
+	cp->state = IP_VS_TCP_S_LISTEN;
+	if (pd)
+		cp->timeout = pd->timeout_table[IP_VS_TCP_S_LISTEN];
+	else
+		cp->timeout = tcp_timeouts[IP_VS_TCP_S_LISTEN];
+	spin_unlock_bh(&cp->lock);
+}
+
+/* ---------------------------------------------
+ * Timeouts are per network namespace (netns) now.
+ * ---------------------------------------------
+ */
+/*
+ * Per-netns setup for TCP: initialise the application hash table, give this
+ * netns its own timeout table built from the tcp_timeouts defaults, and
+ * select the default state table.
+ *
+ * Returns 0 on success, -ENOMEM if the timeout table could not be created
+ * (the state table is left unset in that case).
+ */
+static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+
+	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+						       sizeof(tcp_timeouts));
+	if (pd->timeout_table) {
+		pd->tcp_state_table = tcp_states;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+/* Per-netns teardown for TCP: free the timeout table set up by __ip_vs_tcp_init(). */
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+
+/*
+ * IPVS protocol descriptor for TCP. It wires the TCP handlers defined in
+ * this file into the generic protocol interface; there are no global
+ * init/exit hooks, and timeout_change switches between the two state tables.
+ */
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+ .name = "TCP",
+ .protocol = IPPROTO_TCP,
+ .num_states = IP_VS_TCP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
+ .register_app = tcp_register_app,
+ .unregister_app = tcp_unregister_app,
+ .conn_schedule = tcp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = tcp_snat_handler,
+ .dnat_handler = tcp_dnat_handler,
+ .csum_check = tcp_csum_check,
+ .state_name = tcp_state_name,
+ .state_transition = tcp_state_transition,
+ .app_conn_bind = tcp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = tcp_timeout_change,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 00000000000..b62a3c0ff9b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,499 @@
+/*
+ * ip_vs_proto_udp.c: UDP load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
+
+#include <net/ip_vs.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+
+/*
+ *	Try to schedule a new connection for a UDP packet.
+ *
+ *	Returns 0 with *verdict filled in when the caller has to stop
+ *	processing the packet here, and 1 otherwise (*cpp then holds the
+ *	connection if a matching virtual service created one).
+ */
+static int
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		  int *verdict, struct ip_vs_conn **cpp,
+		  struct ip_vs_iphdr *iph)
+{
+	struct udphdr hdr_buf, *udph;
+	struct ip_vs_service *svc;
+	struct net *net;
+	int ignored;
+
+	/* IPv6 fragments, only first fragment will hit this */
+	udph = skb_header_pointer(skb, iph->len, sizeof(hdr_buf), &hdr_buf);
+	if (!udph) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	net = skb_net(skb);
+	rcu_read_lock();
+	svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+				 &iph->daddr, udph->dest);
+	if (!svc)
+		goto out_accept;
+
+	if (ip_vs_todrop(net_ipvs(net))) {
+		/*
+		 * It seems that we are very loaded.
+		 * We have to drop this packet :(
+		 */
+		rcu_read_unlock();
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	/*
+	 * Let the virtual server select a real server for the
+	 * incoming connection, and create a connection entry.
+	 */
+	*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+	if (*cpp == NULL && ignored <= 0) {
+		/* ignored == 0: let ip_vs_leave() decide, < 0: drop */
+		*verdict = ignored ? NF_DROP : ip_vs_leave(svc, skb, pd, iph);
+		rcu_read_unlock();
+		return 0;
+	}
+
+out_accept:
+	rcu_read_unlock();
+	/* NF_ACCEPT */
+	return 1;
+}
+
+
+/*
+ *	Incrementally fix a complete UDP checksum after one port and one
+ *	address have been replaced.  A result of 0 is stored as
+ *	CSUM_MANGLED_0.
+ */
+static inline void
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldport, __be16 newport)
+{
+	/* port change first, the address change is folded in below */
+	__wsum sum = ip_vs_check_diff2(oldport, newport,
+				       ~csum_unfold(uhdr->check));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		sum = ip_vs_check_diff16(oldip->ip6, newip->ip6, sum);
+	else
+#endif
+		sum = ip_vs_check_diff4(oldip->ip, newip->ip, sum);
+
+	uhdr->check = csum_fold(sum);
+	if (!uhdr->check)
+		uhdr->check = CSUM_MANGLED_0;
+}
+
+/*
+ *	Fix the checksum field of a CHECKSUM_PARTIAL packet after an
+ *	address and the UDP length have changed.
+ */
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+			const union nf_inet_addr *oldip,
+			const union nf_inet_addr *newip,
+			__be16 oldlen, __be16 newlen)
+{
+	/* length change first, the address change is folded in below */
+	__wsum sum = ip_vs_check_diff2(oldlen, newlen,
+				       csum_unfold(uhdr->check));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		sum = ip_vs_check_diff16(oldip->ip6, newip->ip6, sum);
+	else
+#endif
+		sum = ip_vs_check_diff4(oldip->ip, newip->ip, sum);
+
+	uhdr->check = ~csum_fold(sum);
+}
+
+/*
+ * SNAT handler: set the UDP source port to cp->vport and fix the UDP
+ * checksum accordingly.
+ * Returns 1 on success (IPv6 packets with a fragment offset are passed
+ * through untouched) and 0 if the skb cannot be made writable, the
+ * checksum check fails or ip_vs_app_pkt_out() returns 0.
+ */
+static int
+udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1; /* presumably a non-first fragment: nothing to mangle */
+#endif
+ oldlen = skb->len - udphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Call application helper if needed
+ */
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
+ }
+
+ udph = (void *)skb_network_header(skb) + udphoff;
+ udph->source = cp->vport;
+
+ /*
+ * Adjust UDP checksums
+ *
+ * CHECKSUM_PARTIAL: patch the checksum for the address and
+ * length change only.
+ * Payload not flagged for recalculation and a checksum present:
+ * incremental update for the address and port change.
+ * Otherwise: recompute the checksum over the whole UDP packet.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ htons(oldlen),
+ htons(skb->len - udphoff));
+ } else if (!payload_csum && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+ &cp->caddr.in6,
+ skb->len - udphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+ cp->caddr.ip,
+ skb->len - udphoff,
+ cp->protocol,
+ skb->csum);
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0; /* a stored 0 would mean "no checksum", see the check != 0 test above */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, udph->check,
+ (char*)&(udph->check) - (char*)udph);
+ }
+ return 1;
+}
+
+/*
+ * DNAT handler: set the UDP destination port to cp->dport and fix the
+ * UDP checksum accordingly.
+ * Returns 1 on success (IPv6 packets with a fragment offset are passed
+ * through untouched) and 0 if the skb cannot be made writable, the
+ * checksum check fails or ip_vs_app_pkt_in() returns 0.
+ */
+static int
+udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1; /* presumably a non-first fragment: nothing to mangle */
+#endif
+ oldlen = skb->len - udphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn
+ */
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
+ }
+
+ udph = (void *)skb_network_header(skb) + udphoff;
+ udph->dest = cp->dport;
+
+ /*
+ * Adjust UDP checksums
+ *
+ * CHECKSUM_PARTIAL: patch the checksum for the address and
+ * length change only.
+ * Payload not flagged for recalculation and a checksum present:
+ * incremental update for the address and port change.
+ * Otherwise: recompute the checksum over the whole UDP packet.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+ htons(oldlen),
+ htons(skb->len - udphoff));
+ } else if (!payload_csum && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ udph->check = csum_ipv6_magic(&cp->caddr.in6,
+ &cp->daddr.in6,
+ skb->len - udphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ udph->check = csum_tcpudp_magic(cp->caddr.ip,
+ cp->daddr.ip,
+ skb->len - udphoff,
+ cp->protocol,
+ skb->csum);
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0; /* a stored 0 would mean "no checksum", see the check != 0 test above */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+/*
+ * Verify the UDP checksum of skb.
+ * Returns 1 if the packet carries no checksum (uh->check == 0), if
+ * skb->ip_summed needs no verification, or if the checksum is correct;
+ * 0 if the UDP header cannot be read or the checksum is wrong.
+ */
+static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ struct udphdr _udph, *uh;
+ unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ udphoff = sizeof(struct ipv6hdr); /* NOTE(review): fixed header size, extension headers are not skipped */
+ else
+#endif
+ udphoff = ip_hdrlen(skb);
+
+ uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ return 0;
+
+ if (uh->check != 0) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, udphoff,
+ skb->len - udphoff, 0); /* fall through: verify the sum just computed */
+ case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - udphoff,
+ ipv6_hdr(skb)->nexthdr,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ } else
+#endif
+ if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - udphoff,
+ ip_hdr(skb)->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* No need to checksum. */
+ break;
+ }
+ }
+ return 1;
+}
+
+/* Fold a network-order port into an index of the UDP application hash */
+static inline __u16 udp_app_hashkey(__be16 port)
+{
+	u16 raw = (__force u16)port;
+
+	return ((raw >> UDP_APP_TAB_BITS) ^ raw) & UDP_APP_TAB_MASK;
+}
+
+
+/*
+ *	Add an application incarnation to the per-netns UDP application
+ *	hash and bump the protocol's app counter.
+ *	Returns 0, or -EEXIST if the bucket already has an entry for
+ *	inc->port.
+ */
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+	__be16 port = inc->port;
+	__u16 hash = udp_app_hashkey(port);
+	struct ip_vs_app *cur;
+
+	list_for_each_entry(cur, &ipvs->udp_apps[hash], p_list) {
+		if (cur->port == port)
+			return -EEXIST;
+	}
+
+	list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
+	atomic_inc(&pd->appcnt);
+
+	return 0;
+}
+
+/* Counterpart of udp_register_app(): drop the app count and unlink inc */
+static void
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+/*
+ *	Bind a NAT connection to the application incarnation found for its
+ *	virtual port, if any.
+ *	Returns what the incarnation's init_conn hook returns, or 0 when
+ *	nothing was bound or the incarnation has no such hook.
+ */
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	struct ip_vs_app *inc;
+	int hash;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = udp_app_hashkey(cp->vport);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
+		if (inc->port != cp->vport)
+			continue;
+
+		/* Right port, but give up if ip_vs_app_inc_get() fails */
+		if (unlikely(!ip_vs_app_inc_get(inc)))
+			break;
+		rcu_read_unlock();
+
+		IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+			      "%s:%u to app %s on port %u\n",
+			      __func__,
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+			      ntohs(cp->cport),
+			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+			      ntohs(cp->vport),
+			      inc->name, ntohs(inc->port));
+
+		cp->app = inc;
+		return inc->init_conn ? inc->init_conn(inc, cp) : 0;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/* UDP timeouts, indexed by state; __udp_init() builds the per-netns table from it */
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = 5*60*HZ, /* 5 minutes */
+ [IP_VS_UDP_S_LAST] = 2*HZ, /* 2 seconds */
+};
+
+static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = "UDP",
+ [IP_VS_UDP_S_LAST] = "BUG!", /* udp_state_name() never hands this slot out */
+};
+
+/*
+ *	Map a UDP connection state to its printable name.
+ *	Returns "ERR!" for a state outside [0, IP_VS_UDP_S_LAST) and "?"
+ *	for a state in range that has no entry in udp_state_name_table.
+ */
+static const char * udp_state_name(int state)
+{
+	const char *name;
+
+	/* Negative states are rejected too: they would index before the table */
+	if (state < 0 || state >= IP_VS_UDP_S_LAST)
+		return "ERR!";
+
+	name = udp_state_name_table[state];
+	return name ? name : "?";
+}
+
+/*
+ *	UDP has a single state: every packet just reloads the NORMAL
+ *	timeout from the per-netns table.  Without per-netns data the
+ *	connection is left alone and an error is logged.
+ */
+static void
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_proto_data *pd)
+{
+	if (likely(pd)) {
+		cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+		return;
+	}
+
+	pr_err("UDP no ns data\n");
+}
+
+/*
+ *	Per-netns init: reset the UDP application hash and give this
+ *	namespace its own timeout table built from udp_timeouts[].
+ *	Returns 0, or -ENOMEM if no table could be created.
+ */
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+
+	pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+						       sizeof(udp_timeouts));
+	return pd->timeout_table ? 0 : -ENOMEM;
+}
+
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table); /* per-netns table created by __udp_init() */
+}
+
+
+/* UDP protocol descriptor: the callbacks IPVS uses for IPPROTO_UDP */
+struct ip_vs_protocol ip_vs_protocol_udp = {
+	/* identity */
+	.name =			"UDP",
+	.protocol =		IPPROTO_UDP,
+	.num_states =		IP_VS_UDP_S_LAST,
+	.dont_defrag =		0,
+
+	/* no global hooks, only per network namespace ones */
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__udp_init,
+	.exit_netns =		__udp_exit,
+
+	/* application helpers */
+	.register_app =		udp_register_app,
+	.unregister_app =	udp_unregister_app,
+	.app_conn_bind =	udp_app_conn_bind,
+
+	/* connection scheduling and lookup */
+	.conn_schedule =	udp_conn_schedule,
+	.conn_in_get =		ip_vs_conn_in_get_proto,
+	.conn_out_get =		ip_vs_conn_out_get_proto,
+
+	/* packet mangling */
+	.snat_handler =		udp_snat_handler,
+	.dnat_handler =		udp_dnat_handler,
+	.csum_check =		udp_csum_check,
+
+	/* connection state and timeouts */
+	.state_name =		udp_state_name,
+	.state_transition =	udp_state_transition,
+	.timeout_change =	NULL,
+
+	/* debugging */
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
new file mode 100644
index 00000000000..176b87c35e3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,130 @@
+/*
+ * IPVS: Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_rr_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/* Start the round-robin position at the head of the destination list */
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+/*
+ *	A destination was removed from the service: if the round-robin
+ *	position points at it, move the position to the previous entry.
+ *	Always returns 0.
+ */
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+	struct list_head *pos;
+
+	spin_lock_bh(&svc->sched_lock);
+	pos = (struct list_head *) svc->sched_data;
+	if (pos == &dest->n_list) {
+		/* dest is already unlinked, so pos->prev is not valid but
+		 * pos->next is valid, use it to reach previous entry.
+		 */
+		svc->sched_data = pos->next->prev;
+	}
+	spin_unlock_bh(&svc->sched_lock);
+
+	return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ *
+ * Resume the walk of svc->destinations right after the entry remembered
+ * in svc->sched_data and return the first destination that is not
+ * overloaded and has a positive weight; that destination becomes the
+ * new remembered position.
+ * Returns NULL, after reporting through ip_vs_scheduler_err(), if no
+ * such destination is found.
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ do {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop; /* back at the starting entry: nothing usable */
+ }
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ out:
+ svc->sched_data = &dest->n_list; /* the next call continues after this dest */
+ spin_unlock_bh(&svc->sched_lock);
+ IP_VS_DBG_BUF(6, "RR: server %s:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+/* Scheduler descriptor that ip_vs_rr_init() hands to register_ip_vs_scheduler() */
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+ .name = "rr", /* name */
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), /* self-linked, i.e. not on any list yet */
+ .init_service = ip_vs_rr_init_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
+ .schedule = ip_vs_rr_schedule,
+};
+
+/* Module init: register the "rr" scheduler, passing the result through */
+static int __init ip_vs_rr_init(void)
+{
+	int err;
+
+	err = register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+	return err;
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); /* take "rr" off the scheduler list */
+ synchronize_rcu(); /* wait for an RCU grace period before the module goes away */
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
new file mode 100644
index 00000000000..4dbcda6258b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -0,0 +1,255 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(ip_vs_scheduler_err);
+/*
+ * IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* mutex protecting the ip_vs_schedulers list */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
+
+
+/*
+ *  Bind a service with a scheduler
+ *
+ *  Runs the scheduler's optional init_service hook and, if that
+ *  succeeds, publishes the scheduler in svc->scheduler.
+ *  Returns 0, or the error returned by init_service.
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+			 struct ip_vs_scheduler *scheduler)
+{
+	if (scheduler->init_service) {
+		int err = scheduler->init_service(svc);
+
+		if (err) {
+			pr_err("%s(): init error\n", __func__);
+			return err;
+		}
+	}
+
+	rcu_assign_pointer(svc->scheduler, scheduler);
+	return 0;
+}
+
+
+/*
+ *  Unbind a service with its scheduler
+ *
+ *  Runs the optional done_service hook of 'sched', but only if the
+ *  service has a scheduler installed at all.
+ */
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+			    struct ip_vs_scheduler *sched)
+{
+	/* This check proves that old 'sched' was installed */
+	if (rcu_dereference_protected(svc->scheduler, 1) &&
+	    sched->done_service)
+		sched->done_service(svc);
+	/* svc->scheduler can not be set to NULL */
+}
+
+
+/*
+ *  Get scheduler in the scheduler list by name
+ *
+ *  On a hit the reference taken on the scheduler's module is kept.
+ *  Returns NULL if no registered scheduler has that name.
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched;
+	struct ip_vs_scheduler *found = NULL;
+
+	IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
+
+	mutex_lock(&ip_vs_sched_mutex);
+	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+		/* Test and get the modules atomically */
+		if (sched->module && !try_module_get(sched->module))
+			continue;	/* This scheduler is just deleted */
+
+		if (!strcmp(sched_name, sched->name)) {
+			/* HIT */
+			found = sched;
+			break;
+		}
+
+		/* Not the one we look for: give the module reference back */
+		if (sched->module)
+			module_put(sched->module);
+	}
+	mutex_unlock(&ip_vs_sched_mutex);
+
+	return found;
+}
+
+
+/*
+ *  Lookup scheduler and try to load it if it doesn't exist
+ *
+ *  Returns the scheduler, or NULL if it is still unknown after the
+ *  module request.
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched = ip_vs_sched_getbyname(sched_name);
+
+	if (sched)
+		return sched;
+
+	/* Not found: ask for the "ip_vs_<name>" module and search again */
+	request_module("ip_vs_%s", sched_name);
+	return ip_vs_sched_getbyname(sched_name);
+}
+
+/* Give back the module reference of a scheduler; NULL is accepted */
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+	if (!scheduler)
+		return;
+
+	if (scheduler->module)
+		module_put(scheduler->module);
+}
+
+/*
+ * Common error output helper for schedulers
+ *
+ * Logs, through IP_VS_ERR_RL, the name of the scheduler bound to svc,
+ * the identity of the service (its fwmark if set, otherwise protocol,
+ * address and port) and msg.
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
+ if (svc->fwmark) {
+ IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+ sched->name, svc->fwmark, svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+ } else if (svc->af == AF_INET6) {
+ IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+ } else {
+ IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.ip, ntohs(svc->port), msg);
+ }
+}
+
+/*
+ *  Register a scheduler in the scheduler list
+ *
+ *  Returns 0 on success and -EINVAL if the argument or its name is
+ *  NULL, if the scheduler is already linked into a list, or if a
+ *  scheduler with the same name is registered already.
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	struct ip_vs_scheduler *cur;
+
+	if (scheduler == NULL) {
+		pr_err("%s(): NULL arg\n", __func__);
+		return -EINVAL;
+	}
+	if (scheduler->name == NULL) {
+		pr_err("%s(): NULL scheduler_name\n", __func__);
+		return -EINVAL;
+	}
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	mutex_lock(&ip_vs_sched_mutex);
+
+	if (!list_empty(&scheduler->n_list)) {
+		mutex_unlock(&ip_vs_sched_mutex);
+		ip_vs_use_count_dec();
+		pr_err("%s(): [%s] scheduler already linked\n",
+		       __func__, scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *  Make sure that the scheduler with this name doesn't exist
+	 *  in the scheduler list.
+	 */
+	list_for_each_entry(cur, &ip_vs_schedulers, n_list) {
+		if (strcmp(scheduler->name, cur->name) != 0)
+			continue;
+
+		mutex_unlock(&ip_vs_sched_mutex);
+		ip_vs_use_count_dec();
+		pr_err("%s(): [%s] scheduler already existed "
+		       "in the system\n", __func__, scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Add it into the d-linked scheduler list
+	 */
+	list_add(&scheduler->n_list, &ip_vs_schedulers);
+	mutex_unlock(&ip_vs_sched_mutex);
+
+	pr_info("[%s] scheduler registered.\n", scheduler->name);
+
+	return 0;
+}
+
+
+/*
+ *  Unregister a scheduler from the scheduler list
+ *
+ *  Returns 0 on success and -EINVAL if the argument is NULL or the
+ *  scheduler is not linked into a list.
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	int linked;
+
+	if (scheduler == NULL) {
+		pr_err("%s(): NULL arg\n", __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&ip_vs_sched_mutex);
+	linked = !list_empty(&scheduler->n_list);
+	if (linked) {
+		/*
+		 *	Remove it from the d-linked scheduler list
+		 */
+		list_del(&scheduler->n_list);
+	}
+	mutex_unlock(&ip_vs_sched_mutex);
+
+	if (!linked) {
+		pr_err("%s(): [%s] scheduler is not in the list. failed\n",
+		       __func__, scheduler->name);
+		return -EINVAL;
+	}
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] scheduler unregistered.\n", scheduler->name);
+
+	return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
new file mode 100644
index 00000000000..e446b9fa742
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,143 @@
+/*
+ * IPVS: Shortest Expected Delay scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ * The +1 counts the incoming job, as described for the SED
+ * cost function at the top of this file.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Shortest Expected Delay scheduling
+ *
+ * Returns the destination with the smallest overhead/weight ratio among
+ * the destinations that are not overloaded, or NULL (after reporting
+ * through ip_vs_scheduler_err()) if no destination is both not
+ * overloaded and of positive weight.
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ *
+ * First stage: take the first usable dest as the initial candidate.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_sed_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ * The walk continues after the initial candidate. A dest of
+ * weight 0 can never win here: its side of the comparison is 0.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_sed_dest_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "SED: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+/* Scheduler descriptor that ip_vs_sed_init() hands to register_ip_vs_scheduler() */
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+ .name = "sed",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), /* self-linked, i.e. not on any list yet */
+ .schedule = ip_vs_sed_schedule, /* the only hook this scheduler sets */
+};
+
+
+/* Module init: register the "sed" scheduler, passing the result through */
+static int __init ip_vs_sed_init(void)
+{
+	int err;
+
+	err = register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+	return err;
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); /* take "sed" off the scheduler list */
+ synchronize_rcu(); /* wait for an RCU grace period before the module goes away */
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
new file mode 100644
index 00000000000..cc65b2f42cd
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,385 @@
+/*
+ * IPVS: Source Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[src_ip];
+ * if (n is dead) OR
+ * (n is overloaded) or (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a hash table (256 buckets by default) that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ * The weight destination attribute can be used to control the
+ * distribution of connections to the destinations in servernode. The
+ * greater the weight, the more connections the destination
+ * will receive.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
+
+/*
+ * IPVS SH bucket
+ *
+ * One slot of the source hashing table. The lookup helpers read dest
+ * with rcu_dereference().
+ */
+struct ip_vs_sh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS 8
+#endif
+#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head; /* for kfree_rcu() in ip_vs_sh_done_svc() */
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; /* indexed by ip_vs_sh_hashkey() */
+};
+
+/* A server is unavailable if its weight is not positive or it is overloaded */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	if (atomic_read(&dest->weight) <= 0)
+		return true;
+
+	return (dest->flags & IP_VS_DEST_F_OVERLOAD) != 0;
+}
+
+/*
+ *	Returns hash value for IPVS SH entry
+ *
+ *	The address (IPv6 addresses are XOR-folded to 32 bits first) and
+ *	the port are added up, multiplied by 2654435761, shifted by
+ *	'offset' and masked to the table size.
+ */
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, unsigned int offset)
+{
+	__be32 folded;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	return (offset + (ntohs(port) + ntohl(folded)) * 2654435761UL) &
+		IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ *
+ *      Returns NULL if the bucket is empty or its server is unavailable.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+	if (dest && !is_unavailable(dest))
+		return dest;
+
+	return NULL;
+}
+
+
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ *
+ * Returns the server found, or NULL if the original bucket is empty, an
+ * empty bucket is hit while probing, or every probed server is
+ * unavailable.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset, roffset;
+	unsigned int hash, ihash;
+	struct ip_vs_dest *dest;
+
+	/* first try the dest it's supposed to go to */
+	ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+	dest = rcu_dereference(s->buckets[ihash].dest);
+	if (!dest)
+		return NULL;
+	if (!is_unavailable(dest))
+		return dest;
+
+	/* Terminate the message with a newline like all other IPVS logs */
+	IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+	/* if the original dest is unavailable, loop around the table
+	 * starting from ihash to find a new dest
+	 */
+	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+		roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+		hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
+		dest = rcu_dereference(s->buckets[hash].dest);
+		if (!dest)
+			break;
+		if (!is_unavailable(dest))
+			return dest;
+		/* roffset is unsigned, so it is printed with %u */
+		IP_VS_DBG_BUF(6, "SH: selected unavailable "
+			      "server %s:%d (offset %u), reselecting\n",
+			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+			      ntohs(dest->port), roffset);
+	}
+
+	return NULL;
+}
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ *
+ * The dest previously stored in a bucket is released with
+ * ip_vs_dest_put(). If the service has no destinations every bucket is
+ * set to NULL; otherwise the buckets are filled by cycling over
+ * svc->destinations, each dest getting as many consecutive buckets as
+ * its weight (at least one) and one ip_vs_dest_hold() per bucket.
+ * Always returns 0.
+ */
+static int
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+ int d_count;
+ bool empty;
+
+ b = &s->buckets[0];
+ p = &svc->destinations;
+ empty = list_empty(p);
+ d_count = 0;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
+ if (p == &svc->destinations)
+ p = p->next; /* skip the list head when wrapping around */
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
+ i, IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ atomic_read(&dest->weight));
+
+ /* Don't move to next dest until filling weight */
+ if (++d_count >= atomic_read(&dest->weight)) {
+ p = p->next;
+ d_count = 0;
+ }
+
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ *
+ *      Every non-empty bucket releases its dest and is set to NULL.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
+{
+	int i;
+
+	for (i = 0; i < IP_VS_SH_TAB_SIZE; i++) {
+		struct ip_vs_sh_bucket *b = &s->buckets[i];
+		struct ip_vs_dest *dest;
+
+		dest = rcu_dereference_protected(b->dest, 1);
+		if (!dest)
+			continue;
+
+		ip_vs_dest_put(dest);
+		RCU_INIT_POINTER(b->dest, NULL);
+	}
+}
+
+
+/*
+ *	init_service hook: allocate a zeroed SH state, attach it to the
+ *	service and populate its buckets from the current destinations.
+ *	Returns 0, or -ENOMEM when the allocation fails.
+ */
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_state *state = kzalloc(sizeof(*state), GFP_KERNEL);
+
+	if (!state)
+		return -ENOMEM;
+
+	svc->sched_data = state;
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+	/* fill the buckets with the destinations known so far */
+	ip_vs_sh_reassign(state, svc);
+
+	return 0;
+}
+
+
+/*
+ * done_service hook: drop the destination references held by the bucket
+ * table and free the table attached to svc->sched_data.
+ */
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_state *s = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(s);
+
+ /* release the table itself; the free is deferred through kfree_rcu() */
+ kfree_rcu(s, rcu_head);
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+}
+
+
+/*
+ * Shared add_dest/del_dest/upd_dest hook: any change to the destination
+ * set rebuilds the whole bucket table.  The dest argument itself is not
+ * used.  Always returns 0.
+ */
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_sh_state *s = svc->sched_data;
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_reassign(s, svc);
+
+ return 0;
+}
+
+
+/*
+ * Read the transport-layer source port of a TCP, UDP or SCTP packet.
+ * The header is looked up at offset iph->len inside the skb.
+ * Yields 0 for every other protocol or when the header cannot be read.
+ */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	struct tcphdr tcp_buf, *tcp;
+	struct udphdr udp_buf, *udp;
+	sctp_sctphdr_t sctp_buf, *sctp;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		tcp = skb_header_pointer(skb, iph->len, sizeof(tcp_buf),
+					 &tcp_buf);
+		if (unlikely(!tcp))
+			return 0;
+		return tcp->source;
+	case IPPROTO_UDP:
+		udp = skb_header_pointer(skb, iph->len, sizeof(udp_buf),
+					 &udp_buf);
+		if (unlikely(!udp))
+			return 0;
+		return udp->source;
+	case IPPROTO_SCTP:
+		sctp = skb_header_pointer(skb, iph->len, sizeof(sctp_buf),
+					  &sctp_buf);
+		if (unlikely(!sctp))
+			return 0;
+		return sctp->source;
+	default:
+		break;
+	}
+
+	/* protocol without a port notion we know about */
+	return 0;
+}
+
+
+/*
+ *	Source Hashing scheduling: pick the destination whose bucket the
+ *	packet's source address (and, with IP_VS_SVC_F_SCHED_SH_PORT, its
+ *	source port) hashes to.  With IP_VS_SVC_F_SCHED_SH_FALLBACK the
+ *	fallback lookup is used instead of the plain one.
+ *	Returns NULL and reports a scheduler error when nothing is found.
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_sh_state *state;
+	struct ip_vs_dest *chosen;
+	__be16 sport = 0;
+
+	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+	/* the port only takes part in the hash when asked for */
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+		sport = ip_vs_sh_get_port(skb, iph);
+
+	state = (struct ip_vs_sh_state *) svc->sched_data;
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+		chosen = ip_vs_sh_get_fallback(svc, state, &iph->saddr, sport);
+	else
+		chosen = ip_vs_sh_get(svc, state, &iph->saddr, sport);
+
+	if (chosen) {
+		IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
+			      IP_VS_DBG_ADDR(svc->af, &iph->saddr),
+			      IP_VS_DBG_ADDR(svc->af, &chosen->addr),
+			      ntohs(chosen->port));
+		return chosen;
+	}
+
+	ip_vs_scheduler_err(svc, "no destination available");
+	return NULL;
+}
+
+
+/*
+ * IPVS SH Scheduler structure
+ *
+ * Describes the scheduler named "sh".  Adding, deleting and updating a
+ * destination all go through the same handler, ip_vs_sh_dest_changed().
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+ .name = "sh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+ .init_service = ip_vs_sh_init_svc,
+ .done_service = ip_vs_sh_done_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
+ .schedule = ip_vs_sh_schedule,
+};
+
+
+/* Module entry point: register the "sh" scheduler with IPVS. */
+static int __init ip_vs_sh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+/*
+ * Module exit: unregister the scheduler, then wait for an RCU grace period.
+ * NOTE(review): presumably the wait lets RCU readers that may still be
+ * running this scheduler's code finish before the module goes away - confirm.
+ */
+static void __exit ip_vs_sh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
new file mode 100644
index 00000000000..db801263ee9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,1948 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version 1, is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handling IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ * Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
+
+static struct lock_class_key __ipvs_sync_key;
+/*
+ * IPVS sync connection entry
+ * Version 0, i.e. original version.
+ */
+struct ip_vs_sync_conn_v0 {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __be16 flags; /* status flags */
+ __be16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * IPv6 sync connection format (type field has STYPE_F_INET6 set)
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
+struct ip_vs_sync_thread_data {
+ struct net *net;
+ struct socket *sock;
+ char *buf;
+ int id;
+};
+
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+ The master multicasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ ~ . ~
+ | . |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (n) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
+*/
+
+#define SYNC_MESG_HEADER_LEN 4
+#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
+ __u8 nr_conns;
+ __u8 syncid;
+ __be16 size;
+
+ /* ip_vs_sync_conn entries start here */
+};
+
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __be16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
+
+/*
+ * One outgoing sync message under construction or waiting to be sent.
+ * The message memory is allocated with ipvs->send_mesg_maxlen bytes.
+ */
+struct ip_vs_sync_buff {
+ struct list_head list; /* link in a master state's sync_queue */
+ unsigned long firstuse; /* jiffies when the buffer was created */
+
+ /* pointers for the message data */
+ struct ip_vs_sync_mesg *mesg; /* start of the message (header first) */
+ unsigned char *head; /* where the next sync_conn will be written */
+ unsigned char *end; /* one past the last byte of the message memory */
+};
+
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ *
+ * no: source, big-endian fields read with unaligned-safe accessors
+ * ho: destination, receives the three fields in host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ *
+ * ho: source, host-order fields (this argument is only read)
+ * no: destination, written big-endian with unaligned-safe accessors
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
+/*
+ * Take the first buffer off ms->sync_queue under ipvs->sync_lock.
+ * Returns NULL when the queue is empty; in that case the current task is
+ * marked TASK_INTERRUPTIBLE while the lock is still held.
+ * NOTE(review): presumably the caller then sleeps and relies on this
+ * ordering not to miss a wakeup from sb_queue_tail() - confirm.
+ */
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ms->sync_queue)) {
+ sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ } else {
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
+ list);
+ list_del(&sb->list);
+ ms->sync_queue_len--;
+ /* queue drained: restart the wakeup-rate counter */
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+
+ return sb;
+}
+
+/*
+ *	Allocate an empty sync buffer carrying a Version 1 message header.
+ *	The message area is ipvs->send_mesg_maxlen bytes; "head" starts
+ *	right behind the header.  Uses GFP_ATOMIC.
+ *	Returns NULL if either allocation fails.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *sb;
+	struct ip_vs_sync_mesg *hdr;
+
+	sb = kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC);
+	if (!sb)
+		return NULL;
+
+	hdr = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	if (!hdr) {
+		kfree(sb);
+		return NULL;
+	}
+
+	/* first byte is the old v0 nr_conns field and must stay zero */
+	hdr->reserved = 0;
+	hdr->syncid = ipvs->master_syncid;
+	hdr->size = htons(sizeof(struct ip_vs_sync_mesg));
+	hdr->nr_conns = 0;
+	hdr->version = SYNC_PROTO_VER;
+	hdr->spare = 0;
+
+	sb->mesg = hdr;
+	sb->head = (unsigned char *)hdr + sizeof(struct ip_vs_sync_mesg);
+	sb->end = (unsigned char *)hdr + ipvs->send_mesg_maxlen;
+	sb->firstuse = jiffies;
+
+	return sb;
+}
+
+/* Free a sync buffer together with its message memory. */
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+ kfree(sb->mesg);
+ kfree(sb);
+}
+
+/*
+ * Hand ms->sync_buff over to the send queue of this master state.
+ * The buffer is queued only while the master sync state is active and the
+ * queue is shorter than sysctl_sync_qlen_max(); otherwise it is freed.
+ * ms->sync_buff is not cleared here - the callers in this file reset it
+ * right after the call.
+ * Takes ipvs->sync_lock with plain spin_lock(); the callers visible here
+ * hold ipvs->sync_buff_lock via spin_lock_bh() around the call.
+ */
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ /* first entry of an empty queue: arm the delayed wakeup work */
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ /* wake the master thread once the counter hits the wakeup rate */
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
+ ip_vs_sync_buff_release(sb);
+ spin_unlock(&ipvs->sync_lock);
+}
+
+/*
+ * Get the current sync buffer if it has been created for more
+ * than the specified time or the specified time is zero.
+ *
+ * On success the buffer is detached from ms->sync_buff (ownership moves
+ * to the caller) and the current task is set back to TASK_RUNNING.
+ * Returns NULL when there is no buffer or it is not old enough yet.
+ * "time" is compared against a jiffies difference.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
+ } else
+ sb = NULL;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return sb;
+}
+
+/*
+ * Choose the master sync state index for a connection.  The index is
+ * derived from the address of cp: the pointer value is shifted right by
+ * 1 + ilog2(sizeof(*cp)) and masked with ipvs->threads_mask, so the same
+ * connection always maps to the same index.
+ */
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
+{
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
+}
+
+/*
+ *	Allocate an empty sync buffer carrying a Version 0 message header.
+ *	Same layout rules as the Version 1 variant, but the memory behind
+ *	sb->mesg is initialised as struct ip_vs_sync_mesg_v0.
+ *	Returns NULL if either allocation fails.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *sb;
+	struct ip_vs_sync_mesg_v0 *hdr;
+
+	sb = kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC);
+	if (!sb)
+		return NULL;
+
+	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	if (!sb->mesg) {
+		kfree(sb);
+		return NULL;
+	}
+
+	/* view the message memory through the v0 header layout */
+	hdr = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+	hdr->nr_conns = 0;
+	hdr->syncid = ipvs->master_syncid;
+	hdr->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
+
+	sb->head = (unsigned char *)hdr + sizeof(struct ip_vs_sync_mesg_v0);
+	sb->end = (unsigned char *)hdr + ipvs->send_mesg_maxlen;
+	sb->firstuse = jiffies;
+
+	return sb;
+}
+
+/*
+ * Tell whether any connection up the ->control chain of cp is a
+ * template (IP_VS_CONN_F_TEMPLATE).  cp itself is not examined.
+ */
+static inline bool in_persistence(struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn *ctl = cp->control;
+
+	while (ctl) {
+		if (ctl->flags & IP_VS_CONN_F_TEMPLATE)
+			return true;
+		ctl = ctl->control;
+	}
+	return false;
+}
+
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ *
+ * Returns non-zero when a sync message should be sent for cp, 0 otherwise.
+ * cp->sync_endtime holds the expected end time with its two low bits used
+ * as a retry counter (see "orig & 3" and "n |= retries + 1" below).
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ /* candidate end time; low two bits cleared for the retry counter */
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
+ return 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ /* only the TCP states listed here are ever synced */
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ /* a change into any state but ESTABLISHED skips the rate checks */
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ /* only the SCTP states listed here are ever synced */
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
+ (1 << IP_VS_SCTP_S_CLOSED))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ /* too early for the next retry */
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ /* cmpxchg returns the previous value: equal to orig means we installed n */
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
+
+/*
+ *      Version 0 , could be switched in by sys_ctl.
+ *      Add an ip_vs_conn information into the current sync_buff.
+ *
+ *      Only AF_INET connections without IP_VS_CONN_F_ONE_PACKET are
+ *      handled, and only when ip_vs_sync_conn_needed() agrees.  A pending
+ *      buffer that was started for Version 1 (first byte zero) is queued
+ *      first.  After the connection is added, its controlling connection
+ *      (cp->control), if any, is synced through ip_vs_sync_conn().
+ *
+ *      net:  network namespace the connection belongs to
+ *      cp:   connection to sync
+ *      pkts: packet count used for the rate decision
+ */
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+			       int pkts)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_mesg_v0 *m;
+	struct ip_vs_sync_conn_v0 *s;
+	struct ip_vs_sync_buff *buff;
+	struct ipvs_master_sync_state *ms;
+	int id;
+	int len;
+
+	if (unlikely(cp->af != AF_INET))
+		return;
+	/* Do not sync ONE PACKET */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return;
+
+	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+		return;
+
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+		spin_unlock_bh(&ipvs->sync_buff_lock);
+		return;
+	}
+
+	id = select_master_thread_id(ipvs, cp);
+	ms = &ipvs->ms[id];
+	buff = ms->sync_buff;
+	if (buff) {
+		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+		/* Send buffer if it is for v1 */
+		if (!m->nr_conns) {
+			sb_queue_tail(ipvs, ms);
+			ms->sync_buff = NULL;
+			buff = NULL;
+		}
+	}
+	if (!buff) {
+		buff = ip_vs_sync_buff_create_v0(ipvs);
+		if (!buff) {
+			spin_unlock_bh(&ipvs->sync_buff_lock);
+			pr_err("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+		ms->sync_buff = buff;
+	}
+
+	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+		SIMPLE_CONN_SIZE;
+	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+	s = (struct ip_vs_sync_conn_v0 *) buff->head;
+
+	/* copy members */
+	s->reserved = 0;
+	s->protocol = cp->protocol;
+	s->cport = cp->cport;
+	s->vport = cp->vport;
+	s->dport = cp->dport;
+	s->caddr = cp->caddr.ip;
+	s->vaddr = cp->vaddr.ip;
+	s->daddr = cp->daddr.ip;
+	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->state = htons(cp->state);
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		/* v0 carries the seq options as a raw host-order copy */
+		struct ip_vs_sync_conn_options *opt =
+			(struct ip_vs_sync_conn_options *)&s[1];
+		memcpy(opt, &cp->in_seq, sizeof(*opt));
+	}
+
+	m->nr_conns++;
+	m->size = htons(ntohs(m->size) + len);
+	buff->head += len;
+
+	/* check if there is a space for next one */
+	if (buff->head + FULL_CONN_SIZE > buff->end) {
+		sb_queue_tail(ipvs, ms);
+		ms->sync_buff = NULL;
+	}
+	spin_unlock_bh(&ipvs->sync_buff_lock);
+
+	/* synchronize its controller if it has */
+	cp = cp->control;
+	if (cp) {
+		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+			pkts = atomic_add_return(1, &cp->in_pkts);
+		else
+			pkts = sysctl_sync_threshold(ipvs);
+		/*
+		 * cp already points at the controller: sync it, not its own
+		 * ->control, which may be NULL and would be dereferenced by
+		 * ip_vs_sync_conn().
+		 */
+		ip_vs_sync_conn(net, cp, pkts);
+	}
+}
+
+/*
+ *      Add an ip_vs_conn information into the current sync_buff.
+ *      Called by ip_vs_in.
+ *      Sending Version 1 messages
+ *
+ *      When sysctl_sync_ver() is 0 the work is delegated to
+ *      ip_vs_sync_conn_v0().  Otherwise the connection is appended to the
+ *      pending Version 1 buffer of the master state selected for cp
+ *      (a full buffer, or one started for Version 0, is queued first and
+ *      a fresh one is created).  Afterwards the function loops over the
+ *      ->control chain so that controlling connections are synced too.
+ *
+ *      net:  network namespace the connection belongs to
+ *      cp:   connection to sync
+ *      pkts: packet count used for the rate decision
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_mesg *m;
+	union ip_vs_sync_conn *s;
+	struct ip_vs_sync_buff *buff;
+	struct ipvs_master_sync_state *ms;
+	int id;
+	__u8 *p;
+	unsigned int len, pe_name_len, pad;
+
+	/* Handle old version of the protocol */
+	if (sysctl_sync_ver(ipvs) == 0) {
+		ip_vs_sync_conn_v0(net, cp, pkts);
+		return;
+	}
+	/* Do not sync ONE PACKET */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		goto control;
+sloop:
+	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+		goto control;
+
+	/* Sanity checks */
+	pe_name_len = 0;
+	if (cp->pe_data_len) {
+		if (!cp->pe_data || !cp->dest) {
+			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+			return;
+		}
+		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+	}
+
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+		spin_unlock_bh(&ipvs->sync_buff_lock);
+		return;
+	}
+
+	id = select_master_thread_id(ipvs, cp);
+	ms = &ipvs->ms[id];
+
+	/* fixed part of the sync_conn depends on the address family */
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		len = sizeof(struct ip_vs_sync_v6);
+	else
+#endif
+		len = sizeof(struct ip_vs_sync_v4);
+
+	/* every optional parameter adds a 2 byte type/length header */
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+		len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+	if (cp->pe_data_len)
+		len += cp->pe_data_len + 2;	/* + Param hdr field */
+	if (pe_name_len)
+		len += pe_name_len + 2;
+
+	/* check if there is a space for this one */
+	pad = 0;
+	buff = ms->sync_buff;
+	if (buff) {
+		m = buff->mesg;
+		/* bytes needed to bring head up to 32 bit alignment */
+		pad = (4 - (size_t) buff->head) & 3;
+		/* Send buffer if it is for v0 */
+		if (buff->head + len + pad > buff->end || m->reserved) {
+			sb_queue_tail(ipvs, ms);
+			ms->sync_buff = NULL;
+			buff = NULL;
+			pad = 0;
+		}
+	}
+
+	if (!buff) {
+		buff = ip_vs_sync_buff_create(ipvs);
+		if (!buff) {
+			spin_unlock_bh(&ipvs->sync_buff_lock);
+			pr_err("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+		ms->sync_buff = buff;
+		m = buff->mesg;
+	}
+
+	p = buff->head;
+	buff->head += pad + len;
+	m->size = htons(ntohs(m->size) + pad + len);
+	/* Add ev. padding from prev. sync_conn */
+	while (pad--)
+		*(p++) = 0;
+
+	s = (union ip_vs_sync_conn *)p;
+
+	/* Set message type & copy members */
+	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
+	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->v4.state = htons(cp->state);
+	s->v4.protocol = cp->protocol;
+	s->v4.cport = cp->cport;
+	s->v4.vport = cp->vport;
+	s->v4.dport = cp->dport;
+	s->v4.fwmark = htonl(cp->fwmark);
+	s->v4.timeout = htonl(cp->timeout / HZ);
+	m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6) {
+		p += sizeof(struct ip_vs_sync_v6);
+		s->v6.caddr = cp->caddr.in6;
+		s->v6.vaddr = cp->vaddr.in6;
+		s->v6.daddr = cp->daddr.in6;
+	} else
+#endif
+	{
+		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
+		s->v4.caddr = cp->caddr.ip;
+		s->v4.vaddr = cp->vaddr.ip;
+		s->v4.daddr = cp->daddr.ip;
+	}
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		*(p++) = IPVS_OPT_SEQ_DATA;
+		*(p++) = sizeof(struct ip_vs_sync_conn_options);
+		/*
+		 * hton_seq() reads its first argument (host order) and
+		 * writes its second (network order): the connection is the
+		 * source, the message buffer the destination.
+		 */
+		hton_seq(&cp->in_seq, (struct ip_vs_seq *)p);
+		p += sizeof(struct ip_vs_seq);
+		hton_seq(&cp->out_seq, (struct ip_vs_seq *)p);
+		p += sizeof(struct ip_vs_seq);
+	}
+	/* Handle pe data */
+	if (cp->pe_data_len && cp->pe_data) {
+		*(p++) = IPVS_OPT_PE_DATA;
+		*(p++) = cp->pe_data_len;
+		memcpy(p, cp->pe_data, cp->pe_data_len);
+		p += cp->pe_data_len;
+		if (pe_name_len) {
+			/* Add PE_NAME */
+			*(p++) = IPVS_OPT_PE_NAME;
+			*(p++) = pe_name_len;
+			memcpy(p, cp->pe->name, pe_name_len);
+			p += pe_name_len;
+		}
+	}
+
+	spin_unlock_bh(&ipvs->sync_buff_lock);
+
+control:
+	/* synchronize its controller if it has */
+	cp = cp->control;
+	if (!cp)
+		return;
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+		pkts = atomic_add_return(1, &cp->in_pkts);
+	else
+		pkts = sysctl_sync_threshold(ipvs);
+	goto sloop;
+}
+
+/*
+ *  fill_param used by version 1
+ *
+ *  Builds the connection lookup parameter "p" from a received sync_conn
+ *  and, when persistence engine data is present, resolves the engine by
+ *  name and attaches a private copy of the data (GFP_ATOMIC).
+ *
+ *  Returns 0 on success, 1 when PE data comes without a PE name or the
+ *  named engine is not found, and -ENOMEM when the data copy fails.
+ *  NOTE(review): pe_name_len must not exceed IP_VS_PENAME_MAXLEN; this is
+ *  presumably enforced by the option parser - confirm against the caller.
+ */
+static inline int
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+			   struct ip_vs_conn_param *p,
+			   __u8 *pe_data, unsigned int pe_data_len,
+			   __u8 *pe_name, unsigned int pe_name_len)
+{
+	char name_buf[IP_VS_PENAME_MAXLEN+1];
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+				      (const union nf_inet_addr *)&sc->v6.caddr,
+				      sc->v6.cport,
+				      (const union nf_inet_addr *)&sc->v6.vaddr,
+				      sc->v6.vport, p);
+	else
+#endif
+		ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+				      (const union nf_inet_addr *)&sc->v4.caddr,
+				      sc->v4.cport,
+				      (const union nf_inet_addr *)&sc->v4.vaddr,
+				      sc->v4.vport, p);
+
+	/* nothing else to do without persistence engine data */
+	if (!pe_data_len)
+		return 0;
+
+	if (!pe_name_len) {
+		IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+		return 1;
+	}
+
+	/* the name arrives without terminator, make a C string of it */
+	memcpy(name_buf, pe_name, pe_name_len);
+	name_buf[pe_name_len] = 0;
+	p->pe = __ip_vs_pe_getbyname(name_buf);
+	if (!p->pe) {
+		IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+			  name_buf);
+		return 1;
+	}
+
+	p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
+	if (!p->pe_data) {
+		/* give back the module reference of the engine */
+		if (p->pe->module)
+			module_put(p->pe->module);
+		return -ENOMEM;
+	}
+	p->pe_data_len = pe_data_len;
+
+	return 0;
+}
+
+/*
+ *  Connection Add / Update.
+ *  Common for version 0 and 1 reception of backup sync_conns.
+ *
+ *  Looks the connection (or template, when IP_VS_CONN_F_TEMPLATE is set)
+ *  up by "param".  An existing entry gets its flags merged and the
+ *  active/inactive counters of its destination adjusted; otherwise a new
+ *  entry is created, bound to the destination found for daddr/dport if
+ *  there is one.  State, in_pkts, sequence options and timeout are then
+ *  refreshed and the reference on the connection is dropped.
+ *
+ *  param->pe_data is consumed: it is freed here unless a new connection
+ *  was created from it.
+ *
+ *  Param: ...
+ *         timeout is in sec.  Zero selects the protocol's timeout for
+ *         "state", or 3 minutes for templates and unknown protocols.
+ *         opt may be NULL when no sequence options were received.
+ */
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+			    unsigned int flags, unsigned int state,
+			    unsigned int protocol, unsigned int type,
+			    const union nf_inet_addr *daddr, __be16 dport,
+			    unsigned long timeout, __u32 fwmark,
+			    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE))
+		cp = ip_vs_conn_in_get(param);
+	else
+		cp = ip_vs_ct_in_get(param);
+
+	if (cp) {
+		/* Free pe_data */
+		kfree(param->pe_data);
+
+		dest = cp->dest;
+		spin_lock_bh(&cp->lock);
+		/* INACTIVE bit flipped: move the conn between the counters */
+		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+			if (flags & IP_VS_CONN_F_INACTIVE) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+			} else {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+			}
+		}
+		/* take only the backup-updatable bits from the message */
+		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+		cp->flags = flags;
+		spin_unlock_bh(&cp->lock);
+		if (!dest)
+			ip_vs_try_bind_dest(cp);
+	} else {
+		/*
+		 * Find the appropriate destination for the connection.
+		 * If it is not found the connection will remain unbound
+		 * but still handled.
+		 */
+		rcu_read_lock();
+		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+				       param->vport, protocol, fwmark, flags);
+
+		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+		rcu_read_unlock();
+		if (!cp) {
+			/* kfree(NULL) is a no-op, no guard needed */
+			kfree(param->pe_data);
+			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+			return;
+		}
+	}
+
+	if (opt)
+		memcpy(&cp->in_seq, opt, sizeof(*opt));
+	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+	cp->state = state;
+	cp->old_state = cp->state;
+	/*
+	 * For Ver 0 messages style
+	 *  - Not possible to recover the right timeout for templates
+	 *  - can not find the right fwmark
+	 *    virtual service. If needed, we can do it for
+	 *    non-fwmark persistent services.
+	 * Ver 1 messages style.
+	 *  - No problem.
+	 */
+	if (timeout) {
+		/* clamp so that the conversion to jiffies cannot overflow */
+		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+		cp->timeout = timeout*HZ;
+	} else {
+		struct ip_vs_proto_data *pd;
+
+		pd = ip_vs_proto_data_get(net, protocol);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+			cp->timeout = pd->timeout_table[state];
+		else
+			cp->timeout = (3*60*HZ);
+	}
+	ip_vs_conn_put(cp);
+}
+
+/*
+ * Process received multicast message for Version 0
+ *
+ * Walks the m->nr_conns connection entries that follow the v0 header and
+ * feeds each valid one to ip_vs_proc_conn() as an AF_INET connection with
+ * timeout 0 and fwmark 0.  Parsing stops at the first entry that would
+ * run past buffer+buflen; entries with an unsupported protocol or an
+ * invalid state are skipped.
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
+ for (i=0; i<m->nr_conns; i++) {
+ unsigned int flags, state;
+
+ /* the fixed part of the entry must fit into the buffer */
+ if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
+ return;
+ }
+ s = (struct ip_vs_sync_conn_v0 *) p;
+ flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+ flags &= ~IP_VS_CONN_F_HASHED;
+ if (flags & IP_VS_CONN_F_SEQ_MASK) {
+ /* seq options follow the fixed part directly */
+ opt = (struct ip_vs_sync_conn_options *)&s[1];
+ p += FULL_CONN_SIZE;
+ if (p > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
+ return;
+ }
+ } else {
+ opt = NULL;
+ p += SIMPLE_CONN_SIZE;
+ }
+
+ state = ntohs(s->state);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->protocol);
+ if (!pp) {
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
+ s->protocol);
+ continue;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
+ pp->name, state);
+ continue;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ *	Handle the sequence-data option of a Version 1 sync_conn.
+ *
+ *	p/plen:    option payload and its length
+ *	opt_flags: options seen so far; IPVS_OPT_F_SEQ_DATA is added
+ *	opt:       receives in_seq and out_seq converted to host order
+ *
+ *	Returns 0, or -EINVAL for a wrong length or a repeated option.
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+				    __u32 *opt_flags,
+				    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_sync_conn_options *wire;
+
+	if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+		return -EINVAL;
+	}
+	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+		return -EINVAL;
+	}
+
+	/* payload is in network order and possibly unaligned */
+	wire = (struct ip_vs_sync_conn_options *)p;
+	ntoh_seq(&wire->in_seq, &opt->in_seq);
+	ntoh_seq(&wire->out_seq, &opt->out_seq);
+	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
+
+	return 0;
+}
+
+/*
+ * Record where a variable-length option payload lives inside the message.
+ *
+ * The payload is not copied: on success *data points at @p and *data_len
+ * is set to @plen, and @flag is added to *opt_flags.
+ *
+ * Returns 0 on success, or -EINVAL when @plen exceeds @maxlen or @flag is
+ * already set in *opt_flags (option seen twice).
+ */
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+			  __u8 **data, unsigned int maxlen,
+			  __u32 *opt_flags, __u32 flag)
+{
+	if (plen > maxlen) {
+		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+		return -EINVAL;
+	}
+
+	if (*opt_flags & flag) {
+		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+		return -EINVAL;
+	}
+
+	*opt_flags |= flag;
+	*data = p;
+	*data_len = plen;
+
+	return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ *
+ * p points at the start of one sync connection record and msg_end at the
+ * first byte after that record.
+ *
+ * Return value (see how ip_vs_process_message() checks it):
+ *   0   - record decoded and passed to ip_vs_proc_conn()
+ *   > 0 - only this record is skipped; the reason is logged at "out"
+ *   < 0 - the record is malformed
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ /* Pick the address family from the type field and skip the fixed part */
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ /* The fixed part must fit inside the record */
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ /* Each option starts with one type byte and one length byte */
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ /* Empty options and options running past the record are rejected */
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ /*
+ * An unknown option without IPVS_OPT_F_PARAM set makes the
+ * record be skipped; with the bit set the option is ignored.
+ */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
+ }
+ }
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ /*
+ * NOTE(review): flags, state and protocol are read through s->v4 even
+ * for AF_INET6 records; presumably both union members share these
+ * leading fields - confirm against the union definition.
+ */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* Regular connection: protocol must be known and state in range */
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ /* A non-zero template state is logged and forced to 0, not rejected */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4, just silent skip IPv6 */
+ /* Sequence data is passed on only when the SEQ_DATA option was present */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ return 0;
+ /* Error exit */
+ /* Only the positive "skip this record" codes are routed through here */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ *
+ * buffer/buflen describe one received datagram.  The message is dropped
+ * as a whole when its size field does not match buflen, when the syncid
+ * does not match a non-zero ipvs->backup_syncid, or when a version 1
+ * record fails the bounds/version checks or decodes with a negative
+ * return code.
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ /*
+ * NOTE(review): only the v0 header size is required here, while fields
+ * of struct ip_vs_sync_mesg (version, spare, nr_conns) are read below -
+ * confirm the two header sizes against their definitions.
+ */
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+
+ if (buflen != ntohs(m2->size)) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ /* A backup_syncid of 0 accepts messages with any syncid */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ /*
+ * msg_end doubles as the cursor: on entry to each iteration it
+ * is where the next record starts.
+ */
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned int size;
+ int retc;
+
+ p = msg_end;
+ /* At least a full v4 record header must be present */
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ return;
+ }
+ s = (union ip_vs_sync_conn *)p;
+ /* ver_size carries the record size (SVER_MASK) and a version */
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
+ return;
+ }
+ /* Only record version 0 is accepted */
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
+ }
+ /* Process a single sync_conn */
+ /* retc > 0 means this record was skipped; keep going */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
+ }
+ /* Make sure we have 32 bit alignment */
+ /* Next record starts at the size rounded up to a multiple of 4 */
+ msg_end = p + ((size + 3) & ~3);
+ }
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
+ }
+}
+
+
+/*
+ * Set the send buffer (mode != 0) or receive buffer (mode == 0) size of
+ * a kernel socket, the in-kernel counterpart of SO_SNDBUF / SO_RCVBUF.
+ *
+ * val is clamped between (SOCK_MIN_*BUF + 1) / 2 and the matching sysctl
+ * maximum, stored doubled, and the corresponding SOCK_*BUF_LOCK bit is
+ * set in sk_userlocks.
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+	lock_sock(sk);
+	if (!mode) {
+		/* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+		val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+			      sysctl_rmem_max);
+		sk->sk_rcvbuf = val * 2;
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	} else {
+		/* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+		val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+			      sysctl_wmem_max);
+		sk->sk_sndbuf = val * 2;
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	release_sock(sk);
+}
+
+/*
+ * Enable (loop != 0) or disable (loop == 0) loopback of outgoing
+ * multicasts on a sending socket.
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+	lock_sock(sk);
+	inet_sk(sk)->mc_loop = !!loop;
+	release_sock(sk);
+}
+
+/*
+ * Set the TTL used for outgoing multicasts on a sending socket.
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+	lock_sock(sk);
+	inet_sk(sk)->mc_ttl = ttl;
+	release_sock(sk);
+}
+
+/*
+ * Specify the default interface for outgoing multicasts.
+ *
+ * Returns 0 on success, -ENODEV when no device named @ifname exists in
+ * the socket's namespace, or -EINVAL when the socket is already bound to
+ * a different device.
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+	struct net_device *dev = __dev_get_by_name(sock_net(sk), ifname);
+
+	if (!dev)
+		return -ENODEV;
+
+	/* A device-bound socket may only send through that same device */
+	if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dev->ifindex)
+		return -EINVAL;
+
+	lock_sock(sk);
+	inet_sk(sk)->mc_index = dev->ifindex;
+	/* mc_addr is deliberately left untouched */
+	release_sock(sk);
+
+	return 0;
+}
+
+
+/*
+ * Derive the maximum sync message length from the MTU of the configured
+ * multicast interface.
+ *
+ * For IP_VS_STATE_MASTER this sets ipvs->send_mesg_maxlen, for
+ * IP_VS_STATE_BACKUP it sets ipvs->recv_mesg_maxlen; any other state is
+ * a no-op.  Returns 0, or -ENODEV when the interface does not exist.
+ */
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct net_device *dev;
+	int num;
+
+	if (sync_state == IP_VS_STATE_MASTER) {
+		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+		if (!dev)
+			return -ENODEV;
+
+		/* How many simple conns fit after the IP/UDP/sync headers */
+		num = (dev->mtu - sizeof(struct iphdr) -
+		       sizeof(struct udphdr) -
+		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+		IP_VS_DBG(7, "setting the maximum length of sync sending "
+			  "message %d.\n", ipvs->send_mesg_maxlen);
+		return 0;
+	}
+
+	if (sync_state != IP_VS_STATE_BACKUP)
+		return 0;
+
+	dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+	if (!dev)
+		return -ENODEV;
+
+	/* Receive side: whatever is left of the MTU after IP and UDP headers */
+	ipvs->recv_mesg_maxlen = dev->mtu -
+		sizeof(struct iphdr) - sizeof(struct udphdr);
+	IP_VS_DBG(7, "setting the maximum length of sync receiving "
+		  "message %d.\n", ipvs->recv_mesg_maxlen);
+
+	return 0;
+}
+
+
+/*
+ * Join a multicast group on the interface named @ifname.
+ * The group is given as a class D multicast address (224.0.0.0/4) in
+ * the in_addr structure passed in as a parameter.
+ *
+ * Returns the result of ip_mc_join_group(), -ENODEV when the interface
+ * does not exist, or -EINVAL when the socket is bound to another device.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+	struct net_device *dev;
+	struct ip_mreqn mreq;
+	int ret;
+
+	dev = __dev_get_by_name(sock_net(sk), ifname);
+	if (!dev)
+		return -ENODEV;
+	if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dev->ifindex)
+		return -EINVAL;
+
+	/* Build the membership request: group address plus interface index */
+	memset(&mreq, 0, sizeof(mreq));
+	mreq.imr_multiaddr = *addr;
+	mreq.imr_ifindex = dev->ifindex;
+
+	lock_sock(sk);
+	ret = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+
+	return ret;
+}
+
+
+/*
+ * Bind @sock to the address that inet_select_addr() picks for the
+ * multicast interface @ifname, with port 0.
+ *
+ * When no address is found an error is logged and the bind is still
+ * attempted with address 0.  Returns the result of the bind operation,
+ * or -ENODEV when the interface does not exist.
+ */
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+	struct net_device *dev;
+	struct sockaddr_in local;
+	__be32 src;
+
+	dev = __dev_get_by_name(sock_net(sock->sk), ifname);
+	if (!dev)
+		return -ENODEV;
+
+	src = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	if (!src)
+		pr_err("You probably need to specify IP address on "
+		       "multicast interface.\n");
+
+	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
+		  ifname, &src);
+
+	/* Now bind the socket with the address of multicast interface */
+	local.sin_family = AF_INET;
+	local.sin_port = 0;
+	local.sin_addr.s_addr = src;
+
+	return sock->ops->bind(sock, (struct sockaddr *)&local, sizeof(local));
+}
+
+/*
+ * Create the UDP socket a master sync thread sends on.
+ *
+ * The socket is moved into @net, uses ipvs->master_mcast_ifn as multicast
+ * output interface with loopback off and TTL 1, is bound to that
+ * interface's address and connected to the sync group on port
+ * IP_VS_SYNC_PORT + @id.  Returns the socket or an ERR_PTR().
+ */
+static struct socket *make_send_sock(struct net *net, int id)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	/* multicast destination address */
+	struct sockaddr_in mcast_addr = {
+		.sin_family		= AF_INET,
+		.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT + id),
+		.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
+	};
+	struct socket *sock;
+	struct sock *sk;
+	int sndbuf;
+	int err;
+
+	/* Create the socket first, it is moved to the right namespace below */
+	err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (err < 0) {
+		pr_err("Error during creation of socket; terminating\n");
+		return ERR_PTR(err);
+	}
+	sk = sock->sk;
+
+	/*
+	 * Kernel sockets that are a part of a namespace, should not
+	 * hold a reference to a namespace in order to allow to stop it.
+	 * After sk_change_net should be released using sk_release_kernel.
+	 */
+	sk_change_net(sk, net);
+
+	err = set_mcast_if(sk, ipvs->master_mcast_ifn);
+	if (err < 0) {
+		pr_err("Error setting outbound mcast interface\n");
+		goto error;
+	}
+
+	set_mcast_loop(sk, 0);
+	set_mcast_ttl(sk, 1);
+
+	/* A positive sysctl value overrides the default send buffer size */
+	sndbuf = sysctl_sync_sock_size(ipvs);
+	if (sndbuf > 0)
+		set_sock_size(sk, 1, sndbuf);
+
+	err = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+	if (err < 0) {
+		pr_err("Error binding address of the mcast interface\n");
+		goto error;
+	}
+
+	err = sock->ops->connect(sock, (struct sockaddr *)&mcast_addr,
+				 sizeof(struct sockaddr), 0);
+	if (err < 0) {
+		pr_err("Error connecting to the multicast addr\n");
+		goto error;
+	}
+
+	return sock;
+
+error:
+	sk_release_kernel(sk);
+	return ERR_PTR(err);
+}
+
+
+/*
+ * Create the UDP socket a backup sync thread receives on.
+ *
+ * The socket is moved into @net, marked reusable, bound to the sync
+ * group address on port IP_VS_SYNC_PORT + @id and joined to that group
+ * on ipvs->backup_mcast_ifn.  Returns the socket or an ERR_PTR().
+ */
+static struct socket *make_receive_sock(struct net *net, int id)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	/* multicast group address to bind to and join */
+	struct sockaddr_in mcast_addr = {
+		.sin_family		= AF_INET,
+		.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT + id),
+		.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
+	};
+	struct socket *sock;
+	struct sock *sk;
+	int rcvbuf;
+	int err;
+
+	err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (err < 0) {
+		pr_err("Error during creation of socket; terminating\n");
+		return ERR_PTR(err);
+	}
+	sk = sock->sk;
+
+	/*
+	 * Kernel sockets that are a part of a namespace, should not
+	 * hold a reference to a namespace in order to allow to stop it.
+	 * After sk_change_net should be released using sk_release_kernel.
+	 */
+	sk_change_net(sk, net);
+
+	/* it is equivalent to the REUSEADDR option in user-space */
+	sk->sk_reuse = SK_CAN_REUSE;
+
+	/* A positive sysctl value overrides the default receive buffer size */
+	rcvbuf = sysctl_sync_sock_size(ipvs);
+	if (rcvbuf > 0)
+		set_sock_size(sk, 0, rcvbuf);
+
+	err = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr,
+			      sizeof(struct sockaddr));
+	if (err < 0) {
+		pr_err("Error binding to the multicast addr\n");
+		goto error;
+	}
+
+	/* join the multicast group */
+	err = join_mcast_group(sk, (struct in_addr *)&mcast_addr.sin_addr,
+			       ipvs->backup_mcast_ifn);
+	if (err < 0) {
+		pr_err("Error joining to the multicast group\n");
+		goto error;
+	}
+
+	return sock;
+
+error:
+	sk_release_kernel(sk);
+	return ERR_PTR(err);
+}
+
+
+/*
+ * Send @length bytes from @buffer on @sock with MSG_DONTWAIT and
+ * MSG_NOSIGNAL set.  Returns whatever kernel_sendmsg() returns.
+ */
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+	struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+	struct kvec iov = {
+		.iov_base	= (void *)buffer,
+		.iov_len	= length,
+	};
+	int sent;
+
+	EnterFunction(7);
+	sent = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+	LeaveFunction(7);
+
+	return sent;
+}
+
+/*
+ * Send one sync message; its length is taken from the size field of the
+ * message header.
+ *
+ * Returns the send result when it is non-negative or -EAGAIN.  Any other
+ * error is logged and reported as 0.
+ */
+static int
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+	int ret = ip_vs_send_async(sock, (char *)msg, ntohs(msg->size));
+
+	if (ret < 0 && ret != -EAGAIN) {
+		pr_err("ip_vs_send_async error %d\n", ret);
+		return 0;
+	}
+	return ret;
+}
+
+/*
+ * Receive one datagram from @sock into @buffer without blocking.
+ *
+ * @buflen is the capacity of @buffer.  Returns the kernel_recvmsg()
+ * result: the number of bytes received, or a negative error code.
+ *
+ * Every path leaves through LeaveFunction() so the debug trace stays
+ * balanced with EnterFunction() on errors as well.
+ */
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+	struct msghdr msg = {NULL,};
+	struct kvec iov;
+	int len;
+
+	EnterFunction(7);
+
+	/* Receive a packet */
+	iov.iov_base     = buffer;
+	iov.iov_len      = (size_t)buflen;
+
+	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
+
+	LeaveFunction(7);
+	return len;
+}
+
+/*
+ * Delayed-work handler that wakes the master thread for sending.
+ *
+ * Under ipvs->sync_lock: when the queue is non-empty and
+ * sync_queue_delay is still below IPVS_SYNC_WAKEUP_RATE, the delay is
+ * raised to that rate and the master thread is woken.
+ */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+	struct ipvs_master_sync_state *ms;
+	struct netns_ipvs *ipvs;
+	bool kick;
+
+	ms = container_of(work, struct ipvs_master_sync_state,
+			  master_wakeup_work.work);
+	ipvs = ms->ipvs;
+
+	spin_lock_bh(&ipvs->sync_lock);
+	kick = ms->sync_queue_len &&
+	       ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE;
+	if (kick) {
+		ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+		wake_up_process(ms->master_thread);
+	}
+	spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/*
+ * Get the next buffer to send: a queued buffer if there is one,
+ * otherwise whatever get_curr_sync_buff() returns for
+ * IPVS_SYNC_FLUSH_TIME.
+ */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+	struct ip_vs_sync_buff *sb = sb_dequeue(ipvs, ms);
+
+	/* Do not delay entries in buffer for more than 2 seconds */
+	if (!sb)
+		sb = get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+
+	return sb;
+}
+
+/*
+ * Master sync thread: repeatedly takes the next sync buffer for its
+ * ipvs->ms[] slot and sends it on the thread's socket until asked to
+ * stop.  On exit it releases any buffer still held, drains the queue and
+ * the current buffer, then releases the socket and its thread data.
+ */
+static int sync_thread_master(void *data)
+{
+ struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
+ struct ip_vs_sync_buff *sb;
+
+ pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ /* sb may be NULL or an unsent buffer here; "done" handles both */
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ /*
+ * NOTE(review): no task state is set here before sleeping;
+ * presumably next_sync_buff()/sb_dequeue() set it - confirm.
+ */
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
+ }
+ /* Retry the same buffer until the send no longer fails */
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ /* (Ab)use interruptible sleep to avoid increasing
+ * the load avg.
+ */
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop());
+ if (unlikely(kthread_should_stop()))
+ goto done;
+ }
+ ip_vs_sync_buff_release(sb);
+ }
+
+done:
+ __set_current_state(TASK_RUNNING);
+ /* Buffer fetched but not sent before the stop request */
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
+ /* clean up the sync_buff queue */
+ while ((sb = sb_dequeue(ipvs, ms)))
+ ip_vs_sync_buff_release(sb);
+ __set_current_state(TASK_RUNNING);
+
+ /* clean up the current sync_buff */
+ sb = get_curr_sync_buff(ipvs, ms, 0);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
+ /* release the sending multicast socket */
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo);
+
+ return 0;
+}
+
+
+/*
+ * Backup sync thread: sleeps until the receive queue of its socket is
+ * non-empty (or a stop is requested), then passes every received message
+ * to ip_vs_process_message().  On exit it releases the socket, the
+ * receive buffer and its thread data.
+ */
+static int sync_thread_backup(void *data)
+{
+	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+	struct sock *sk = tinfo->sock->sk;
+	int len;
+
+	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
+		"syncid = %d, id = %d\n",
+		ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(*sk_sleep(sk),
+			 !skb_queue_empty(&sk->sk_receive_queue) ||
+			 kthread_should_stop());
+
+		/* Drain everything that is queued right now */
+		while (!skb_queue_empty(&sk->sk_receive_queue)) {
+			len = ip_vs_receive(tinfo->sock, tinfo->buf,
+					    ipvs->recv_mesg_maxlen);
+			if (len > 0) {
+				ip_vs_process_message(tinfo->net, tinfo->buf,
+						      len);
+				continue;
+			}
+			/* -EAGAIN is expected on a non-blocking receive */
+			if (len != -EAGAIN)
+				pr_err("receiving message error\n");
+			break;
+		}
+	}
+
+	/* release the receiving multicast socket */
+	sk_release_kernel(sk);
+	kfree(tinfo->buf);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+/*
+ * Start the sync daemon threads for one role in @net.
+ *
+ * state:     IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
+ * mcast_ifn: interface name, copied into ipvs->master_mcast_ifn or
+ *            ipvs->backup_mcast_ifn
+ * syncid:    stored as ipvs->master_syncid or ipvs->backup_syncid
+ *
+ * Returns 0 on success, -EEXIST when the role is already running,
+ * -EINVAL for any other state, -ENOMEM on allocation failure, or the
+ * error from socket or thread creation.  On failure the threads started
+ * so far are stopped again.
+ */
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+{
+ struct ip_vs_sync_thread_data *tinfo;
+ struct task_struct **array = NULL, *task;
+ struct socket *sock;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ char *name;
+ int (*threadfn)(void *data);
+ int id, count;
+ int result = -ENOMEM;
+
+ IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+ IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+ sizeof(struct ip_vs_sync_conn_v0));
+
+ /*
+ * The thread count is chosen only when no daemon is running yet;
+ * otherwise the count of the running role is reused.
+ */
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
+
+ if (state == IP_VS_STATE_MASTER) {
+ if (ipvs->ms)
+ return -EEXIST;
+
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ name = "ipvs-m:%d:%d";
+ threadfn = sync_thread_master;
+ } else if (state == IP_VS_STATE_BACKUP) {
+ if (ipvs->backup_threads)
+ return -EEXIST;
+
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ name = "ipvs-b:%d:%d";
+ threadfn = sync_thread_backup;
+ } else {
+ return -EINVAL;
+ }
+
+ /* Per-role bookkeeping: ms[] for master, a task array for backup */
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
+
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
+ }
+ /* NOTE(review): the return value (possibly -ENODEV) is ignored here */
+ set_sync_mesg_maxlen(net, state);
+
+ /* tinfo is non-NULL only while this function still owns it */
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ goto outsocket;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ } else {
+ tinfo->buf = NULL;
+ }
+ tinfo->id = id;
+
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ /* The thread now owns tinfo and frees it when it exits */
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
+ }
+
+ /* mark as active */
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ return 0;
+
+ /* tinfo allocation failed: the socket is not attached to a tinfo yet */
+outsocket:
+ sk_release_kernel(sock->sk);
+
+outtinfo:
+ /* Release a tinfo that was never handed to a thread */
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ /* Stop the threads with index below id, i.e. those already started */
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
+out:
+ /* Keep ipvs->ms when a master daemon is marked as running */
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
+ return result;
+}
+
+
+/*
+ * Stop the sync daemon threads of one role in @net.
+ *
+ * state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
+ *
+ * Returns -ESRCH when that role is not running and -EINVAL for any other
+ * state; in both cases nothing is changed.  Otherwise all threads of the
+ * role are stopped, the per-role bookkeeping is freed, the module use
+ * count is decreased and the result of the kthread_stop() calls is
+ * returned (0, or the last value seen once one of them went negative).
+ */
+int stop_sync_thread(struct net *net, int state)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct task_struct **array;
+	int id;
+	int retc = -EINVAL;
+
+	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+
+	if (state == IP_VS_STATE_MASTER) {
+		if (!ipvs->ms)
+			return -ESRCH;
+
+		/*
+		 * The lock synchronizes with sb_queue_tail(), so that we don't
+		 * add sync buffers to the queue, when we are already in
+		 * progress of stopping the master sync daemon.
+		 */
+
+		spin_lock_bh(&ipvs->sync_buff_lock);
+		spin_lock(&ipvs->sync_lock);
+		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+		spin_unlock(&ipvs->sync_lock);
+		spin_unlock_bh(&ipvs->sync_buff_lock);
+
+		retc = 0;
+		for (id = ipvs->threads_mask; id >= 0; id--) {
+			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+			int ret;
+
+			pr_info("stopping master sync thread %d ...\n",
+				task_pid_nr(ms->master_thread));
+			/* No wakeup work may run once the thread is gone */
+			cancel_delayed_work_sync(&ms->master_wakeup_work);
+			ret = kthread_stop(ms->master_thread);
+			if (retc >= 0)
+				retc = ret;
+		}
+		kfree(ipvs->ms);
+		ipvs->ms = NULL;
+	} else if (state == IP_VS_STATE_BACKUP) {
+		if (!ipvs->backup_threads)
+			return -ESRCH;
+
+		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+		array = ipvs->backup_threads;
+		retc = 0;
+		for (id = ipvs->threads_mask; id >= 0; id--) {
+			int ret;
+
+			pr_info("stopping backup sync thread %d ...\n",
+				task_pid_nr(array[id]));
+			ret = kthread_stop(array[id]);
+			if (retc >= 0)
+				retc = ret;
+		}
+		kfree(array);
+		ipvs->backup_threads = NULL;
+	} else {
+		/*
+		 * Unknown state: nothing was stopped, so the module use
+		 * count must stay as it is (start_sync_thread() rejects
+		 * such a state before increasing it).
+		 */
+		return retc;
+	}
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return retc;
+}
+
+/*
+ * Per-netns setup of the sync subsystem: initializes the sync mutex and
+ * the two sync spinlocks.  Always returns 0.
+ */
+int __net_init ip_vs_sync_net_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	spin_lock_init(&ipvs->sync_buff_lock);
+	spin_lock_init(&ipvs->sync_lock);
+	__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
+
+	return 0;
+}
+
+/*
+ * Per-netns teardown: stops the master and then the backup daemon under
+ * ipvs->sync_mutex.  A daemon that is not running (-ESRCH) is not
+ * reported as an error.
+ */
+void ip_vs_sync_net_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	int err;
+
+	mutex_lock(&ipvs->sync_mutex);
+
+	err = stop_sync_thread(net, IP_VS_STATE_MASTER);
+	if (err != 0 && err != -ESRCH)
+		pr_err("Failed to stop Master Daemon\n");
+
+	err = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+	if (err != 0 && err != -ESRCH)
+		pr_err("Failed to stop Backup Daemon\n");
+
+	mutex_unlock(&ipvs->sync_mutex);
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
new file mode 100644
index 00000000000..b5b4650d50a
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -0,0 +1,115 @@
+/*
+ * IPVS: Weighted Least-Connection Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Weighted Least Connection scheduling
+ *
+ * Returns the destination of svc with the lowest overhead/weight ratio
+ * among those not flagged IP_VS_DEST_F_OVERLOAD, or NULL (after
+ * reporting a scheduler error) when no destination is both not
+ * overloaded and of positive weight.
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ /*
+ * First stage: find the first usable destination.  least and loh are
+ * assigned only on the path that jumps to nextstage.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ /* Continues after the entry found above; dest still points at it */
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* Operands are widened to 64 bits before the cross-multiplication */
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/* Scheduler descriptor registered under the name "wlc" */
+static struct ip_vs_scheduler ip_vs_wlc_scheduler = {
+	.name		= "wlc",
+	.refcnt		= ATOMIC_INIT(0),
+	.module		= THIS_MODULE,
+	.n_list		= LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+	.schedule	= ip_vs_wlc_schedule,
+};
+
+
+/* Module init: register the WLC scheduler and return the result */
+static int __init ip_vs_wlc_init(void)
+{
+	int ret = register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+
+	return ret;
+}
+
+/* Module exit: unregister the WLC scheduler, then wait for an RCU grace period */
+static void __exit ip_vs_wlc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+
+	synchronize_rcu();
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 00000000000..0546cd572d6
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,270 @@
+/*
+ * IPVS: Weighted Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wrr_update_svc
+ * Julian Anastasov : fixed the bug of returning destination
+ * with weight 0 when all weights are zero
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/gcd.h>
+
+#include <net/ip_vs.h>
+
+/* The WRR algorithm depends on some calculations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, it is possible some dest weight to be reduced
+ * below di, bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
+/*
+ * Per-service scheduling state for weighted round-robin, stored in
+ * svc->sched_data.
+ */
+struct ip_vs_wrr_mark {
+	/* current dest or head */
+	struct ip_vs_dest *cl;
+	/* current weight */
+	int cw;
+	/* maximum weight */
+	int mw;
+	/* decreasing interval */
+	int di;
+	/* used to free the mark with kfree_rcu() */
+	struct rcu_head		rcu_head;
+};
+
+
+/*
+ * Greatest common divisor of all positive destination weights of the
+ * service.  Returns 1 when no destination has a positive weight.
+ */
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int acc = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int w = atomic_read(&dest->weight);
+
+		/* Destinations with weight <= 0 do not take part */
+		if (w <= 0)
+			continue;
+
+		acc = (acc > 0) ? gcd(w, acc) : w;
+	}
+
+	return acc ? acc : 1;
+}
+
+
+/*
+ * Largest weight among the service destinations; 0 when the list is
+ * empty or no weight is positive.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int best = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int w = atomic_read(&dest->weight);
+
+		if (w > best)
+			best = w;
+	}
+
+	return best;
+}
+
+
+/*
+ * Allocate and initialize the WRR mark of a service.
+ *
+ * The mark starts at the list head with cw = mw, where mw is the maximum
+ * weight reduced by (di - 1).  Returns 0, or -ENOMEM when the allocation
+ * fails.
+ */
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark;
+
+	mark = kmalloc(sizeof(*mark), GFP_KERNEL);
+	if (!mark)
+		return -ENOMEM;
+
+	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+	mark->cw = mark->mw;
+
+	svc->sched_data = mark;
+	return 0;
+}
+
+
+/*
+ * Release the WRR mark of a service through kfree_rcu().
+ */
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *wrr = svc->sched_data;
+
+	kfree_rcu(wrr, rcu_head);
+}
+
+
+/*
+ * Recompute the WRR mark after a destination was added, removed or
+ * updated.  Runs under svc->sched_lock and always returns 0.
+ *
+ * The position is reset to the list head and di/mw are recalculated.
+ * cw is reset to mw when it is 0 or above the new mw; otherwise, for
+ * di > 1, it is set to (cw / di) * di + 1.
+ */
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+				  struct ip_vs_dest *dest)
+{
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+	spin_lock_bh(&svc->sched_lock);
+
+	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+
+	if (!mark->cw || mark->cw > mark->mw) {
+		mark->cw = mark->mw;
+	} else if (mark->di > 1) {
+		mark->cw = (mark->cw / mark->di) * mark->di + 1;
+	}
+
+	spin_unlock_bh(&svc->sched_lock);
+
+	return 0;
+}
+
+
+/*
+ * Weighted Round-Robin Scheduling
+ *
+ * Runs under svc->sched_lock.  Starting after the remembered position
+ * mark->cl, returns the next destination that is not overloaded and has
+ * weight >= mark->cw, lowering cw by di after each full traversal.
+ * Returns NULL after reporting a scheduler error when mark->mw is 0 or
+ * when no destination qualified on the final pass.  mark->cl is updated
+ * on every exit path.
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *last, *stop = NULL;
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+ bool last_pass = false, restarted = false;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ /* last remembers where this call started */
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
+ while (1) {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ /* stop is set only for the wrap-around part of the last pass */
+ if (dest == stop)
+ goto err_over;
+ }
+ /* Full traversal without a match: lower the required weight */
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
+ }
+ }
+
+found:
+ IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt),
+ atomic_read(&dest->weight));
+ /* Remember the chosen dest; the next call continues after it */
+ mark->cl = dest;
+
+ out:
+ spin_unlock_bh(&svc->sched_lock);
+ return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
+}
+
+
+/*
+ * Scheduler descriptor registered under the name "wrr".  The same
+ * handler serves add, delete and update of a destination.
+ */
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+	.name		= "wrr",
+	.refcnt		= ATOMIC_INIT(0),
+	.module		= THIS_MODULE,
+	.n_list		= LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+	.init_service	= ip_vs_wrr_init_svc,
+	.done_service	= ip_vs_wrr_done_svc,
+	.add_dest	= ip_vs_wrr_dest_changed,
+	.del_dest	= ip_vs_wrr_dest_changed,
+	.upd_dest	= ip_vs_wrr_dest_changed,
+	.schedule	= ip_vs_wrr_schedule,
+};
+
+/* Module init: register the WRR scheduler and return the result */
+static int __init ip_vs_wrr_init(void)
+{
+	int ret = register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+
+	return ret;
+}
+
+/* Module exit: unregister the WRR scheduler, then wait for an RCU grace period */
+static void __exit ip_vs_wrr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+
+	synchronize_rcu();
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
new file mode 100644
index 00000000000..73ba1cc7a88
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -0,0 +1,1266 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h> /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+/* Bit flags for the rt_mode argument of the route lookup helpers below. */
+enum {
+	IP_VS_RT_MODE_LOCAL	= 1 << 0,	/* Allow local dest */
+	IP_VS_RT_MODE_NON_LOCAL	= 1 << 1,	/* Allow non-local dest */
+	IP_VS_RT_MODE_RDR	= 1 << 2,	/* Allow redirect from remote
+						 * daddr to local
+						 */
+	IP_VS_RT_MODE_CONNECT	= 1 << 3,	/* Always bind route to saddr */
+	IP_VS_RT_MODE_KNOWN_NH	= 1 << 4,	/* Route via remote addr */
+	IP_VS_RT_MODE_TUNNEL	= 1 << 5,	/* Tunnel mode */
+};
+
+/*
+ * Allocate a zeroed route-cache entry for a destination.
+ *
+ * GFP_ATOMIC is used because the callers in this file allocate while
+ * inside an RCU read-side section on the transmit path.
+ *
+ * The entry must be zeroed: __ip_vs_get_out_rt() passes
+ * &dest_dst->dst_saddr.ip to do_output_route4(), which reads *saddr when
+ * IP_VS_RT_MODE_CONNECT is set. With plain kmalloc() that read would see
+ * uninitialized memory and could seed the route lookup with a garbage
+ * source address.
+ *
+ * Returns the new entry, or NULL on allocation failure.
+ */
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+	return kzalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+/*
+ * Free a route-cache entry obtained from ip_vs_dest_dst_alloc().
+ * Only the entry itself is freed; nothing is done with dst_cache here.
+ */
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
+/*
+ * Destination cache to speed up outgoing route lookup
+ */
+/*
+ * Publish a new cached route entry for @dest and retire the previous one.
+ *
+ * @dest_dst may be NULL to clear the cache; otherwise it is filled with
+ * @dst and @dst_cookie before being published. The old-entry lookup is
+ * annotated with lockdep_is_held(&dest->dst_lock), so callers are expected
+ * to hold dest->dst_lock.
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
+
+ /* fill in the new entry before it becomes visible to readers */
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ /* the old entry is handed to call_rcu() rather than freed directly */
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+}
+
+/*
+ * Return the cached route entry of @dest if it is still usable.
+ *
+ * NULL is returned when nothing is cached, or when the cached dst is
+ * marked obsolete and its ->check() hook rejects it for the stored cookie.
+ */
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_dst *cached = rcu_dereference(dest->dest_dst);
+	struct dst_entry *entry;
+
+	if (!cached)
+		return NULL;
+
+	entry = cached->dst_cache;
+	/* a dst that is not obsolete needs no revalidation */
+	if (!entry->obsolete)
+		return cached;
+
+	return entry->ops->check(entry, cached->dst_cookie) ? cached : NULL;
+}
+
+/*
+ * Decide whether an IPv6 packet is too big for @mtu.
+ *
+ * A non-zero frag_max_size means the packet was defragmented by the
+ * netfilter IPv6 conntrack module; in that case only the largest
+ * original fragment is compared against @mtu. Otherwise the packet
+ * length is compared, and GSO packets are never reported as too big.
+ */
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+	if (IP6CB(skb)->frag_max_size)
+		return IP6CB(skb)->frag_max_size > mtu;
+
+	return skb->len > mtu && !skb_is_gso(skb);
+}
+
+/*
+ * Get route to daddr, update *saddr, optionally bind route to saddr.
+ *
+ * @rt_mode: IP_VS_RT_MODE_CONNECT seeds the lookup with *saddr and binds
+ *           the route to a source address; IP_VS_RT_MODE_KNOWN_NH sets
+ *           FLOWI_FLAG_KNOWN_NH on the lookup.
+ * @saddr:   read on entry when IP_VS_RT_MODE_CONNECT is set, so it must
+ *           point to initialized storage; on success it receives the
+ *           source address from the flow.
+ *
+ * Returns the route, or NULL if the lookup failed.
+ */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0; /* non-zero once the lookup was redone with a chosen saddr */
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
+ fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
+ FLOWI_FLAG_KNOWN_NH : 0;
+
+retry:
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ /* Invalid saddr ? */
+ /* retry once without a source address, unless we already looped */
+ if (PTR_ERR(rt) == -EINVAL && *saddr &&
+ rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
+ *saddr = 0;
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
+ goto retry;
+ }
+ IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
+ return NULL;
+ } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ /*
+ * The lookup picked a source address for us: drop this route
+ * and look up again with that address set in the flow.
+ */
+ ip_rt_put(rt);
+ *saddr = fl4.saddr;
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
+ loop++;
+ goto retry;
+ }
+ *saddr = fl4.saddr;
+ return rt;
+}
+
+/*
+ * Get route to destination or remote server.
+ *
+ * @skb:       packet being transmitted; its dst is replaced on success
+ *             for non-local routes
+ * @dest:      real server whose cached route is used or refreshed, or
+ *             NULL to do an uncached lookup for @daddr
+ * @daddr:     address to route to when @dest is NULL
+ * @rt_mode:   IP_VS_RT_MODE_* flags restricting acceptable routes
+ * @ret_saddr: if not NULL, receives the source address for the route
+ *
+ * Returns 1 if the route is local (skb keeps its original route),
+ * 0 if the route is non-local (skb dst now points to it), -1 on error.
+ */
+static int
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+ __be32 daddr, int rt_mode, __be32 *ret_saddr)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest_dst *dest_dst;
+ struct rtable *rt; /* Route to the other host */
+ struct rtable *ort; /* Original route */
+ struct iphdr *iph;
+ __be16 df;
+ int mtu;
+ int local, noref = 1; /* noref: rt comes from the dest cache, no ref held */
+
+ if (dest) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ /* cache miss: allocate before taking the lock */
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
+ if (!rt) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
+ }
+ daddr = dest->addr.ip;
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.ip;
+ } else {
+ __be32 saddr = htonl(INADDR_ANY);
+
+ /* uncached lookup: we own a reference on rt from here on */
+ noref = 0;
+
+ /* For such unconfigured boxes avoid many route lookups
+ * for performance reasons because we do not remember saddr
+ */
+ rt_mode &= ~IP_VS_RT_MODE_CONNECT;
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ if (!rt)
+ goto err_unreach;
+ if (ret_saddr)
+ *ret_saddr = saddr;
+ }
+
+ /* reject routes whose locality is not permitted by rt_mode */
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
+ if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+ rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+ (rt->rt_flags & RTCF_LOCAL) ?
+ "local":"non-local", &daddr);
+ goto err_put;
+ }
+ iph = ip_hdr(skb);
+ if (likely(!local)) {
+ if (unlikely(ipv4_is_loopback(iph->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI4 to non-local address, dest: %pI4\n",
+ &iph->saddr, &daddr);
+ goto err_put;
+ }
+ } else {
+ ort = skb_rtable(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !(ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
+ "local requires NAT method, dest: %pI4\n",
+ &iph->daddr, &daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
+ }
+
+ /* only non-local routes reach this point */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ df = iph->frag_off & htons(IP_DF);
+ } else {
+ struct sock *sk = skb->sk;
+
+ /* leave room for the outer IPv4 header added by the tunnel */
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
+ }
+ ort = skb_rtable(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ /* MTU check allowed? */
+ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0;
+ }
+
+ /* MTU checking */
+ if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr);
+ goto err_put;
+ }
+
+ /* attach the new route to the skb */
+ skb_dst_drop(skb);
+ if (noref) {
+ /* NOTE(review): local is always 0 here, so the else arm looks unreachable */
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ /* only an uncached lookup holds a reference that must be dropped */
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * An IPv6 route counts as local when it has an output device and that
+ * device has IFF_LOOPBACK set. Returns 1 for local, 0 otherwise.
+ */
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+	if (!rt->dst.dev)
+		return 0;
+
+	return (rt->dst.dev->flags & IFF_LOOPBACK) ? 1 : 0;
+}
+
+/*
+ * Look up an IPv6 output route to @daddr.
+ *
+ * @ret_saddr: if NULL, the route is returned straight after the lookup
+ *             (no source address selection, no xfrm lookup); otherwise
+ *             it receives the source address chosen for the flow.
+ * @do_xfrm:   if non-zero (and @ret_saddr is set), pass the route
+ *             through xfrm_lookup().
+ *
+ * Returns the dst entry, or NULL on failure.
+ */
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+ struct in6_addr *ret_saddr, int do_xfrm)
+{
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .daddr = *daddr,
+ };
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error)
+ goto out_err;
+ if (!ret_saddr)
+ return dst;
+ /* no source address in the flow yet: pick one for the output device */
+ if (ipv6_addr_any(&fl6.saddr) &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl6.daddr, 0, &fl6.saddr) < 0)
+ goto out_err;
+ if (do_xfrm) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ /* clear the error pointer so out_err does not release it */
+ dst = NULL;
+ goto out_err;
+ }
+ }
+ *ret_saddr = fl6.saddr;
+ return dst;
+
+out_err:
+ dst_release(dst);
+ IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+ return NULL;
+}
+
+/*
+ * Get route to destination or remote server (IPv6).
+ *
+ * @skb:       packet being transmitted; its dst is replaced on success
+ *             for non-local routes
+ * @dest:      real server whose cached route is used or refreshed, or
+ *             NULL to do an uncached lookup for @daddr
+ * @daddr:     address to route to when @dest is NULL
+ * @ret_saddr: if not NULL, receives the source address for the route
+ * @ipvsh:     parsed header info; fragoffs decides whether ICMPv6
+ *             "packet too big" is sent
+ * @do_xfrm:   passed on to __ip_vs_route_output_v6()
+ * @rt_mode:   IP_VS_RT_MODE_* flags restricting acceptable routes
+ *
+ * Returns 1 if the route is local (skb keeps its original route),
+ * 0 if the route is non-local (skb dst now points to it), -1 on error.
+ */
+static int
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct in6_addr *daddr, struct in6_addr *ret_saddr,
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
+ struct rt6_info *rt; /* Route to the other host */
+ struct rt6_info *ort; /* Original route */
+ struct dst_entry *dst;
+ int mtu;
+ int local, noref = 1; /* noref: rt comes from the dest cache, no ref held */
+
+ if (dest) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
+ u32 cookie;
+
+ /* cache miss: allocate before taking the lock */
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+ &dest_dst->dst_saddr.in6,
+ do_xfrm);
+ if (!dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ rt = (struct rt6_info *) dst;
+ /* cookie later handed to dst->ops->check() by __ip_vs_dst_check() */
+ cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
+ atomic_read(&rt->dst.__refcnt));
+ }
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.in6;
+ } else {
+ /* uncached lookup: we own a reference on rt from here on */
+ noref = 0;
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ if (!dst)
+ goto err_unreach;
+ rt = (struct rt6_info *) dst;
+ }
+
+ /* reject routes whose locality is not permitted by rt_mode */
+ local = __ip_vs_is_local_route6(rt);
+ if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+ rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
+ local ? "local":"non-local", daddr);
+ goto err_put;
+ }
+ if (likely(!local)) {
+ if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI6c to non-local address, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ goto err_put;
+ }
+ } else {
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !__ip_vs_is_local_route6(ort)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6c "
+ "to local requires NAT method, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
+ }
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ struct sock *sk = skb->sk;
+
+ /* leave room for the outer IPv6 header added by the tunnel */
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ }
+
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ /* give the skb a device before calling icmpv6_send() */
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr);
+ goto err_put;
+ }
+
+ /* attach the new route to the skb */
+ skb_dst_drop(skb);
+ if (noref) {
+ /* NOTE(review): local is always 0 here, so the else arm looks unreachable */
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ /* only an uncached lookup holds a reference that must be dropped */
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
+}
+#endif
+
+
+/*
+ * Last step before a tunneled packet is sent.
+ *
+ * Marks the skb as IPVS property and, for connections with
+ * IP_VS_CONN_F_NFCT, confirms conntrack. Returns NF_ACCEPT when the
+ * caller may send the packet (netfilter state reset, checksum state
+ * adjusted), or the non-accept verdict from conntrack confirmation.
+ */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+					    struct ip_vs_conn *cp)
+{
+	skb->ipvs_property = 1;
+
+	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) {
+		int verdict = ip_vs_confirm_conntrack(skb);
+
+		if (verdict != NF_ACCEPT)
+			return verdict;
+	}
+
+	nf_reset(skb);
+	skb_forward_csum(skb);
+	return NF_ACCEPT;
+}
+
+/*
+ * Send a NAT-ed packet, or let it continue through the stack.
+ *
+ * Connections without IP_VS_CONN_F_NFCT are marked untracked; the others
+ * get their conntrack entry updated. Returns NF_ACCEPT without sending
+ * when @local is set, otherwise sends the packet through the LOCAL_OUT
+ * hook and returns NF_STOLEN.
+ */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+					 struct ip_vs_conn *cp, int local)
+{
+	skb->ipvs_property = 1;
+
+	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_update_conntrack(skb, cp, 1);
+	else
+		ip_vs_notrack(skb);
+
+	if (local)
+		return NF_ACCEPT;
+
+	skb_forward_csum(skb);
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+		dst_output);
+	return NF_STOLEN;
+}
+
+/*
+ * Send a packet, or let it continue through the stack.
+ *
+ * Connections without IP_VS_CONN_F_NFCT are marked untracked. Returns
+ * NF_ACCEPT without sending when @local is set, otherwise sends the
+ * packet through the LOCAL_OUT hook and returns NF_STOLEN.
+ */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+				     struct ip_vs_conn *cp, int local)
+{
+	skb->ipvs_property = 1;
+
+	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+		ip_vs_notrack(skb);
+
+	if (local)
+		return NF_ACCEPT;
+
+	skb_forward_csum(skb);
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+		dst_output);
+	return NF_STOLEN;
+}
+
+
+/*
+ * NULL transmitter (do nothing except return NF_ACCEPT)
+ *
+ * Goes through ip_vs_send_or_cont() with local=1, so the skb is only
+ * marked as IPVS property (and untracked unless the connection has
+ * IP_VS_CONN_F_NFCT); it is not sent. @pp and @ipvsh are unused.
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ /* we do not touch skb and do not need pskb ptr */
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+}
+
+
+/*
+ * Bypass transmitter
+ *
+ * Forwards the packet to the destination address it already carries
+ * (no real server is involved), accepting only non-local routes. It may
+ * be only used in transparent cache cluster. The skb is always consumed:
+ * it is either sent or freed, and NF_STOLEN is returned in both cases.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	EnterFunction(10);
+
+	rcu_read_lock();
+	if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+			       NULL) < 0) {
+		/* no usable route: drop the packet */
+		kfree_skb(skb);
+		goto done;
+	}
+
+	ip_send_check(iph);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->ignore_df = 1;
+
+	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+
+done:
+	rcu_read_unlock();
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 bypass transmitter: forwards the packet to the destination
+ * address taken from @ipvsh, accepting only non-local routes. The skb
+ * is always consumed (sent or freed) and NF_STOLEN is returned.
+ */
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+	EnterFunction(10);
+
+	rcu_read_lock();
+	if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
+				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) {
+		/* no usable route: drop the packet */
+		kfree_skb(skb);
+		goto done;
+	}
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->ignore_df = 1;
+
+	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+
+done:
+	rcu_read_unlock();
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+/*
+ * NAT transmitter (only for outside-to-inside nat forwarding)
+ * Not used for related ICMP
+ *
+ * Routes the packet to cp->dest, rewrites the destination address to
+ * cp->daddr.ip (and lets the protocol's dnat_handler mangle the payload)
+ * and then sends it or hands it back to the stack.
+ *
+ * Returns the verdict of ip_vs_nat_send_or_cont() on success (NF_STOLEN
+ * when sent, NF_ACCEPT for a local destination); on any error the skb
+ * is freed and NF_STOLEN is returned.
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rtable *rt; /* Route to the other host */
+ int local, rc, was_input;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* check if it is a connection of no-client-port */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ __be16 _pt, *p;
+
+ /* the first 16 bits after the IP header are used as the client port */
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ /* sample the original route type before the skb dst may be replaced */
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): "
+ "stopping DNAT to local address");
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+ "stopping DNAT to loopback address");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ goto tx_error;
+
+ /* make sure there is headroom for the output device's link header */
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
+ ip_hdr(skb)->daddr = cp->daddr.ip;
+ ip_send_check(ip_hdr(skb));
+
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+
+ /* FIXME: when application helper enlarges the packet and the length
+ is larger than the MTU of outgoing device, there will be still
+ MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return rc;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 NAT transmitter.
+ *
+ * Routes the packet to cp->dest, rewrites the destination address to
+ * cp->daddr.in6 (and lets the protocol's dnat_handler mangle the
+ * payload) and then sends it or hands it back to the stack.
+ *
+ * Returns the verdict of ip_vs_nat_send_or_cont() on success; on any
+ * error the skb is freed and NF_STOLEN is returned.
+ */
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ int local, rc;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* check if it is a connection of no-client-port */
+ /* only done when ipvsh->fragoffs is zero */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
+ __be16 _pt, *p;
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to local address");
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to loopback address");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ goto tx_error;
+
+ /* make sure there is headroom for the output device's link header */
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
+ ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+
+ /* FIXME: when application helper enlarges the packet and the length
+ is larger than the MTU of outgoing device, there will be still
+ MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return rc;
+
+tx_error:
+ /* NOTE(review): the other transmitters call LeaveFunction() after the unlock */
+ LeaveFunction(10);
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * IP Tunneling transmitter
+ *
+ * This function encapsulates the packet in a new IP packet, its
+ * destination will be set to cp->daddr. Most code of this function
+ * is taken from ipip.c.
+ *
+ * It is used in VS/TUN cluster. The load balancer selects a real
+ * server from a cluster based on a scheduling algorithm,
+ * encapsulates the request packet and forwards it to the selected
+ * server. For example, all real servers are configured with
+ * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ * the encapsulated packet, it will decapsulate the packet, process
+ * the request and return the response packets directly to the client
+ * without passing the load balancer. This can greatly increase the
+ * scalability of virtual server.
+ *
+ * Used for ANY protocol
+ *
+ * Returns NF_STOLEN once the skb has been sent or freed; for a local
+ * destination the packet is not encapsulated and the verdict of
+ * ip_vs_send_or_cont() is returned instead.
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ struct rtable *rt; /* Route to the other host */
+ __be32 saddr; /* Source for tunnel */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *old_iph = ip_hdr(skb);
+ u8 tos = old_iph->tos;
+ __be16 df;
+ struct iphdr *iph; /* Our new IP header */
+ unsigned int max_headroom; /* The extra header space needed */
+ int ret, local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ /* local destination: no encapsulation, continue in the stack */
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+ }
+
+ rt = skb_rtable(skb);
+ tdev = rt->dst.dev;
+
+ /* Copy DF, reset fragment offset and MF */
+ df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
+ struct sk_buff *new_skb =
+ skb_realloc_headroom(skb, max_headroom);
+
+ if (!new_skb)
+ goto tx_error;
+ consume_skb(skb);
+ skb = new_skb;
+ /* the header now lives in the new buffer */
+ old_iph = ip_hdr(skb);
+ }
+
+ /* the original IP header becomes the payload of the outer header */
+ skb->transport_header = skb->network_header;
+
+ /* fix old IP header checksum */
+ ip_send_check(old_iph);
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_IPIP;
+ iph->tos = tos;
+ iph->daddr = cp->daddr.ip;
+ iph->saddr = saddr;
+ iph->ttl = old_iph->ttl;
+ ip_select_ident(skb, NULL);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 tunneling transmitter: encapsulates the packet in a new IPv6
+ * header addressed to cp->daddr.in6 and sends it with ip6_local_out().
+ *
+ * Returns NF_STOLEN once the skb has been sent or freed; for a local
+ * destination the packet is not encapsulated and the verdict of
+ * ip_vs_send_or_cont() is returned instead.
+ */
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ struct in6_addr saddr; /* Source for tunnel */
+ struct net_device *tdev; /* Device to other host */
+ struct ipv6hdr *old_iph = ipv6_hdr(skb);
+ struct ipv6hdr *iph; /* Our new IP header */
+ unsigned int max_headroom; /* The extra header space needed */
+ int ret, local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* do_xfrm=1: this is the only caller in this file that asks for xfrm */
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ /* local destination: no encapsulation, continue in the stack */
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+ }
+
+ rt = (struct rt6_info *) skb_dst(skb);
+ tdev = rt->dst.dev;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
+ struct sk_buff *new_skb =
+ skb_realloc_headroom(skb, max_headroom);
+
+ if (!new_skb)
+ goto tx_error;
+ consume_skb(skb);
+ skb = new_skb;
+ /* the header now lives in the new buffer */
+ old_iph = ipv6_hdr(skb);
+ }
+
+ /* the original IPv6 header becomes the payload of the outer header */
+ skb->transport_header = skb->network_header;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ /* NOTE(review): this clears the IPv4 control block (IPCB) on an IPv6 skb - confirm intended */
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = ipv6_hdr(skb);
+ iph->version = 6;
+ iph->nexthdr = IPPROTO_IPV6;
+ /* outer payload = inner payload plus the inner IPv6 header */
+ iph->payload_len = old_iph->payload_len;
+ be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
+ iph->priority = old_iph->priority;
+ memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
+ iph->daddr = cp->daddr.in6;
+ iph->saddr = saddr;
+ iph->hop_limit = old_iph->hop_limit;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip6_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * Direct Routing transmitter
+ * Used for ANY protocol
+ *
+ * The packet is routed to the real server without being rewritten.
+ * A local destination is handed back to the stack through
+ * ip_vs_send_or_cont(); otherwise the skb is sent (or freed when no
+ * route is available) and NF_STOLEN is returned.
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+	int local;
+
+	EnterFunction(10);
+
+	rcu_read_lock();
+	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				   IP_VS_RT_MODE_LOCAL |
+				   IP_VS_RT_MODE_NON_LOCAL |
+				   IP_VS_RT_MODE_KNOWN_NH, NULL);
+	if (local < 0) {
+		/* no usable route: drop the packet */
+		kfree_skb(skb);
+		goto done;
+	}
+	if (local) {
+		rcu_read_unlock();
+		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+	}
+
+	ip_send_check(ip_hdr(skb));
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->ignore_df = 1;
+
+	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+
+done:
+	rcu_read_unlock();
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 direct routing transmitter: the packet is routed to the real
+ * server without being rewritten. A local destination is handed back to
+ * the stack through ip_vs_send_or_cont(); otherwise the skb is sent (or
+ * freed when no route is available) and NF_STOLEN is returned.
+ */
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+	int local;
+
+	EnterFunction(10);
+
+	rcu_read_lock();
+	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+				      ipvsh, 0,
+				      IP_VS_RT_MODE_LOCAL |
+				      IP_VS_RT_MODE_NON_LOCAL);
+	if (local < 0) {
+		/* no usable route: drop the packet */
+		kfree_skb(skb);
+		goto done;
+	}
+	if (local) {
+		rcu_read_unlock();
+		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+	}
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->ignore_df = 1;
+
+	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+
+done:
+	rcu_read_unlock();
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * ICMP packet transmitter
+ * called by the ip_vs_in_icmp
+ *
+ * @offset:  number of bytes from the start of the packet that must be
+ *           writable before the ICMP payload is mangled
+ * @hooknum: netfilter hook we were called from; from NF_INET_FORWARD
+ *           only non-local destinations are accepted
+ *
+ * For forwarding methods other than VS/NAT the packet is passed to
+ * cp->packet_xmit() unchanged. For VS/NAT it is mangled with
+ * ip_vs_nat_icmp() and sent. Returns the resulting netfilter verdict;
+ * on error the skb is freed and NF_STOLEN is returned.
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
+{
+ struct rtable *rt; /* Route to the other host */
+ int rc;
+ int local;
+ int rt_mode, was_input;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp, iph);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+ /* sample the original route type before the skb dst may be replaced */
+ was_input = rt_is_input_route(skb_rtable(skb));
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
+
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, offset))
+ goto tx_error;
+
+ /* make sure there is headroom for the output device's link header */
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ ip_vs_nat_icmp(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
+ goto out;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ rc = NF_STOLEN;
+ out:
+ LeaveFunction(10);
+ return rc;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * ICMPv6 packet transmitter.
+ *
+ * @offset:  number of bytes from the start of the packet that must be
+ *           writable before the ICMP payload is mangled
+ * @hooknum: netfilter hook we were called from; from NF_INET_FORWARD
+ *           only non-local destinations are accepted
+ *
+ * For forwarding methods other than VS/NAT the packet is passed to
+ * cp->packet_xmit() unchanged. For VS/NAT it is mangled with
+ * ip_vs_nat_icmp_v6() and sent. Returns the resulting netfilter verdict;
+ * on error the skb is freed and NF_STOLEN is returned.
+ */
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ int rc;
+ int local;
+ int rt_mode;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, offset))
+ goto tx_error;
+
+ /* make sure there is headroom for the output device's link header */
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
+ goto out;
+
+tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ rc = NF_STOLEN;
+out:
+ LeaveFunction(10);
+ return rc;
+}
+#endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
new file mode 100644
index 00000000000..a4b5e2a435a
--- /dev/null
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -0,0 +1,133 @@
+/* Accounting handling for netfilter. */
+
+/*
+ * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+static bool nf_ct_acct __read_mostly;
+
+module_param_named(acct, nf_ct_acct, bool, 0644);
+MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Template for the "nf_conntrack_acct" sysctl entry. .data points at the
+ * init_net flag here; nf_conntrack_acct_init_sysctl() works on a kmemdup()ed
+ * copy and redirects .data to the per-namespace net->ct.sysctl_acct.
+ */
+static struct ctl_table acct_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_acct",
+ .data = &init_net.ct.sysctl_acct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Print the packet and byte counters of one direction of a conntrack entry.
+ *
+ * @s:   seq_file to print into
+ * @ct:  conntrack entry whose accounting extension is looked up
+ * @dir: index into the per-direction counter array
+ *
+ * Returns 0 when @ct carries no accounting extension, otherwise whatever
+ * seq_printf() returns.
+ */
+unsigned int
+seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
+{
+	struct nf_conn_acct *acct;
+	struct nf_conn_counter *counter;
+
+	acct = nf_conn_acct_find(ct);
+	if (!acct)
+		return 0;
+
+	counter = acct->counter;
+	return seq_printf(s, "packets=%llu bytes=%llu ",
+			  (unsigned long long)atomic64_read(&counter[dir].packets),
+			  (unsigned long long)atomic64_read(&counter[dir].bytes));
+}
+EXPORT_SYMBOL_GPL(seq_print_acct);
+
+/*
+ * Descriptor of the accounting conntrack extension (NF_CT_EXT_ACCT), sized
+ * and aligned for struct nf_conn_acct. Registered by nf_conntrack_acct_init()
+ * and unregistered by nf_conntrack_acct_fini().
+ */
+static struct nf_ct_ext_type acct_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_acct),
+ .align = __alignof__(struct nf_conn_acct),
+ .id = NF_CT_EXT_ACCT,
+};
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the "nf_conntrack_acct" sysctl under "net/netfilter" for @net.
+ *
+ * A private copy of acct_sysctl_table is made so that .data can point at
+ * this namespace's own flag. The resulting header is stored in
+ * net->ct.acct_sysctl_header.
+ *
+ * Returns 0 on success and -ENOMEM if either the table copy or the sysctl
+ * registration fails.
+ */
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	/* the template points at init_net; use this namespace's flag */
+	table[0].data = &net->ct.sysctl_acct;
+
+	/* Don't export sysctls to unprivileged users */
+	if (net->user_ns != &init_user_ns)
+		table[0].procname = NULL;
+
+	net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
+							  table);
+	if (!net->ct.acct_sysctl_header) {
+		/* pr_err() for consistency with nf_conntrack_acct_init() */
+		pr_err("nf_conntrack_acct: can't register to sysctl.\n");
+		goto out_register;
+	}
+	return 0;
+
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Undo nf_conntrack_acct_init_sysctl() for @net: unregister the sysctl
+ * header and free the table reachable through its ctl_table_arg
+ * (presumably the kmemdup()ed copy made at init time -- TODO confirm).
+ */
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ /* fetch the table pointer through the header before unregistering it */
+ table = net->ct.acct_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.acct_sysctl_header);
+ kfree(table);
+}
+#else
+/* Built without CONFIG_SYSCTL: nothing to register, always succeeds. */
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+/* Built without CONFIG_SYSCTL: nothing was registered, nothing to undo. */
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+/*
+ * Per-namespace setup: seed the namespace's accounting flag from the "acct"
+ * module parameter, then register the sysctl. Returns the sysctl setup
+ * result.
+ */
+int nf_conntrack_acct_pernet_init(struct net *net)
+{
+	int err;
+
+	net->ct.sysctl_acct = nf_ct_acct;
+
+	err = nf_conntrack_acct_init_sysctl(net);
+	return err;
+}
+
+/* Per-namespace teardown: removes the sysctl set up by pernet_init. */
+void nf_conntrack_acct_pernet_fini(struct net *net)
+{
+ nf_conntrack_acct_fini_sysctl(net);
+}
+
+/*
+ * Register the accounting extension type. Logs an error when registration
+ * returns a negative value; the registration result is returned either way.
+ */
+int nf_conntrack_acct_init(void)
+{
+	int err;
+
+	err = nf_ct_extend_register(&acct_extend);
+	if (err >= 0)
+		return err;
+
+	pr_err("nf_conntrack_acct: Unable to register extension\n");
+	return err;
+}
+
+/* Unregister the accounting extension type registered at init. */
+void nf_conntrack_acct_fini(void)
+{
+ nf_ct_extend_unregister(&acct_extend);
+}
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
new file mode 100644
index 00000000000..b8b95f4027c
--- /dev/null
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -0,0 +1,241 @@
+/* Amanda extension for IP connection tracking
+ *
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on HW's ip_conntrack_irc.c as well as other modules
+ * (C) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/textsearch.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+#include <linux/gfp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+static unsigned int master_timeout __read_mostly = 300;
+static char *ts_algo = "kmp";
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda connection tracking module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_amanda");
+MODULE_ALIAS_NFCT_HELPER("amanda");
+
+module_param(master_timeout, uint, 0600);
+MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
+module_param(ts_algo, charp, 0400);
+MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
+
+/*
+ * Optional NAT hook. amanda_help() reads it with rcu_dereference() and, when
+ * it is set and the conntrack has a bit of IPS_NAT_MASK set, calls it instead
+ * of registering the expectation itself.
+ */
+unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+ __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_amanda_hook);
+
+/*
+ * Indices into search[]. The order matters: amanda_help() iterates from
+ * SEARCH_DATA through SEARCH_INDEX to find the per-service port keywords.
+ */
+enum amanda_strings {
+ SEARCH_CONNECT,
+ SEARCH_NEWLINE,
+ SEARCH_DATA,
+ SEARCH_MESG,
+ SEARCH_INDEX,
+};
+
+/*
+ * Text patterns looked for in Amanda packets, indexed by enum amanda_strings.
+ * .len is the length of .string without the terminating NUL; .ts is filled in
+ * by nf_conntrack_amanda_init() via textsearch_prepare().
+ */
+static struct {
+ const char *string;
+ size_t len;
+ struct ts_config *ts;
+} search[] __read_mostly = {
+ [SEARCH_CONNECT] = {
+ .string = "CONNECT ",
+ .len = 8,
+ },
+ [SEARCH_NEWLINE] = {
+ .string = "\n",
+ .len = 1,
+ },
+ [SEARCH_DATA] = {
+ .string = "DATA ",
+ .len = 5,
+ },
+ [SEARCH_MESG] = {
+ .string = "MESG ",
+ .len = 5,
+ },
+ [SEARCH_INDEX] = {
+ .string = "INDEX ",
+ .len = 6,
+ },
+};
+
+/*
+ * Conntrack helper callback for Amanda.
+ *
+ * For packets that are not in the original direction, refreshes the master
+ * connection's timeout, looks for a "CONNECT " line in the UDP payload and,
+ * for each of the DATA/MESG/INDEX keywords found on that line, parses the
+ * decimal port that follows and sets up a TCP expectation for it (through
+ * the NAT hook when one is installed and the conntrack is NATed).
+ *
+ * Returns NF_ACCEPT by default, NF_DROP when an expectation cannot be
+ * allocated or added, or the NAT hook's return value when the hook is used.
+ */
+static int amanda_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct ts_state ts;
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ unsigned int dataoff, start, stop, off, i;
+ char pbuf[sizeof("65535")], *tmp;
+ u_int16_t len;
+ __be16 port;
+ int ret = NF_ACCEPT;
+ typeof(nf_nat_amanda_hook) nf_nat_amanda;
+
+ /* Only look at packets from the Amanda server */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* increase the UDP timeout of the master connection as replies from
+ * Amanda clients to the server can be quite delayed */
+ nf_ct_refresh(ct, skb, master_timeout * HZ);
+
+ /* No data? */
+ dataoff = protoff + sizeof(struct udphdr);
+ if (dataoff >= skb->len) {
+ net_err_ratelimited("amanda_help: skblen = %u\n", skb->len);
+ return NF_ACCEPT;
+ }
+
+ /* find "CONNECT "; start becomes the skb offset just past the keyword */
+ memset(&ts, 0, sizeof(ts));
+ start = skb_find_text(skb, dataoff, skb->len,
+ search[SEARCH_CONNECT].ts, &ts);
+ if (start == UINT_MAX)
+ goto out;
+ start += dataoff + search[SEARCH_CONNECT].len;
+
+ /* find the first newline after start; stop becomes its skb offset */
+ memset(&ts, 0, sizeof(ts));
+ stop = skb_find_text(skb, start, skb->len,
+ search[SEARCH_NEWLINE].ts, &ts);
+ if (stop == UINT_MAX)
+ goto out;
+ stop += start;
+
+ /* look for each service keyword between start and stop */
+ for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
+ memset(&ts, 0, sizeof(ts));
+ off = skb_find_text(skb, start, stop, search[i].ts, &ts);
+ if (off == UINT_MAX)
+ continue;
+ off += start + search[i].len;
+
+ /* copy at most sizeof(pbuf) - 1 bytes following the keyword */
+ len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
+ if (skb_copy_bits(skb, off, pbuf, len))
+ break;
+ pbuf[len] = '\0';
+
+ /* len is reused as the number of characters the parser consumed */
+ port = htons(simple_strtoul(pbuf, &tmp, 10));
+ len = tmp - pbuf;
+ if (port == 0 || len > 5)
+ break;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+ /* expectation uses the original-direction addresses, TCP and the parsed port */
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &tuple->src.u3, &tuple->dst.u3,
+ IPPROTO_TCP, NULL, &port);
+
+ /* let the NAT hook handle it when present and the conntrack is NATed */
+ nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);
+ if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_amanda(skb, ctinfo, protoff,
+ off - dataoff, len, exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ }
+ /* drop the reference taken by nf_ct_expect_alloc() above */
+ nf_ct_expect_put(exp);
+ }
+
+out:
+ return ret;
+}
+
+/* Expectation policy shared by both entries of amanda_helper[]. */
+static const struct nf_conntrack_expect_policy amanda_exp_policy = {
+ .max_expected = 3,
+ .timeout = 180,
+};
+
+/*
+ * Helper registrations for UDP port 10080: entry 0 for AF_INET, entry 1 for
+ * AF_INET6. Apart from the l3num the two entries are identical.
+ */
+static struct nf_conntrack_helper amanda_helper[2] __read_mostly = {
+ {
+ .name = "amanda",
+ .me = THIS_MODULE,
+ .help = amanda_help,
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(10080),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .expect_policy = &amanda_exp_policy,
+ },
+ {
+ .name = "amanda",
+ .me = THIS_MODULE,
+ .help = amanda_help,
+ .tuple.src.l3num = AF_INET6,
+ .tuple.src.u.udp.port = cpu_to_be16(10080),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .expect_policy = &amanda_exp_policy,
+ },
+};
+
+/*
+ * Module exit: unregister both helpers, then destroy every textsearch
+ * configuration held in search[].
+ */
+static void __exit nf_conntrack_amanda_fini(void)
+{
+	int idx = 0;
+
+	nf_conntrack_helper_unregister(&amanda_helper[0]);
+	nf_conntrack_helper_unregister(&amanda_helper[1]);
+
+	while (idx < ARRAY_SIZE(search)) {
+		textsearch_destroy(search[idx].ts);
+		idx++;
+	}
+}
+
+/*
+ * Module init: prepare a textsearch configuration for every search[] entry
+ * using the ts_algo module parameter, then register the IPv4 and IPv6
+ * helpers.
+ *
+ * Returns 0 on success, otherwise the error from textsearch_prepare() or
+ * nf_conntrack_helper_register() after undoing whatever had been set up.
+ */
+static int __init nf_conntrack_amanda_init(void)
+{
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(search); i++) {
+ search[i].ts = textsearch_prepare(ts_algo, search[i].string,
+ search[i].len,
+ GFP_KERNEL, TS_AUTOLOAD);
+ if (IS_ERR(search[i].ts)) {
+ ret = PTR_ERR(search[i].ts);
+ goto err1;
+ }
+ }
+ ret = nf_conntrack_helper_register(&amanda_helper[0]);
+ if (ret < 0)
+ goto err1;
+ ret = nf_conntrack_helper_register(&amanda_helper[1]);
+ if (ret < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_conntrack_helper_unregister(&amanda_helper[0]);
+err1:
+ /* i is the number of entries prepared so far; destroy exactly those */
+ while (--i >= 0)
+ textsearch_destroy(search[i].ts);
+
+ return ret;
+}
+
+module_init(nf_conntrack_amanda_init);
+module_exit(nf_conntrack_amanda_fini);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 00000000000..4e99cca6161
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ * broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+/*
+ * Shared helper body for broadcast-based protocols.
+ *
+ * For a packet that has a socket attached, is routed with RTCF_BROADCAST and
+ * travels in the original direction, looks up the netmask of the primary
+ * address whose broadcast address equals the packet's destination and
+ * registers a permanent expectation for replies masked by that netmask.
+ * The conntrack timeout is then refreshed to @timeout seconds.
+ *
+ * Always returns NF_ACCEPT; failures only skip the expectation setup.
+ */
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int timeout)
+{
+ struct nf_conntrack_expect *exp;
+ struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct nf_conn_help *help = nfct_help(ct);
+ __be32 mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if (skb->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ /* find the netmask belonging to the broadcast address being sent to */
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ /* mask stays 0 when no matching address was found */
+ if (mask == 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+
+ /* start from the reply tuple, but take the source port from the helper */
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ exp->mask.src.u3.ip = mask;
+ exp->mask.src.u.udp.port = htons(0xFFFF);
+
+ exp->expectfn = NULL;
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
+ exp->helper = NULL;
+
+ /* the result of nf_ct_expect_related() is not checked here */
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+
+ nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d622ddf08bb..1f4f954c4b4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -3,31 +3,19 @@
extension. */
/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
- * - new API and handling of conntrack/nat helpers
- * - now capable of multiple expectations for one master
- * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
- * - add usage/reference counts to ip_conntrack_expect
- * - export ip_conntrack[_expect]_{find_get,put} functions
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - generalize L3 protocol denendent part.
- * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - add support various size of conntrack structures.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_core.c
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
+#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
@@ -42,354 +30,139 @@
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
-
-/* This rwlock protects the main hash table, protocol/helper/expected
- registrations, conntrack timers*/
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <linux/netfilter_ipv4/listhelp.h>
-
-#define NF_CONNTRACK_VERSION "0.4.1"
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-DEFINE_RWLOCK(nf_conntrack_lock);
-
-/* nf_conntrack_standalone needs this */
-atomic_t nf_conntrack_count = ATOMIC_INIT(0);
-
-void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
-LIST_HEAD(nf_conntrack_expect_list);
-struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
-struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
-static LIST_HEAD(helpers);
-unsigned int nf_conntrack_htable_size = 0;
-int nf_conntrack_max;
-struct list_head *nf_conntrack_hash;
-static kmem_cache_t *nf_conntrack_expect_cachep;
-struct nf_conn nf_conntrack_untracked;
-unsigned int nf_ct_log_invalid;
-static LIST_HEAD(unconfirmed);
-static int nf_conntrack_vmalloc;
-
-static unsigned int nf_conntrack_next_id = 1;
-static unsigned int nf_conntrack_expect_next_id = 1;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-struct notifier_block *nf_conntrack_chain;
-struct notifier_block *nf_conntrack_expect_chain;
-
-DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
-
-/* deliver cached events and clear cache entry - must be called with locally
- * disabled softirqs */
-static inline void
-__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
-{
- DEBUGP("ecache: delivering events for %p\n", ecache->ct);
- if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
- && ecache->events)
- notifier_call_chain(&nf_conntrack_chain, ecache->events,
- ecache->ct);
-
- ecache->events = 0;
- nf_ct_put(ecache->ct);
- ecache->ct = NULL;
-}
-
-/* Deliver all cached events for a particular conntrack. This is called
- * by code prior to async packet handling for freeing the skb */
-void nf_ct_deliver_cached_events(const struct nf_conn *ct)
-{
- struct nf_conntrack_ecache *ecache;
-
- local_bh_disable();
- ecache = &__get_cpu_var(nf_conntrack_ecache);
- if (ecache->ct == ct)
- __nf_ct_deliver_cached_events(ecache);
- local_bh_enable();
-}
-
-/* Deliver cached events for old pending events, if current conntrack != old */
-void __nf_ct_event_cache_init(struct nf_conn *ct)
-{
- struct nf_conntrack_ecache *ecache;
-
- /* take care of delivering potentially old events */
- ecache = &__get_cpu_var(nf_conntrack_ecache);
- BUG_ON(ecache->ct == ct);
- if (ecache->ct)
- __nf_ct_deliver_cached_events(ecache);
- /* initialize for this conntrack/packet */
- ecache->ct = ct;
- nf_conntrack_get(&ct->ct_general);
-}
-
-/* flush the event cache - touches other CPU's data and must not be called
- * while packets are still passing through the code */
-static void nf_ct_event_cache_flush(void)
-{
- struct nf_conntrack_ecache *ecache;
- int cpu;
-
- for_each_cpu(cpu) {
- ecache = &per_cpu(nf_conntrack_ecache, cpu);
- if (ecache->ct)
- nf_ct_put(ecache->ct);
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+
+#define NF_CONNTRACK_VERSION "0.5.0"
+
+int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr) __read_mostly;
+EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
+
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
+EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+
+/*
+ * Release the conntrack bucket locks selected by hashes @h1 and @h2. Both
+ * hashes are reduced modulo CONNTRACK_LOCKS; when they select the same lock
+ * it is released only once.
+ */
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+	const unsigned int first = h1 % CONNTRACK_LOCKS;
+	const unsigned int second = h2 % CONNTRACK_LOCKS;
+
+	spin_unlock(&nf_conntrack_locks[first]);
+	if (first == second)
+		return;
+
+	spin_unlock(&nf_conntrack_locks[second]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+ unsigned int h2, unsigned int sequence)
+{
+ h1 %= CONNTRACK_LOCKS;
+ h2 %= CONNTRACK_LOCKS;
+ if (h1 <= h2) {
+ spin_lock(&nf_conntrack_locks[h1]);
+ if (h1 != h2)
+ spin_lock_nested(&nf_conntrack_locks[h2],
+ SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&nf_conntrack_locks[h2]);
+ spin_lock_nested(&nf_conntrack_locks[h1],
+ SINGLE_DEPTH_NESTING);
+ }
+ if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ nf_conntrack_double_unlock(h1, h2);
+ return true;
}
+ return false;
}
-#else
-static inline void nf_ct_event_cache_flush(void) {}
-#endif /* CONFIG_NF_CONNTRACK_EVENTS */
-
-DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
-EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
-
-/*
- * This scheme offers various size of "struct nf_conn" dependent on
- * features(helper, nat, ...)
- */
-
-#define NF_CT_FEATURES_NAMELEN 256
-static struct {
- /* name of slab cache. printed in /proc/slabinfo */
- char *name;
-
- /* size of slab cache */
- size_t size;
-
- /* slab cache pointer */
- kmem_cache_t *cachep;
-
- /* allocated slab cache + modules which uses this slab cache */
- int use;
-
- /* Initialization */
- int (*init_conntrack)(struct nf_conn *, u_int32_t);
-
-} nf_ct_cache[NF_CT_F_NUM];
-/* protect members of nf_ct_cache except of "use" */
-DEFINE_RWLOCK(nf_ct_cache_lock);
-
-/* This avoids calling kmem_cache_create() with same name simultaneously */
-DECLARE_MUTEX(nf_ct_cache_mutex);
-
-extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
-struct nf_conntrack_protocol *
-__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
+static void nf_conntrack_all_lock(void)
{
- if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
- return &nf_conntrack_generic_protocol;
+ int i;
- return nf_ct_protos[l3proto][protocol];
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_nested(&nf_conntrack_locks[i], i);
}
-/* this is guaranteed to always return a valid protocol helper, since
- * it falls back to generic_protocol */
-struct nf_conntrack_protocol *
-nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
+static void nf_conntrack_all_unlock(void)
{
- struct nf_conntrack_protocol *p;
+ int i;
- preempt_disable();
- p = __nf_ct_proto_find(l3proto, protocol);
- if (p) {
- if (!try_module_get(p->me))
- p = &nf_conntrack_generic_protocol;
- }
- preempt_enable();
-
- return p;
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_unlock(&nf_conntrack_locks[i]);
}
-void nf_ct_proto_put(struct nf_conntrack_protocol *p)
-{
- module_put(p->me);
-}
+unsigned int nf_conntrack_htable_size __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
-struct nf_conntrack_l3proto *
-nf_ct_l3proto_find_get(u_int16_t l3proto)
-{
- struct nf_conntrack_l3proto *p;
+unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
- preempt_disable();
- p = __nf_ct_l3proto_find(l3proto);
- if (p) {
- if (!try_module_get(p->me))
- p = &nf_conntrack_generic_l3proto;
- }
- preempt_enable();
+DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
- return p;
-}
-
-void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
-{
- module_put(p->me);
-}
+unsigned int nf_conntrack_hash_rnd __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
-
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
{
- unsigned int a, b;
- a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
- ((tuple->src.l3num) << 16) | tuple->dst.protonum);
- b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
- (tuple->src.u.all << 16) | tuple->dst.u.all);
+ unsigned int n;
- return jhash_2words(a, b, rnd) % size;
+ /* The direction must be ignored, so we hash everything up to the
+ * destination ports (which is a multiple of 4) and treat the last
+ * three bytes manually.
+ */
+ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
+ return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ (((__force __u16)tuple->dst.u.all << 16) |
+ tuple->dst.protonum));
}
-static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
+static u32 __hash_bucket(u32 hash, unsigned int size)
{
- return __hash_conntrack(tuple, nf_conntrack_htable_size,
- nf_conntrack_hash_rnd);
+ return ((u64)hash * size) >> 32;
}
-/* Initialize "struct nf_conn" which has spaces for helper */
-static int
-init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
+static u32 hash_bucket(u32 hash, const struct net *net)
{
-
- conntrack->help = (union nf_conntrack_help *)
- (((unsigned long)conntrack->data
- + (__alignof__(union nf_conntrack_help) - 1))
- & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
- return 0;
+ return __hash_bucket(hash, net->ct.htable_size);
}
-int nf_conntrack_register_cache(u_int32_t features, const char *name,
- size_t size,
- int (*init)(struct nf_conn *, u_int32_t))
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ u16 zone, unsigned int size)
{
- int ret = 0;
- char *cache_name;
- kmem_cache_t *cachep;
-
- DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
- features, name, size);
-
- if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
- DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
- features);
- return -EINVAL;
- }
-
- down(&nf_ct_cache_mutex);
-
- write_lock_bh(&nf_ct_cache_lock);
- /* e.g: multiple helpers are loaded */
- if (nf_ct_cache[features].use > 0) {
- DEBUGP("nf_conntrack_register_cache: already resisterd.\n");
- if ((!strncmp(nf_ct_cache[features].name, name,
- NF_CT_FEATURES_NAMELEN))
- && nf_ct_cache[features].size == size
- && nf_ct_cache[features].init_conntrack == init) {
- DEBUGP("nf_conntrack_register_cache: reusing.\n");
- nf_ct_cache[features].use++;
- ret = 0;
- } else
- ret = -EBUSY;
-
- write_unlock_bh(&nf_ct_cache_lock);
- up(&nf_ct_cache_mutex);
- return ret;
- }
- write_unlock_bh(&nf_ct_cache_lock);
-
- /*
- * The memory space for name of slab cache must be alive until
- * cache is destroyed.
- */
- cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
- if (cache_name == NULL) {
- DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
- ret = -ENOMEM;
- goto out_up_mutex;
- }
-
- if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
- >= NF_CT_FEATURES_NAMELEN) {
- printk("nf_conntrack_register_cache: name too long\n");
- ret = -EINVAL;
- goto out_free_name;
- }
-
- cachep = kmem_cache_create(cache_name, size, 0, 0,
- NULL, NULL);
- if (!cachep) {
- printk("nf_conntrack_register_cache: Can't create slab cache "
- "for the features = 0x%x\n", features);
- ret = -ENOMEM;
- goto out_free_name;
- }
-
- write_lock_bh(&nf_ct_cache_lock);
- nf_ct_cache[features].use = 1;
- nf_ct_cache[features].size = size;
- nf_ct_cache[features].init_conntrack = init;
- nf_ct_cache[features].cachep = cachep;
- nf_ct_cache[features].name = cache_name;
- write_unlock_bh(&nf_ct_cache_lock);
-
- goto out_up_mutex;
-
-out_free_name:
- kfree(cache_name);
-out_up_mutex:
- up(&nf_ct_cache_mutex);
- return ret;
+ return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
}
-/* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */
-void nf_conntrack_unregister_cache(u_int32_t features)
+static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
{
- kmem_cache_t *cachep;
- char *name;
-
- /*
- * This assures that kmem_cache_create() isn't called before destroying
- * slab cache.
- */
- DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
- down(&nf_ct_cache_mutex);
-
- write_lock_bh(&nf_ct_cache_lock);
- if (--nf_ct_cache[features].use > 0) {
- write_unlock_bh(&nf_ct_cache_lock);
- up(&nf_ct_cache_mutex);
- return;
- }
- cachep = nf_ct_cache[features].cachep;
- name = nf_ct_cache[features].name;
- nf_ct_cache[features].cachep = NULL;
- nf_ct_cache[features].name = NULL;
- nf_ct_cache[features].init_conntrack = NULL;
- nf_ct_cache[features].size = 0;
- write_unlock_bh(&nf_ct_cache_lock);
-
- synchronize_net();
-
- kmem_cache_destroy(cachep);
- kfree(name);
-
- up(&nf_ct_cache_mutex);
+ return __hash_conntrack(tuple, zone, net->ct.htable_size);
}
-int
+bool
nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int nhoff,
unsigned int dataoff,
@@ -397,288 +170,444 @@ nf_ct_get_tuple(const struct sk_buff *skb,
u_int8_t protonum,
struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
- const struct nf_conntrack_protocol *protocol)
+ const struct nf_conntrack_l4proto *l4proto)
{
- NF_CT_TUPLE_U_BLANK(tuple);
+ memset(tuple, 0, sizeof(*tuple));
tuple->src.l3num = l3num;
if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
- return 0;
+ return false;
tuple->dst.protonum = protonum;
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- return protocol->pkt_to_tuple(skb, dataoff, tuple);
+ return l4proto->pkt_to_tuple(skb, dataoff, tuple);
}
+EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
-int
+/*
+ * Build @tuple from the packet in @skb, looking up the L3 and L4 protocol
+ * handlers itself instead of taking them as arguments.
+ *
+ * Returns false when the L3 handler's get_l4proto() does not return
+ * NF_ACCEPT, otherwise the result of nf_ct_get_tuple().
+ */
+bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
+ u_int16_t l3num, struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ unsigned int protoff;
+ u_int8_t protonum;
+ int ret;
+
+ /* the protocol lookups and their use stay inside one RCU read section */
+ rcu_read_lock();
+
+ l3proto = __nf_ct_l3proto_find(l3num);
+ ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
+ if (ret != NF_ACCEPT) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ l4proto = __nf_ct_l4proto_find(l3num, protonum);
+
+ ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
+ l3proto, l4proto);
+
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
+
+bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_l3proto *l3proto,
- const struct nf_conntrack_protocol *protocol)
+ const struct nf_conntrack_l4proto *l4proto)
{
- NF_CT_TUPLE_U_BLANK(inverse);
+ memset(inverse, 0, sizeof(*inverse));
inverse->src.l3num = orig->src.l3num;
if (l3proto->invert_tuple(inverse, orig) == 0)
- return 0;
+ return false;
inverse->dst.dir = !orig->dst.dir;
inverse->dst.protonum = orig->dst.protonum;
- return protocol->invert_tuple(inverse, orig);
-}
-
-/* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
-{
- ASSERT_WRITE_LOCK(&nf_conntrack_lock);
- NF_CT_ASSERT(!timer_pending(&exp->timeout));
- list_del(&exp->list);
- NF_CT_STAT_INC(expect_delete);
- exp->master->expecting--;
- nf_conntrack_expect_put(exp);
+ return l4proto->invert_tuple(inverse, orig);
}
+EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
-static void expectation_timed_out(unsigned long ul_expect)
+static void
+clean_from_lists(struct nf_conn *ct)
{
- struct nf_conntrack_expect *exp = (void *)ul_expect;
+ pr_debug("clean_from_lists(%p)\n", ct);
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
- write_lock_bh(&nf_conntrack_lock);
- nf_ct_unlink_expect(exp);
- write_unlock_bh(&nf_conntrack_lock);
- nf_conntrack_expect_put(exp);
+ /* Destroy all pending expectations */
+ nf_ct_remove_expectations(ct);
}
-struct nf_conntrack_expect *
-__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
- struct nf_conntrack_expect *i;
-
- list_for_each_entry(i, &nf_conntrack_expect_list, list) {
- if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
- atomic_inc(&i->use);
- return i;
- }
- }
- return NULL;
-}
+ struct ct_pcpu *pcpu;
-/* Just find a expectation corresponding to a tuple. */
-struct nf_conntrack_expect *
-nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_expect *i;
-
- read_lock_bh(&nf_conntrack_lock);
- i = __nf_conntrack_expect_find(tuple);
- read_unlock_bh(&nf_conntrack_lock);
-
- return i;
-}
-
-/* If an expectation for this connection is found, it gets delete from
- * global list then returned. */
-static struct nf_conntrack_expect *
-find_expectation(const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_expect *i;
-
- list_for_each_entry(i, &nf_conntrack_expect_list, list) {
- /* If master is not in hash table yet (ie. packet hasn't left
- this machine yet), how can other end know about expected?
- Hence these are not the droids you are looking for (if
- master ct never got confirmed, we'd hold a reference to it
- and weird things would happen to future packets). */
- if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
- && nf_ct_is_confirmed(i->master)) {
- if (i->flags & NF_CT_EXPECT_PERMANENT) {
- atomic_inc(&i->use);
- return i;
- } else if (del_timer(&i->timeout)) {
- nf_ct_unlink_expect(i);
- return i;
- }
- }
- }
- return NULL;
+ /* add this conntrack to the (per cpu) dying list */
+ ct->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->dying);
+ spin_unlock(&pcpu->lock);
}
-/* delete all expectations for this conntrack */
-void nf_ct_remove_expectations(struct nf_conn *ct)
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
- struct nf_conntrack_expect *i, *tmp;
+ struct ct_pcpu *pcpu;
- /* Optimization: most connection never expect any others. */
- if (ct->expecting == 0)
- return;
+ /* add this conntrack to the (per cpu) unconfirmed list */
+ ct->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
- list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
- if (i->master == ct && del_timer(&i->timeout)) {
- nf_ct_unlink_expect(i);
- nf_conntrack_expect_put(i);
- }
- }
+ spin_lock(&pcpu->lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->unconfirmed);
+ spin_unlock(&pcpu->lock);
}
-static void
-clean_from_lists(struct nf_conn *ct)
+/* must be called with local_bh_disable */
+static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
- unsigned int ho, hr;
-
- DEBUGP("clean_from_lists(%p)\n", ct);
- ASSERT_WRITE_LOCK(&nf_conntrack_lock);
+ struct ct_pcpu *pcpu;
- ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
- LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+ /* We overload first tuple to link into unconfirmed or dying list.*/
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
- /* Destroy all pending expectations */
- nf_ct_remove_expectations(ct);
+ spin_lock(&pcpu->lock);
+ BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ spin_unlock(&pcpu->lock);
}
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- struct nf_conntrack_l3proto *l3proto;
- struct nf_conntrack_protocol *proto;
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_l4proto *l4proto;
- DEBUGP("destroy_conntrack(%p)\n", ct);
+ pr_debug("destroy_conntrack(%p)\n", ct);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
- nf_conntrack_event(IPCT_DESTROY, ct);
- set_bit(IPS_DYING_BIT, &ct->status);
-
- /* To make sure we don't get any weird locking issues here:
- * destroy_conntrack() MUST NOT be called with a write lock
- * to nf_conntrack_lock!!! -HW */
- l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
- if (l3proto && l3proto->destroy)
- l3proto->destroy(ct);
-
- proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
- if (proto && proto->destroy)
- proto->destroy(ct);
+ rcu_read_lock();
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto && l4proto->destroy)
+ l4proto->destroy(ct);
- if (nf_conntrack_destroyed)
- nf_conntrack_destroyed(ct);
+ rcu_read_unlock();
- write_lock_bh(&nf_conntrack_lock);
+ local_bh_disable();
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
- * too. */
+ * too.
+ */
nf_ct_remove_expectations(ct);
- /* We overload first tuple to link into unconfirmed list. */
- if (!nf_ct_is_confirmed(ct)) {
- BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
- list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- }
+ nf_ct_del_from_dying_or_unconfirmed_list(ct);
- NF_CT_STAT_INC(delete);
- write_unlock_bh(&nf_conntrack_lock);
+ NF_CT_STAT_INC(net, delete);
+ local_bh_enable();
if (ct->master)
nf_ct_put(ct->master);
- DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+ pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
nf_conntrack_free(ct);
}
-static void death_by_timeout(unsigned long ul_conntrack)
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
- struct nf_conn *ct = (void *)ul_conntrack;
+ struct net *net = nf_ct_net(ct);
+ unsigned int hash, reply_hash;
+ u16 zone = nf_ct_zone(ct);
+ unsigned int sequence;
+
+ nf_ct_helper_destroy(ct);
+
+ local_bh_disable();
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
- write_lock_bh(&nf_conntrack_lock);
- /* Inside lock so preempt is disabled on module removal path.
- * Otherwise we can get spurious warnings. */
- NF_CT_STAT_INC(delete_list);
clean_from_lists(ct);
- write_unlock_bh(&nf_conntrack_lock);
+ nf_conntrack_double_unlock(hash, reply_hash);
+
+ nf_ct_add_to_dying_list(ct);
+
+ NF_CT_STAT_INC(net, delete_list);
+ local_bh_enable();
+}
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
+
+ BUG_ON(ecache == NULL);
+
+ if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+ /* bad luck, let's retry again */
+ ecache->timeout.expires = jiffies +
+ (prandom_u32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ecache->timeout);
+ return;
+ }
+ /* we've got the event delivered, now it's dying */
+ set_bit(IPS_DYING_BIT, &ct->status);
nf_ct_put(ct);
}
-static inline int
-conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conn *ignored_conntrack)
+static void nf_ct_dying_timeout(struct nf_conn *ct)
{
- ASSERT_READ_LOCK(&nf_conntrack_lock);
- return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
- && nf_ct_tuple_equal(tuple, &i->tuple);
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
+
+ BUG_ON(ecache == NULL);
+
+ /* set a new timer to retry event delivery */
+ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct);
+ ecache->timeout.expires = jiffies +
+ (prandom_u32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ecache->timeout);
}
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
- const struct nf_conn *ignored_conntrack)
+bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
+{
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_to_ns(ktime_get_real());
+
+ if (!nf_ct_is_dying(ct) &&
+ unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct,
+ portid, report) < 0)) {
+ /* destroy event was not delivered */
+ nf_ct_delete_from_lists(ct);
+ nf_ct_dying_timeout(ct);
+ return false;
+ }
+ set_bit(IPS_DYING_BIT, &ct->status);
+ nf_ct_delete_from_lists(ct);
+ nf_ct_put(ct);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
+}
+
+static inline bool
+nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
+ const struct nf_conntrack_tuple *tuple,
+ u16 zone)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ /* A conntrack can be recreated with the equal tuple,
+ * so we need to check that the conntrack is confirmed
+ */
+ return nf_ct_tuple_equal(tuple, &h->tuple) &&
+ nf_ct_zone(ct) == zone &&
+ nf_ct_is_confirmed(ct);
+}
+
+/*
+ * Warning :
+ * - Caller must take a reference on returned object
+ * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
+ */
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
- unsigned int hash = hash_conntrack(tuple);
+ struct hlist_nulls_node *n;
+ unsigned int bucket = hash_bucket(hash, net);
- ASSERT_READ_LOCK(&nf_conntrack_lock);
- list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
- if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
- NF_CT_STAT_INC(found);
+ /* Disable BHs the entire time since we normally need to disable them
+ * at least once for the stats anyway.
+ */
+ local_bh_disable();
+begin:
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
+ if (nf_ct_key_equal(h, tuple, zone)) {
+ NF_CT_STAT_INC(net, found);
+ local_bh_enable();
return h;
}
- NF_CT_STAT_INC(searched);
+ NF_CT_STAT_INC(net, searched);
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(n) != bucket) {
+ NF_CT_STAT_INC(net, search_restart);
+ goto begin;
}
+ local_bh_enable();
return NULL;
}
/* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
- const struct nf_conn *ignored_conntrack)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
- read_lock_bh(&nf_conntrack_lock);
- h = __nf_conntrack_find(tuple, ignored_conntrack);
- if (h)
- atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
- read_unlock_bh(&nf_conntrack_lock);
+ rcu_read_lock();
+begin:
+ h = ____nf_conntrack_find(net, zone, tuple, hash);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (unlikely(nf_ct_is_dying(ct) ||
+ !atomic_inc_not_zero(&ct->ct_general.use)))
+ h = NULL;
+ else {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
+ nf_ct_put(ct);
+ goto begin;
+ }
+ }
+ }
+ rcu_read_unlock();
return h;
}
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
+
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
unsigned int hash,
- unsigned int repl_hash)
+ unsigned int reply_hash)
+{
+ struct net *net = nf_ct_net(ct);
+
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &net->ct.hash[hash]);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+ &net->ct.hash[reply_hash]);
+}
+
+int
+nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
- ct->id = ++nf_conntrack_next_id;
- list_prepend(&nf_conntrack_hash[hash],
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- list_prepend(&nf_conntrack_hash[repl_hash],
- &ct->tuplehash[IP_CT_DIR_REPLY].list);
+ struct net *net = nf_ct_net(ct);
+ unsigned int hash, reply_hash;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ u16 zone;
+ unsigned int sequence;
+
+ zone = nf_ct_zone(ct);
+
+ local_bh_disable();
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
+ /* See if there's one in the list already, including reverse */
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+
+ add_timer(&ct->timeout);
+ smp_wmb();
+ /* The caller holds a reference to this object */
+ atomic_set(&ct->ct_general.use, 2);
+ __nf_conntrack_hash_insert(ct, hash, reply_hash);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert);
+ local_bh_enable();
+ return 0;
+
+out:
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert_failed);
+ local_bh_enable();
+ return -EEXIST;
}
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-void nf_conntrack_hash_insert(struct nf_conn *ct)
+/* deletion from this larval template list happens via nf_ct_put() */
+void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
{
- unsigned int hash, repl_hash;
+ struct ct_pcpu *pcpu;
- hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ nf_conntrack_get(&tmpl->ct_general);
- write_lock_bh(&nf_conntrack_lock);
- __nf_conntrack_hash_insert(ct, hash, repl_hash);
- write_unlock_bh(&nf_conntrack_lock);
+ /* add this conntrack to the (per cpu) tmpl list */
+ local_bh_disable();
+ tmpl->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
+
+ spin_lock(&pcpu->lock);
+ /* Overload tuple linked list to put us in template list. */
+ hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->tmpl);
+ spin_unlock_bh(&pcpu->lock);
}
+EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
/* Confirm a connection given skb; places it in hash table */
int
-__nf_conntrack_confirm(struct sk_buff **pskb)
+__nf_conntrack_confirm(struct sk_buff *skb)
{
- unsigned int hash, repl_hash;
+ unsigned int hash, reply_hash;
+ struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
+ struct nf_conn_help *help;
+ struct nf_conn_tstamp *tstamp;
+ struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
+ struct net *net;
+ u16 zone;
+ unsigned int sequence;
- ct = nf_ct_get(*pskb, &ctinfo);
+ ct = nf_ct_get(skb, &ctinfo);
+ net = nf_ct_net(ct);
/* ipt_REJECT uses nf_conntrack_attach to attach related
ICMP/TCP RST packets in other direction. Actual packet
@@ -687,61 +616,98 @@ __nf_conntrack_confirm(struct sk_buff **pskb)
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return NF_ACCEPT;
- hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ zone = nf_ct_zone(ct);
+ local_bh_disable();
+
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ /* reuse the hash saved before */
+ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+ hash = hash_bucket(hash, net);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
- connections for unconfirmed conns. But packet copies and
- REJECT will give spurious warnings here. */
+ * connections for unconfirmed conns. But packet copies and
+ * REJECT will give spurious warnings here.
+ */
/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
- /* No external references means noone else could have
- confirmed us. */
+ /* No external references means no one else could have
+ * confirmed us.
+ */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
- DEBUGP("Confirming conntrack %p\n", ct);
-
- write_lock_bh(&nf_conntrack_lock);
+ pr_debug("Confirming conntrack %p\n", ct);
+ /* We have to check the DYING flag inside the lock to prevent
+ a race against nf_ct_get_next_corpse() possibly called from
+ user context, else we insert an already 'dead' hash, blocking
+ further use of that particular connection -JM */
+
+ if (unlikely(nf_ct_is_dying(ct))) {
+ nf_conntrack_double_unlock(hash, reply_hash);
+ local_bh_enable();
+ return NF_ACCEPT;
+ }
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- if (!LIST_FIND(&nf_conntrack_hash[hash],
- conntrack_tuple_cmp,
- struct nf_conntrack_tuple_hash *,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
- && !LIST_FIND(&nf_conntrack_hash[repl_hash],
- conntrack_tuple_cmp,
- struct nf_conntrack_tuple_hash *,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
- /* Remove from unconfirmed list */
- list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
-
- __nf_conntrack_hash_insert(ct, hash, repl_hash);
- /* Timer relative to confirmation time, not original
- setting time, otherwise we'd get timer wrap in
- weird delay cases. */
- ct->timeout.expires += jiffies;
- add_timer(&ct->timeout);
- atomic_inc(&ct->ct_general.use);
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
- NF_CT_STAT_INC(insert);
- write_unlock_bh(&nf_conntrack_lock);
- if (ct->helper)
- nf_conntrack_event_cache(IPCT_HELPER, *pskb);
-#ifdef CONFIG_NF_NAT_NEEDED
- if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
- test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
- nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
-#endif
- nf_conntrack_event_cache(master_ct(ct) ?
- IPCT_RELATED : IPCT_NEW, *pskb);
- return NF_ACCEPT;
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+
+ nf_ct_del_from_dying_or_unconfirmed_list(ct);
+
+ /* Timer relative to confirmation time, not original
+ setting time, otherwise we'd get timer wrap in
+ weird delay cases. */
+ ct->timeout.expires += jiffies;
+ add_timer(&ct->timeout);
+ atomic_inc(&ct->ct_general.use);
+ ct->status |= IPS_CONFIRMED;
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp(skb);
+
+ tstamp->start = ktime_to_ns(skb->tstamp);
}
+ /* Since the lookup is lockless, hash insertion must be done after
+ * starting the timer and setting the CONFIRMED bit. The RCU barriers
+ * guarantee that no other CPU can find the conntrack before the above
+ * stores are visible.
+ */
+ __nf_conntrack_hash_insert(ct, hash, reply_hash);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert);
+ local_bh_enable();
- NF_CT_STAT_INC(insert_failed);
- write_unlock_bh(&nf_conntrack_lock);
+ help = nfct_help(ct);
+ if (help && help->helper)
+ nf_conntrack_event_cache(IPCT_HELPER, ct);
+
+ nf_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, ct);
+ return NF_ACCEPT;
+
+out:
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert_failed);
+ local_bh_enable();
return NF_DROP;
}
+EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
/* Returns true if a connection correspondings to the tuple (required
for NAT). */
@@ -749,275 +715,348 @@ int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
{
+ struct net *net = nf_ct_net(ignored_conntrack);
struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *ct;
+ u16 zone = nf_ct_zone(ignored_conntrack);
+ unsigned int hash = hash_conntrack(net, zone, tuple);
- read_lock_bh(&nf_conntrack_lock);
- h = __nf_conntrack_find(tuple, ignored_conntrack);
- read_unlock_bh(&nf_conntrack_lock);
+ /* Disable BHs the entire time since we need to disable them at
+ * least once for the stats anyway.
+ */
+ rcu_read_lock_bh();
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (ct != ignored_conntrack &&
+ nf_ct_tuple_equal(tuple, &h->tuple) &&
+ nf_ct_zone(ct) == zone) {
+ NF_CT_STAT_INC(net, found);
+ rcu_read_unlock_bh();
+ return 1;
+ }
+ NF_CT_STAT_INC(net, searched);
+ }
+ rcu_read_unlock_bh();
- return h != NULL;
+ return 0;
}
+EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
+
+#define NF_CT_EVICTION_RANGE 8
/* There's a small race here where we may free a just-assured
connection. Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
-{
- return !(test_bit(IPS_ASSURED_BIT,
- &nf_ct_tuplehash_to_ctrack(i)->status));
-}
-
-static int early_drop(struct list_head *chain)
+static noinline int early_drop(struct net *net, unsigned int _hash)
{
- /* Traverse backwards: gives us oldest, which is roughly LRU */
+ /* Use oldest entry, which is roughly LRU */
struct nf_conntrack_tuple_hash *h;
- struct nf_conn *ct = NULL;
+ struct nf_conn *ct = NULL, *tmp;
+ struct hlist_nulls_node *n;
+ unsigned int i = 0, cnt = 0;
int dropped = 0;
+ unsigned int hash, sequence;
+ spinlock_t *lockp;
+
+ local_bh_disable();
+restart:
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_bucket(_hash, net);
+ for (; i < net->ct.htable_size; i++) {
+ lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+ spin_lock(lockp);
+ if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ spin_unlock(lockp);
+ goto restart;
+ }
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
+ hnnode) {
+ tmp = nf_ct_tuplehash_to_ctrack(h);
+ if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+ !nf_ct_is_dying(tmp) &&
+ atomic_inc_not_zero(&tmp->ct_general.use)) {
+ ct = tmp;
+ break;
+ }
+ cnt++;
+ }
+
+ hash = (hash + 1) % net->ct.htable_size;
+ spin_unlock(lockp);
+
+ if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ break;
- read_lock_bh(&nf_conntrack_lock);
- h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
- if (h) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- atomic_inc(&ct->ct_general.use);
}
- read_unlock_bh(&nf_conntrack_lock);
+ local_bh_enable();
if (!ct)
return dropped;
if (del_timer(&ct->timeout)) {
- death_by_timeout((unsigned long)ct);
- dropped = 1;
- NF_CT_STAT_INC(early_drop);
+ if (nf_ct_delete(ct, 0, 0)) {
+ dropped = 1;
+ NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ }
}
nf_ct_put(ct);
return dropped;
}
-static inline int helper_cmp(const struct nf_conntrack_helper *i,
- const struct nf_conntrack_tuple *rtuple)
+void init_nf_conntrack_hash_rnd(void)
{
- return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
-}
-
-static struct nf_conntrack_helper *
-__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
-{
- return LIST_FIND(&helpers, helper_cmp,
- struct nf_conntrack_helper *,
- tuple);
-}
-
-struct nf_conntrack_helper *
-nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_helper *helper;
-
- /* need nf_conntrack_lock to assure that helper exists until
- * try_module_get() is called */
- read_lock_bh(&nf_conntrack_lock);
-
- helper = __nf_ct_helper_find(tuple);
- if (helper) {
- /* need to increase module usage count to assure helper will
- * not go away while the caller is e.g. busy putting a
- * conntrack in the hash that uses the helper */
- if (!try_module_get(helper->me))
- helper = NULL;
- }
-
- read_unlock_bh(&nf_conntrack_lock);
+ unsigned int rand;
- return helper;
-}
-
-void nf_ct_helper_put(struct nf_conntrack_helper *helper)
-{
- module_put(helper->me);
+ /*
+ * Why not initialize nf_conntrack_rnd in a "init()" function ?
+ * Because there isn't enough entropy when system initializing,
+ * and we initialize it as late as possible.
+ */
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
}
static struct nf_conn *
-__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+__nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
- const struct nf_conntrack_l3proto *l3proto)
+ gfp_t gfp, u32 hash)
{
- struct nf_conn *conntrack = NULL;
- u_int32_t features = 0;
+ struct nf_conn *ct;
- if (!nf_conntrack_hash_rnd_initted) {
- get_random_bytes(&nf_conntrack_hash_rnd, 4);
- nf_conntrack_hash_rnd_initted = 1;
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ init_nf_conntrack_hash_rnd();
+ /* recompute the hash as nf_conntrack_hash_rnd is initialized */
+ hash = hash_conntrack_raw(orig, zone);
}
- if (nf_conntrack_max
- && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
- unsigned int hash = hash_conntrack(orig);
- /* Try dropping from this hash chain. */
- if (!early_drop(&nf_conntrack_hash[hash])) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "nf_conntrack: table full, dropping"
- " packet.\n");
+ /* We don't want any race condition at early drop stage */
+ atomic_inc(&net->ct.count);
+
+ if (nf_conntrack_max &&
+ unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+ if (!early_drop(net, hash)) {
+ atomic_dec(&net->ct.count);
+ net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
return ERR_PTR(-ENOMEM);
}
}
- /* find features needed by this conntrack. */
- features = l3proto->get_features(orig);
- read_lock_bh(&nf_conntrack_lock);
- if (__nf_ct_helper_find(repl) != NULL)
- features |= NF_CT_F_HELP;
- read_unlock_bh(&nf_conntrack_lock);
-
- DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
-
- read_lock_bh(&nf_ct_cache_lock);
-
- if (!nf_ct_cache[features].use) {
- DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
- features);
- goto out;
- }
-
- conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
- if (conntrack == NULL) {
- DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
- goto out;
- }
-
- memset(conntrack, 0, nf_ct_cache[features].size);
- conntrack->features = features;
- if (nf_ct_cache[features].init_conntrack &&
- nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
- DEBUGP("nf_conntrack_alloc: failed to init\n");
- kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
- conntrack = NULL;
- goto out;
+ /*
+ * Do not use kmem_cache_zalloc(), as this cache uses
+ * SLAB_DESTROY_BY_RCU.
+ */
+ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ if (ct == NULL) {
+ atomic_dec(&net->ct.count);
+ return ERR_PTR(-ENOMEM);
}
-
- atomic_set(&conntrack->ct_general.use, 1);
- conntrack->ct_general.destroy = destroy_conntrack;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+ /*
+ * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
+ * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
+ */
+ memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
+ offsetof(struct nf_conn, proto) -
+ offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+ spin_lock_init(&ct->lock);
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+ /* save hash for reusing when confirming */
+ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
/* Don't set timer yet: wait for confirmation */
- init_timer(&conntrack->timeout);
- conntrack->timeout.data = (unsigned long)conntrack;
- conntrack->timeout.function = death_by_timeout;
+ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
+ write_pnet(&ct->ct_net, net);
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ if (zone) {
+ struct nf_conntrack_zone *nf_ct_zone;
+
+ nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
+ if (!nf_ct_zone)
+ goto out_free;
+ nf_ct_zone->id = zone;
+ }
+#endif
+ /* Because we use RCU lookups, we set ct_general.use to zero before
+ * this is inserted in any list.
+ */
+ atomic_set(&ct->ct_general.use, 0);
+ return ct;
- atomic_inc(&nf_conntrack_count);
-out:
- read_unlock_bh(&nf_ct_cache_lock);
- return conntrack;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+out_free:
+ atomic_dec(&net->ct.count);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ return ERR_PTR(-ENOMEM);
+#endif
}
-struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_tuple *repl)
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp)
{
- struct nf_conntrack_l3proto *l3proto;
-
- l3proto = __nf_ct_l3proto_find(orig->src.l3num);
- return __nf_conntrack_alloc(orig, repl, l3proto);
+ return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
+EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
-void nf_conntrack_free(struct nf_conn *conntrack)
+void nf_conntrack_free(struct nf_conn *ct)
{
- u_int32_t features = conntrack->features;
- NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
- DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
- conntrack);
- kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
- atomic_dec(&nf_conntrack_count);
+ struct net *net = nf_ct_net(ct);
+
+ /* A freed object has refcnt == 0, that's
+ * the golden rule for SLAB_DESTROY_BY_RCU
+ */
+ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
+
+ nf_ct_ext_destroy(ct);
+ nf_ct_ext_free(ct);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ smp_mb__before_atomic();
+ atomic_dec(&net->ct.count);
}
+EXPORT_SYMBOL_GPL(nf_conntrack_free);
+
/* Allocate a new conntrack: we return -ENOMEM if classification
failed due to stress. Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
-init_conntrack(const struct nf_conntrack_tuple *tuple,
+init_conntrack(struct net *net, struct nf_conn *tmpl,
+ const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_l3proto *l3proto,
- struct nf_conntrack_protocol *protocol,
+ struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, u32 hash)
{
- struct nf_conn *conntrack;
+ struct nf_conn *ct;
+ struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
- struct nf_conntrack_expect *exp;
+ struct nf_conntrack_ecache *ecache;
+ struct nf_conntrack_expect *exp = NULL;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ struct nf_conn_timeout *timeout_ext;
+ unsigned int *timeouts;
- if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
- DEBUGP("Can't invert tuple.\n");
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+ pr_debug("Can't invert tuple.\n");
return NULL;
}
- conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
- if (conntrack == NULL || IS_ERR(conntrack)) {
- DEBUGP("Can't allocate conntrack.\n");
- return (struct nf_conntrack_tuple_hash *)conntrack;
+ ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+ hash);
+ if (IS_ERR(ct))
+ return (struct nf_conntrack_tuple_hash *)ct;
+
+ if (tmpl && nfct_synproxy(tmpl)) {
+ nfct_seqadj_ext_add(ct);
+ nfct_synproxy_ext_add(ct);
}
- if (!protocol->new(conntrack, skb, dataoff)) {
- nf_conntrack_free(conntrack);
- DEBUGP("init conntrack: can't track with proto module\n");
+ timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
+ if (timeout_ext)
+ timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
+ else
+ timeouts = l4proto->get_timeouts(net);
+
+ if (!l4proto->new(ct, skb, dataoff, timeouts)) {
+ nf_conntrack_free(ct);
+ pr_debug("init conntrack: can't track with proto module\n");
return NULL;
}
- write_lock_bh(&nf_conntrack_lock);
- exp = find_expectation(tuple);
+ if (timeout_ext)
+ nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);
+
+ nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+ nf_ct_labels_ext_add(ct);
+
+ ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
+ nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC);
+
+ local_bh_disable();
+ if (net->ct.expect_count) {
+ spin_lock(&nf_conntrack_expect_lock);
+ exp = nf_ct_find_expectation(net, zone, tuple);
+ if (exp) {
+ pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ ct, exp);
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ __set_bit(IPS_EXPECTED_BIT, &ct->status);
+ /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
+ ct->master = exp->master;
+ if (exp->helper) {
+ help = nf_ct_helper_ext_add(ct, exp->helper,
+ GFP_ATOMIC);
+ if (help)
+ rcu_assign_pointer(help->helper, exp->helper);
+ }
- if (exp) {
- DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
- conntrack, exp);
- /* Welcome, Mr. Bond. We've been expecting you... */
- __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
- conntrack->master = exp->master;
#ifdef CONFIG_NF_CONNTRACK_MARK
- conntrack->mark = exp->master->mark;
+ ct->mark = exp->master->mark;
#endif
- nf_conntrack_get(&conntrack->master->ct_general);
- NF_CT_STAT_INC(expect_new);
- } else {
- conntrack->helper = __nf_ct_helper_find(&repl_tuple);
-
- NF_CT_STAT_INC(new);
- }
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ ct->secmark = exp->master->secmark;
+#endif
+ NF_CT_STAT_INC(net, expect_new);
+ }
+ spin_unlock(&nf_conntrack_expect_lock);
+ }
+ if (!exp) {
+ __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
+ NF_CT_STAT_INC(net, new);
+ }
- /* Overload tuple linked list to put us in unconfirmed list. */
- list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+ /* Now it is inserted into the unconfirmed list, bump refcount */
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_add_to_unconfirmed_list(ct);
- write_unlock_bh(&nf_conntrack_lock);
+ local_bh_enable();
if (exp) {
if (exp->expectfn)
- exp->expectfn(conntrack, exp);
- nf_conntrack_expect_put(exp);
+ exp->expectfn(ct, exp);
+ nf_ct_expect_put(exp);
}
- return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
+ return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
-resolve_normal_ct(struct sk_buff *skb,
+resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
struct nf_conntrack_l3proto *l3proto,
- struct nf_conntrack_protocol *proto,
+ struct nf_conntrack_l4proto *l4proto,
int *set_reply,
enum ip_conntrack_info *ctinfo)
{
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ u32 hash;
- if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, &tuple, l3proto,
- proto)) {
- DEBUGP("resolve_normal_ct: Can't get tuple\n");
+ l4proto)) {
+ pr_debug("resolve_normal_ct: Can't get tuple\n");
return NULL;
}
/* look for tuple match */
- h = nf_conntrack_find_get(&tuple, NULL);
+ hash = hash_conntrack_raw(&tuple, zone);
+ h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
- h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
+ h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+ skb, dataoff, hash);
if (!h)
return NULL;
if (IS_ERR(h))
@@ -1027,19 +1066,20 @@ resolve_normal_ct(struct sk_buff *skb,
/* It exists; we have (non-exclusive) reference. */
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
- *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+ *ctinfo = IP_CT_ESTABLISHED_REPLY;
/* Please set reply bit if this packet OK */
*set_reply = 1;
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
+ pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
+ pr_debug("nf_conntrack_in: related packet for %p\n",
+ ct);
*ctinfo = IP_CT_RELATED;
} else {
- DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
+ pr_debug("nf_conntrack_in: new packet for %p\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
@@ -1050,328 +1090,149 @@ resolve_normal_ct(struct sk_buff *skb,
}
unsigned int
-nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
+nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
+ struct sk_buff *skb)
{
- struct nf_conn *ct;
+ struct nf_conn *ct, *tmpl = NULL;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_l3proto *l3proto;
- struct nf_conntrack_protocol *proto;
+ struct nf_conntrack_l4proto *l4proto;
+ unsigned int *timeouts;
unsigned int dataoff;
u_int8_t protonum;
int set_reply = 0;
int ret;
- /* Previously seen (loopback or untracked)? Ignore. */
- if ((*pskb)->nfct) {
- NF_CT_STAT_INC(ignore);
- return NF_ACCEPT;
+ if (skb->nfct) {
+ /* Previously seen (loopback or untracked)? Ignore. */
+ tmpl = (struct nf_conn *)skb->nfct;
+ if (!nf_ct_is_template(tmpl)) {
+ NF_CT_STAT_INC_ATOMIC(net, ignore);
+ return NF_ACCEPT;
+ }
+ skb->nfct = NULL;
}
- l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
- if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
- DEBUGP("not prepared to track yet or error occured\n");
- return -ret;
+ /* rcu_read_lock()ed by nf_hook_slow */
+ l3proto = __nf_ct_l3proto_find(pf);
+ ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
+ &dataoff, &protonum);
+ if (ret <= 0) {
+ pr_debug("not prepared to track yet or error occurred\n");
+ NF_CT_STAT_INC_ATOMIC(net, error);
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = -ret;
+ goto out;
}
- proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
+ l4proto = __nf_ct_l4proto_find(pf, protonum);
/* It may be an special packet, error, unclean...
* inverse of the return code tells to the netfilter
* core what to do with the packet. */
- if (proto->error != NULL &&
- (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
- NF_CT_STAT_INC(error);
- NF_CT_STAT_INC(invalid);
- return -ret;
+ if (l4proto->error != NULL) {
+ ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
+ pf, hooknum);
+ if (ret <= 0) {
+ NF_CT_STAT_INC_ATOMIC(net, error);
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = -ret;
+ goto out;
+ }
+ /* ICMP[v6] protocol trackers may assign one conntrack. */
+ if (skb->nfct)
+ goto out;
}
- ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
- &set_reply, &ctinfo);
+ ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
+ l3proto, l4proto, &set_reply, &ctinfo);
if (!ct) {
/* Not valid part of a connection */
- NF_CT_STAT_INC(invalid);
- return NF_ACCEPT;
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = NF_ACCEPT;
+ goto out;
}
if (IS_ERR(ct)) {
/* Too stressed to deal. */
- NF_CT_STAT_INC(drop);
- return NF_DROP;
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ ret = NF_DROP;
+ goto out;
}
- NF_CT_ASSERT((*pskb)->nfct);
+ NF_CT_ASSERT(skb->nfct);
- ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
- if (ret < 0) {
+ /* Decide what timeout policy we want to apply to this flow. */
+ timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
+
+ ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
+ if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
- DEBUGP("nf_conntrack_in: Can't track with proto module\n");
- nf_conntrack_put((*pskb)->nfct);
- (*pskb)->nfct = NULL;
- NF_CT_STAT_INC(invalid);
- return -ret;
+ pr_debug("nf_conntrack_in: Can't track with proto module\n");
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ if (ret == -NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ ret = -ret;
+ goto out;
}
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
- nf_conntrack_event_cache(IPCT_STATUS, *pskb);
-
- return ret;
-}
-
-int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
- const struct nf_conntrack_tuple *orig)
-{
- return nf_ct_invert_tuple(inverse, orig,
- __nf_ct_l3proto_find(orig->src.l3num),
- __nf_ct_proto_find(orig->src.l3num,
- orig->dst.protonum));
-}
-
-/* Would two expected things clash? */
-static inline int expect_clash(const struct nf_conntrack_expect *a,
- const struct nf_conntrack_expect *b)
-{
- /* Part covered by intersection of masks must be unequal,
- otherwise they clash */
- struct nf_conntrack_tuple intersect_mask;
- int count;
-
- intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
- intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
- intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
- intersect_mask.dst.protonum = a->mask.dst.protonum
- & b->mask.dst.protonum;
-
- for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
- intersect_mask.src.u3.all[count] =
- a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
- }
-
- for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
- intersect_mask.dst.u3.all[count] =
- a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
- }
-
- return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
-}
-
-static inline int expect_matches(const struct nf_conntrack_expect *a,
- const struct nf_conntrack_expect *b)
-{
- return a->master == b->master
- && nf_ct_tuple_equal(&a->tuple, &b->tuple)
- && nf_ct_tuple_equal(&a->mask, &b->mask);
-}
-
-/* Generally a bad idea to call this: could have matched already. */
-void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
-{
- struct nf_conntrack_expect *i;
-
- write_lock_bh(&nf_conntrack_lock);
- /* choose the the oldest expectation to evict */
- list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
- if (expect_matches(i, exp) && del_timer(&i->timeout)) {
- nf_ct_unlink_expect(i);
- write_unlock_bh(&nf_conntrack_lock);
- nf_conntrack_expect_put(i);
- return;
- }
- }
- write_unlock_bh(&nf_conntrack_lock);
-}
-
-/* We don't increase the master conntrack refcount for non-fulfilled
- * conntracks. During the conntrack destruction, the expectations are
- * always killed before the conntrack itself */
-struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
-{
- struct nf_conntrack_expect *new;
-
- new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
- if (!new) {
- DEBUGP("expect_related: OOM allocating expect\n");
- return NULL;
- }
- new->master = me;
- atomic_set(&new->use, 1);
- return new;
-}
-
-void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
-{
- if (atomic_dec_and_test(&exp->use))
- kmem_cache_free(nf_conntrack_expect_cachep, exp);
-}
-
-static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
-{
- atomic_inc(&exp->use);
- exp->master->expecting++;
- list_add(&exp->list, &nf_conntrack_expect_list);
-
- init_timer(&exp->timeout);
- exp->timeout.data = (unsigned long)exp;
- exp->timeout.function = expectation_timed_out;
- exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
- add_timer(&exp->timeout);
-
- exp->id = ++nf_conntrack_expect_next_id;
- atomic_inc(&exp->use);
- NF_CT_STAT_INC(expect_create);
-}
-
-/* Race with expectations being used means we could have none to find; OK. */
-static void evict_oldest_expect(struct nf_conn *master)
-{
- struct nf_conntrack_expect *i;
-
- list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
- if (i->master == master) {
- if (del_timer(&i->timeout)) {
- nf_ct_unlink_expect(i);
- nf_conntrack_expect_put(i);
- }
- break;
- }
+ nf_conntrack_event_cache(IPCT_REPLY, ct);
+out:
+ if (tmpl) {
+ /* Special case: we have to repeat this hook, assign the
+ * template again to this packet. We assume that this packet
+ * has no conntrack assigned. This is used by nf_ct_tcp. */
+ if (ret == NF_REPEAT)
+ skb->nfct = (struct nf_conntrack *)tmpl;
+ else
+ nf_ct_put(tmpl);
}
-}
-static inline int refresh_timer(struct nf_conntrack_expect *i)
-{
- if (!del_timer(&i->timeout))
- return 0;
-
- i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
- add_timer(&i->timeout);
- return 1;
+ return ret;
}
+EXPORT_SYMBOL_GPL(nf_conntrack_in);
-int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
+bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
+ const struct nf_conntrack_tuple *orig)
{
- struct nf_conntrack_expect *i;
- struct nf_conn *master = expect->master;
- int ret;
+ bool ret;
- DEBUGP("nf_conntrack_expect_related %p\n", related_to);
- DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
- DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
-
- write_lock_bh(&nf_conntrack_lock);
- list_for_each_entry(i, &nf_conntrack_expect_list, list) {
- if (expect_matches(i, expect)) {
- /* Refresh timer: if it's dying, ignore.. */
- if (refresh_timer(i)) {
- ret = 0;
- goto out;
- }
- } else if (expect_clash(i, expect)) {
- ret = -EBUSY;
- goto out;
- }
- }
- /* Will be over limit? */
- if (master->helper->max_expected &&
- master->expecting >= master->helper->max_expected)
- evict_oldest_expect(master);
-
- nf_conntrack_expect_insert(expect);
- nf_conntrack_expect_event(IPEXP_NEW, expect);
- ret = 0;
-out:
- write_unlock_bh(&nf_conntrack_lock);
+ rcu_read_lock();
+ ret = nf_ct_invert_tuple(inverse, orig,
+ __nf_ct_l3proto_find(orig->src.l3num),
+ __nf_ct_l4proto_find(orig->src.l3num,
+ orig->dst.protonum));
+ rcu_read_unlock();
return ret;
}
+EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
/* Alter reply tuple (maybe alter helper). This is for NAT, and is
implicitly racy: see __nf_conntrack_confirm */
-void nf_conntrack_alter_reply(struct nf_conn *conntrack,
+void nf_conntrack_alter_reply(struct nf_conn *ct,
const struct nf_conntrack_tuple *newreply)
{
- write_lock_bh(&nf_conntrack_lock);
- /* Should be unconfirmed, so not in hash table yet */
- NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
-
- DEBUGP("Altering reply tuple of %p to ", conntrack);
- NF_CT_DUMP_TUPLE(newreply);
-
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (!conntrack->master && conntrack->expecting == 0)
- conntrack->helper = __nf_ct_helper_find(newreply);
- write_unlock_bh(&nf_conntrack_lock);
-}
-
-int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
-{
- int ret;
- BUG_ON(me->timeout == 0);
-
- ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
- sizeof(struct nf_conn)
- + sizeof(union nf_conntrack_help)
- + __alignof__(union nf_conntrack_help),
- init_conntrack_for_helper);
- if (ret < 0) {
- printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n");
- return ret;
- }
- write_lock_bh(&nf_conntrack_lock);
- list_prepend(&helpers, me);
- write_unlock_bh(&nf_conntrack_lock);
-
- return 0;
-}
-
-struct nf_conntrack_helper *
-__nf_conntrack_helper_find_byname(const char *name)
-{
- struct nf_conntrack_helper *h;
-
- list_for_each_entry(h, &helpers, list) {
- if (!strcmp(h->name, name))
- return h;
- }
-
- return NULL;
-}
-
-static inline int unhelp(struct nf_conntrack_tuple_hash *i,
- const struct nf_conntrack_helper *me)
-{
- if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
- nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
- nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
- }
- return 0;
-}
+ struct nf_conn_help *help = nfct_help(ct);
-void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
-{
- unsigned int i;
- struct nf_conntrack_expect *exp, *tmp;
-
- /* Need write lock here, to delete helper. */
- write_lock_bh(&nf_conntrack_lock);
- LIST_DELETE(&helpers, me);
+ /* Should be unconfirmed, so not in hash table yet */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
- /* Get rid of expectations */
- list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
- if (exp->master->helper == me && del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
- nf_conntrack_expect_put(exp);
- }
- }
+ pr_debug("Altering reply tuple of %p to ", ct);
+ nf_ct_dump_tuple(newreply);
- /* Get rid of expecteds, set helpers to NULL. */
- LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
- for (i = 0; i < nf_conntrack_htable_size; i++)
- LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
- struct nf_conntrack_tuple_hash *, me);
- write_unlock_bh(&nf_conntrack_lock);
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+ if (ct->master || (help && !hlist_empty(&help->expectations)))
+ return;
- /* Someone could be still looking at the helper in a bh. */
- synchronize_net();
+ rcu_read_lock();
+ __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
@@ -1380,91 +1241,125 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
unsigned long extra_jiffies,
int do_acct)
{
- int event = 0;
-
NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
NF_CT_ASSERT(skb);
- write_lock_bh(&nf_conntrack_lock);
+ /* Only update if this is not a fixed timeout */
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ goto acct;
/* If not in hash table, timer will not be active yet */
if (!nf_ct_is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- event = IPCT_REFRESH;
} else {
- /* Need del_timer for race avoidance (may already be dying). */
- if (del_timer(&ct->timeout)) {
- ct->timeout.expires = jiffies + extra_jiffies;
- add_timer(&ct->timeout);
- event = IPCT_REFRESH;
- }
+ unsigned long newtime = jiffies + extra_jiffies;
+
+ /* Only update the timeout if the new timeout is at least
+ HZ jiffies from the old timeout. Need del_timer for race
+ avoidance (may already be dying). */
+ if (newtime - ct->timeout.expires >= HZ)
+ mod_timer_pending(&ct->timeout, newtime);
}
-#ifdef CONFIG_NF_CT_ACCT
+acct:
if (do_acct) {
- ct->counters[CTINFO2DIR(ctinfo)].packets++;
- ct->counters[CTINFO2DIR(ctinfo)].bytes +=
- skb->len - (unsigned int)(skb->nh.raw - skb->data);
- if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
- || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
- event |= IPCT_COUNTER_FILLING;
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
}
-#endif
+}
+EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
+
+bool __nf_ct_kill_acct(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ int do_acct)
+{
+ if (do_acct) {
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
- write_unlock_bh(&nf_conntrack_lock);
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len - skb_network_offset(skb),
+ &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+ }
- /* must be unlocked when calling event cache */
- if (event)
- nf_conntrack_event_cache(event, skb);
+ if (del_timer(&ct->timeout)) {
+ ct->timeout.function((unsigned long)ct);
+ return true;
+ }
+ return false;
}
+EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
+ .len = sizeof(struct nf_conntrack_zone),
+ .align = __alignof__(struct nf_conntrack_zone),
+ .id = NF_CT_EXT_ZONE,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/mutex.h>
/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
* in ip_conntrack_core, since we don't want the protocols to autoload
* or depend on ctnetlink */
-int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
- &tuple->src.u.tcp.port);
- NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
- &tuple->dst.u.tcp.port);
+ if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
+ nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
+ goto nla_put_failure;
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
+EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
-static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
- [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t)
+const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
+ [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
};
+EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
-int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
struct nf_conntrack_tuple *t)
{
- if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
- return -EINVAL;
-
- if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+ if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
return -EINVAL;
- t->src.u.tcp.port =
- *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
- t->dst.u.tcp.port =
- *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+ t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
+ t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
return 0;
}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
+
+int nf_ct_port_nlattr_tuple_size(void)
+{
+ return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif
/* Used by ipt_REJECT and ip6t_REJECT. */
-void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -1472,7 +1367,7 @@ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
/* This ICMP is in reverse direction to the packet which caused it */
ct = nf_ct_get(skb, &ctinfo);
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
- ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+ ctinfo = IP_CT_RELATED_REPLY;
else
ctinfo = IP_CT_RELATED;
@@ -1482,257 +1377,501 @@ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
nf_conntrack_get(nskb->nfct);
}
-static inline int
-do_iter(const struct nf_conntrack_tuple_hash *i,
- int (*iter)(struct nf_conn *i, void *data),
- void *data)
-{
- return iter(nf_ct_tuplehash_to_ctrack(i), data);
-}
-
/* Bring out ya dead! */
-static struct nf_conntrack_tuple_hash *
-get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
+static struct nf_conn *
+get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
void *data, unsigned int *bucket)
{
- struct nf_conntrack_tuple_hash *h = NULL;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+ int cpu;
+ spinlock_t *lockp;
+
+ for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+ local_bh_disable();
+ spin_lock(lockp);
+ if (*bucket < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ goto found;
+ }
+ }
+ spin_unlock(lockp);
+ local_bh_enable();
+ }
- write_lock_bh(&nf_conntrack_lock);
- for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
- h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
- struct nf_conntrack_tuple_hash *, iter, data);
- if (h)
- break;
- }
- if (!h)
- h = LIST_FIND_W(&unconfirmed, do_iter,
- struct nf_conntrack_tuple_hash *, iter, data);
- if (h)
- atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
- write_unlock_bh(&nf_conntrack_lock);
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- return h;
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ set_bit(IPS_DYING_BIT, &ct->status);
+ }
+ spin_unlock_bh(&pcpu->lock);
+ }
+ return NULL;
+found:
+ atomic_inc(&ct->ct_general.use);
+ spin_unlock(lockp);
+ local_bh_enable();
+ return ct;
}
-void
-nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
+void nf_ct_iterate_cleanup(struct net *net,
+ int (*iter)(struct nf_conn *i, void *data),
+ void *data, u32 portid, int report)
{
- struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
unsigned int bucket = 0;
- while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
- death_by_timeout((unsigned long)ct);
+ nf_ct_delete(ct, portid, report);
+
/* ... else the timer will get him soon. */
nf_ct_put(ct);
}
}
+EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
static int kill_all(struct nf_conn *i, void *data)
{
return 1;
}
-static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
+void nf_ct_free_hashtable(void *hash, unsigned int size)
{
- if (vmalloced)
+ if (is_vmalloc_addr(hash))
vfree(hash);
else
- free_pages((unsigned long)hash,
- get_order(sizeof(struct list_head) * size));
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct hlist_head) * size));
}
+EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-void nf_conntrack_flush()
+void nf_conntrack_flush_report(struct net *net, u32 portid, int report)
{
- nf_ct_iterate_cleanup(kill_all, NULL);
+ nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report);
}
+EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void nf_conntrack_cleanup(void)
+static void nf_ct_release_dying_list(struct net *net)
{
- int i;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+ int cpu;
- ip_ct_attach = NULL;
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- /* This makes sure all current packets have passed through
- netfilter framework. Roll on, two-stage module
- delete... */
- synchronize_net();
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* never fails to remove them, no listeners at this point */
+ nf_ct_kill(ct);
+ }
+ spin_unlock_bh(&pcpu->lock);
+ }
+}
- nf_ct_event_cache_flush();
- i_see_dead_people:
- nf_conntrack_flush();
- if (atomic_read(&nf_conntrack_count) != 0) {
- schedule();
- goto i_see_dead_people;
+static int untrack_refs(void)
+{
+ int cnt = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+
+ cnt += atomic_read(&ct->ct_general.use) - 1;
}
- /* wait until all references to nf_conntrack_untracked are dropped */
- while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+ return cnt;
+}
+
+void nf_conntrack_cleanup_start(void)
+{
+ RCU_INIT_POINTER(ip_ct_attach, NULL);
+}
+
+void nf_conntrack_cleanup_end(void)
+{
+ RCU_INIT_POINTER(nf_ct_destroy, NULL);
+ while (untrack_refs() > 0)
schedule();
- for (i = 0; i < NF_CT_F_NUM; i++) {
- if (nf_ct_cache[i].use == 0)
- continue;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ nf_ct_extend_unregister(&nf_ct_zone_extend);
+#endif
+ nf_conntrack_proto_fini();
+ nf_conntrack_seqadj_fini();
+ nf_conntrack_labels_fini();
+ nf_conntrack_helper_fini();
+ nf_conntrack_timeout_fini();
+ nf_conntrack_ecache_fini();
+ nf_conntrack_tstamp_fini();
+ nf_conntrack_acct_fini();
+ nf_conntrack_expect_fini();
+}
- NF_CT_ASSERT(nf_ct_cache[i].use == 1);
- nf_ct_cache[i].use = 1;
- nf_conntrack_unregister_cache(i);
+/*
+ * Mishearing the voices in his head, our hero wonders how he's
+ * supposed to kill the mall.
+ */
+void nf_conntrack_cleanup_net(struct net *net)
+{
+ LIST_HEAD(single);
+
+ list_add(&net->exit_list, &single);
+ nf_conntrack_cleanup_net_list(&single);
+}
+
+void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
+{
+ int busy;
+ struct net *net;
+
+ /*
+ * This makes sure all current packets have passed through
+ * netfilter framework. Roll on, two-stage module
+ * delete...
+ */
+ synchronize_net();
+i_see_dead_people:
+ busy = 0;
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
+ nf_ct_release_dying_list(net);
+ if (atomic_read(&net->ct.count) != 0)
+ busy = 1;
+ }
+ if (busy) {
+ schedule();
+ goto i_see_dead_people;
+ }
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+ nf_conntrack_proto_pernet_fini(net);
+ nf_conntrack_helper_pernet_fini(net);
+ nf_conntrack_ecache_pernet_fini(net);
+ nf_conntrack_tstamp_pernet_fini(net);
+ nf_conntrack_acct_pernet_fini(net);
+ nf_conntrack_expect_pernet_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
+ free_percpu(net->ct.stat);
+ free_percpu(net->ct.pcpu_lists);
}
- kmem_cache_destroy(nf_conntrack_expect_cachep);
- free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
- nf_conntrack_htable_size);
-
- /* free l3proto protocol tables */
- for (i = 0; i < PF_MAX; i++)
- if (nf_ct_protos[i]) {
- kfree(nf_ct_protos[i]);
- nf_ct_protos[i] = NULL;
- }
}
-static struct list_head *alloc_hashtable(int size, int *vmalloced)
+void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
- struct list_head *hash;
- unsigned int i;
+ struct hlist_nulls_head *hash;
+ unsigned int nr_slots, i;
+ size_t sz;
- *vmalloced = 0;
- hash = (void*)__get_free_pages(GFP_KERNEL,
- get_order(sizeof(struct list_head)
- * size));
- if (!hash) {
- *vmalloced = 1;
+ BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
+ nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
+ sz = nr_slots * sizeof(struct hlist_nulls_head);
+ hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+ get_order(sz));
+ if (!hash) {
printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
- hash = vmalloc(sizeof(struct list_head) * size);
+ hash = vzalloc(sz);
}
- if (hash)
- for (i = 0; i < size; i++)
- INIT_LIST_HEAD(&hash[i]);
+ if (hash && nulls)
+ for (i = 0; i < nr_slots; i++)
+ INIT_HLIST_NULLS_HEAD(&hash[i], i);
return hash;
}
+EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
-int set_hashsize(const char *val, struct kernel_param *kp)
+/*
+ * Setter for the "hashsize" module parameter.
+ *
+ * Returns -EOPNOTSUPP when the calling task's network namespace is not
+ * init_net. While nf_conntrack_htable_size is still zero the value is
+ * simply stored via param_set_uint(). Otherwise val is parsed, a new
+ * table is allocated, every entry of init_net's table is moved into the
+ * new one, the tables are swapped and the old one is freed.
+ *
+ * Returns 0 on success, the kstrtouint() error, -EINVAL for a zero size
+ * or -ENOMEM if the new table cannot be allocated.
+ */
+int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
- int i, bucket, hashsize, vmalloced;
- int old_vmalloced, old_size;
- int rnd;
- struct list_head *hash, *old_hash;
+ int i, bucket, rc;
+ unsigned int hashsize, old_size;
+ struct hlist_nulls_head *hash, *old_hash;
struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+
+ if (current->nsproxy->net_ns != &init_net)
+ return -EOPNOTSUPP;
/* On boot, we can set this without any fancy locking. */
if (!nf_conntrack_htable_size)
return param_set_uint(val, kp);
- hashsize = simple_strtol(val, NULL, 0);
+ rc = kstrtouint(val, 0, &hashsize);
+ if (rc)
+ return rc;
if (!hashsize)
return -EINVAL;
- hash = alloc_hashtable(hashsize, &vmalloced);
+ /* hashsize may be rounded up by the allocator */
+ hash = nf_ct_alloc_hashtable(&hashsize, 1);
if (!hash)
return -ENOMEM;
- /* We have to rehahs for the new table anyway, so we also can
- * use a newrandom seed */
- get_random_bytes(&rnd, 4);
-
- write_lock_bh(&nf_conntrack_lock);
- for (i = 0; i < nf_conntrack_htable_size; i++) {
- while (!list_empty(&nf_conntrack_hash[i])) {
- h = list_entry(nf_conntrack_hash[i].next,
- struct nf_conntrack_tuple_hash, list);
- list_del(&h->list);
- bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
- list_add_tail(&h->list, &hash[bucket]);
+ local_bh_disable();
+ nf_conntrack_all_lock();
+ write_seqcount_begin(&init_net.ct.generation);
+
+ /* Lookups in the old hash might happen in parallel, which means we
+ * might get false negatives during connection lookup. New connections
+ * created because of a false negative won't make it into the hash
+ * though since that required taking the locks.
+ */
+
+ /* Drain each old bucket from its head, rehashing into the new table */
+ for (i = 0; i < init_net.ct.htable_size; i++) {
+ while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
+ h = hlist_nulls_entry(init_net.ct.hash[i].first,
+ struct nf_conntrack_tuple_hash, hnnode);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ hlist_nulls_del_rcu(&h->hnnode);
+ bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
+ hashsize);
+ hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = nf_conntrack_htable_size;
- old_vmalloced = nf_conntrack_vmalloc;
- old_hash = nf_conntrack_hash;
+ old_size = init_net.ct.htable_size;
+ old_hash = init_net.ct.hash;
- nf_conntrack_htable_size = hashsize;
- nf_conntrack_vmalloc = vmalloced;
- nf_conntrack_hash = hash;
- nf_conntrack_hash_rnd = rnd;
- write_unlock_bh(&nf_conntrack_lock);
+ init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
+ init_net.ct.hash = hash;
+
+ write_seqcount_end(&init_net.ct.generation);
+ nf_conntrack_all_unlock();
+ local_bh_enable();
- free_conntrack_hash(old_hash, old_vmalloced, old_size);
+ /* The old table is released only after the locks are dropped */
+ nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
+EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
-module_param_call(hashsize, set_hashsize, param_get_uint,
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
-int __init nf_conntrack_init(void)
+/* OR @bits into the status word of every possible CPU's
+ * nf_conntrack_untracked entry. */
+void nf_ct_untracked_status_or(unsigned long bits)
{
- unsigned int i;
- int ret;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(nf_conntrack_untracked, cpu).status |= bits;
+}
+EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
+
+/*
+ * First stage of global conntrack initialisation.
+ *
+ * Initialises the bucket spinlocks, picks a default hash table size when
+ * none was configured, derives nf_conntrack_max from it, and then brings
+ * up the expect, acct, tstamp, ecache, timeout, helper, labels, seqadj,
+ * (optionally) zone extension and proto parts in that order. Finally the
+ * per-cpu untracked conntrack entries are set up.
+ *
+ * Returns 0 on success. On failure the parts already initialised are
+ * torn down in reverse order and the failing call's error is returned.
+ */
+int nf_conntrack_init_start(void)
+{
+ int max_factor = 8;
+ int i, ret, cpu;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_init(&nf_conntrack_locks[i]);
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
- * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
+ * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
if (!nf_conntrack_htable_size) {
nf_conntrack_htable_size
- = (((num_physpages << PAGE_SHIFT) / 16384)
- / sizeof(struct list_head));
- if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
- nf_conntrack_htable_size = 8192;
- if (nf_conntrack_htable_size < 16)
- nf_conntrack_htable_size = 16;
+ = (((totalram_pages << PAGE_SHIFT) / 16384)
+ / sizeof(struct hlist_head));
+ if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ nf_conntrack_htable_size = 16384;
+ if (nf_conntrack_htable_size < 32)
+ nf_conntrack_htable_size = 32;
+
+ /* Use a max. factor of four by default to get the same max as
+ * with the old struct list_heads. When a table size is given
+ * we use the old value of 8 to avoid reducing the max.
+ * entries. */
+ max_factor = 4;
}
- nf_conntrack_max = 8 * nf_conntrack_htable_size;
+ nf_conntrack_max = max_factor * nf_conntrack_htable_size;
- printk("nf_conntrack version %s (%u buckets, %d max)\n",
+ printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
- nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
- &nf_conntrack_vmalloc);
- if (!nf_conntrack_hash) {
- printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
- goto err_out;
- }
+ ret = nf_conntrack_expect_init();
+ if (ret < 0)
+ goto err_expect;
- ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
- sizeof(struct nf_conn), NULL);
- if (ret < 0) {
- printk(KERN_ERR "Unable to create nf_conn slab cache\n");
- goto err_free_hash;
- }
+ ret = nf_conntrack_acct_init();
+ if (ret < 0)
+ goto err_acct;
- nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
- sizeof(struct nf_conntrack_expect),
- 0, 0, NULL, NULL);
- if (!nf_conntrack_expect_cachep) {
- printk(KERN_ERR "Unable to create nf_expect slab cache\n");
- goto err_free_conntrack_slab;
- }
+ ret = nf_conntrack_tstamp_init();
+ if (ret < 0)
+ goto err_tstamp;
- /* Don't NEED lock here, but good form anyway. */
- write_lock_bh(&nf_conntrack_lock);
- for (i = 0; i < PF_MAX; i++)
- nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
- write_unlock_bh(&nf_conntrack_lock);
+ ret = nf_conntrack_ecache_init();
+ if (ret < 0)
+ goto err_ecache;
- /* For use by REJECT target */
- ip_ct_attach = __nf_conntrack_attach;
+ ret = nf_conntrack_timeout_init();
+ if (ret < 0)
+ goto err_timeout;
+
+ ret = nf_conntrack_helper_init();
+ if (ret < 0)
+ goto err_helper;
+
+ ret = nf_conntrack_labels_init();
+ if (ret < 0)
+ goto err_labels;
- /* Set up fake conntrack:
- - to never be deleted, not in any hashes */
- atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
+ ret = nf_conntrack_seqadj_init();
+ if (ret < 0)
+ goto err_seqadj;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ ret = nf_ct_extend_register(&nf_ct_zone_extend);
+ if (ret < 0)
+ goto err_extend;
+#endif
+ ret = nf_conntrack_proto_init();
+ if (ret < 0)
+ goto err_proto;
+
+ /* Set up fake conntrack: to never be deleted, not in any hashes */
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+ write_pnet(&ct->ct_net, &init_net);
+ atomic_set(&ct->ct_general.use, 1);
+ }
/* - and look it like as a confirmed connection */
- set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
+ nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+ return 0;
+ /* Error unwinding: each label undoes everything set up before the
+ * step that jumped to it, falling through to the labels below. */
+err_proto:
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ nf_ct_extend_unregister(&nf_ct_zone_extend);
+err_extend:
+#endif
+ nf_conntrack_seqadj_fini();
+err_seqadj:
+ nf_conntrack_labels_fini();
+err_labels:
+ nf_conntrack_helper_fini();
+err_helper:
+ nf_conntrack_timeout_fini();
+err_timeout:
+ nf_conntrack_ecache_fini();
+err_ecache:
+ nf_conntrack_tstamp_fini();
+err_tstamp:
+ nf_conntrack_acct_fini();
+err_acct:
+ nf_conntrack_expect_fini();
+err_expect:
return ret;
+}
+
+/* Final stage of initialisation: publish the ip_ct_attach and
+ * nf_ct_destroy hook pointers. */
+void nf_conntrack_init_end(void)
+{
+ /* For use by REJECT target */
+ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
+ RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+}
+
+/*
+ * We need to use special "null" values, not used in hash table.
+ * They are the nulls values given to the per-cpu unconfirmed, dying and
+ * template list heads when those are initialised.
+ */
+#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
+#define DYING_NULLS_VAL ((1<<30)+1)
+#define TEMPLATE_NULLS_VAL ((1<<30)+2)
+
+/*
+ * Per-namespace conntrack initialisation.
+ *
+ * Sets up the connection counter and generation seqcount, the per-cpu
+ * lists, the per-cpu statistics, a namespace-specific slab cache, the
+ * conntrack hash table, and then the expect, acct, tstamp, ecache, helper
+ * and proto per-net parts.
+ *
+ * Returns 0 on success. On failure everything set up so far is released
+ * and -ENOMEM (allocation failures) or the failing per-net init's error
+ * is returned.
+ */
+int nf_conntrack_init_net(struct net *net)
+{
+ int ret = -ENOMEM;
+ int cpu;
+
+ atomic_set(&net->ct.count, 0);
+ seqcount_init(&net->ct.generation);
+
+ net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
+ if (!net->ct.pcpu_lists)
+ goto err_stat;
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_init(&pcpu->lock);
+ INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
+ }
+
+ net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+ if (!net->ct.stat)
+ goto err_pcpu_lists;
+
+ /* The slab cache name embeds the namespace pointer */
+ net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+ if (!net->ct.slabname)
+ goto err_slabname;
-err_free_conntrack_slab:
- nf_conntrack_unregister_cache(NF_CT_F_BASIC);
-err_free_hash:
- free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
- nf_conntrack_htable_size);
-err_out:
- return -ENOMEM;
+ net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!net->ct.nf_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ goto err_cache;
+ }
+
+ net->ct.htable_size = nf_conntrack_htable_size;
+ net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
+ if (!net->ct.hash) {
+ printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+ goto err_hash;
+ }
+ ret = nf_conntrack_expect_pernet_init(net);
+ if (ret < 0)
+ goto err_expect;
+ ret = nf_conntrack_acct_pernet_init(net);
+ if (ret < 0)
+ goto err_acct;
+ ret = nf_conntrack_tstamp_pernet_init(net);
+ if (ret < 0)
+ goto err_tstamp;
+ ret = nf_conntrack_ecache_pernet_init(net);
+ if (ret < 0)
+ goto err_ecache;
+ ret = nf_conntrack_helper_pernet_init(net);
+ if (ret < 0)
+ goto err_helper;
+ ret = nf_conntrack_proto_pernet_init(net);
+ if (ret < 0)
+ goto err_proto;
+ return 0;
+
+ /* Unwind in reverse order of setup. Note the last two label names:
+ * err_pcpu_lists is the target when the stat allocation failed (it
+ * frees pcpu_lists) and err_stat is the target when the pcpu_lists
+ * allocation failed (nothing to free). */
+err_proto:
+ nf_conntrack_helper_pernet_fini(net);
+err_helper:
+ nf_conntrack_ecache_pernet_fini(net);
+err_ecache:
+ nf_conntrack_tstamp_pernet_fini(net);
+err_tstamp:
+ nf_conntrack_acct_pernet_fini(net);
+err_acct:
+ nf_conntrack_expect_pernet_fini(net);
+err_expect:
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+err_hash:
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
+ kfree(net->ct.slabname);
+err_slabname:
+ free_percpu(net->ct.stat);
+err_pcpu_lists:
+ free_percpu(net->ct.pcpu_lists);
+err_stat:
+ return ret;
}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
new file mode 100644
index 00000000000..1df17614656
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -0,0 +1,261 @@
+/* Event cache for netfilter. */
+
+/*
+ * (C) 2005 Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 Patrick McHardy <kaber@trash.net>
+ * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static DEFINE_MUTEX(nf_ct_ecache_mutex);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs
+ *
+ * Does nothing when the namespace has no event notifier, when the
+ * conntrack has no event cache extension, when the conntrack is
+ * unconfirmed or dying, or when no pending or previously missed event is
+ * selected by the extension's ctmask. */
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ unsigned long events, missed;
+ struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ int ret;
+
+ rcu_read_lock();
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (notify == NULL)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ goto out_unlock;
+
+ /* Atomically fetch the pending events and reset the cache to 0 */
+ events = xchg(&e->cache, 0);
+
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events)
+ goto out_unlock;
+
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely. */
+ missed = e->missed;
+
+ if (!((events | missed) & e->ctmask))
+ goto out_unlock;
+
+ item.ct = ct;
+ item.portid = 0;
+ item.report = 0;
+
+ ret = notify->fcn(events | missed, &item);
+
+ /* Common case: delivered, and no earlier missed events to clear */
+ if (likely(ret >= 0 && !missed))
+ goto out_unlock;
+
+ /* On failure remember these events as missed; on success clear the
+ * missed bits that were just delivered. */
+ spin_lock_bh(&ct->lock);
+ if (ret < 0)
+ e->missed |= events;
+ else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+
+out_unlock:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+
+/*
+ * Install @new as the conntrack event notifier of @net.
+ *
+ * A namespace holds at most one notifier: returns -EBUSY when one is
+ * already installed, 0 once @new has been published.
+ */
+int nf_conntrack_register_notifier(struct net *net,
+				   struct nf_ct_event_notifier *new)
+{
+	struct nf_ct_event_notifier *cur;
+	int err = 0;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+	cur = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	if (cur == NULL)
+		rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
+	else
+		err = -EBUSY;
+	mutex_unlock(&nf_ct_ecache_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
+
+/*
+ * Remove the conntrack event notifier of @net.
+ * @new must be the notifier currently installed, otherwise BUG_ON fires.
+ */
+void nf_conntrack_unregister_notifier(struct net *net,
+				      struct nf_ct_event_notifier *new)
+{
+	struct nf_ct_event_notifier *cur;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+	cur = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	BUG_ON(cur != new);
+	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
+	mutex_unlock(&nf_ct_ecache_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
+
+/*
+ * Install @new as the expectation event notifier of @net.
+ *
+ * A namespace holds at most one notifier: returns -EBUSY when one is
+ * already installed, 0 once @new has been published.
+ */
+int nf_ct_expect_register_notifier(struct net *net,
+				   struct nf_exp_event_notifier *new)
+{
+	struct nf_exp_event_notifier *cur;
+	int err = 0;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+	cur = rcu_dereference_protected(net->ct.nf_expect_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	if (cur == NULL)
+		rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
+	else
+		err = -EBUSY;
+	mutex_unlock(&nf_ct_ecache_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
+
+/*
+ * Remove the expectation event notifier of @net.
+ * @new must be the notifier currently installed, otherwise BUG_ON fires.
+ */
+void nf_ct_expect_unregister_notifier(struct net *net,
+				      struct nf_exp_event_notifier *new)
+{
+	struct nf_exp_event_notifier *cur;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+	cur = rcu_dereference_protected(net->ct.nf_expect_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	BUG_ON(cur != new);
+	RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
+	mutex_unlock(&nf_ct_ecache_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+
+/* Defaults copied into each namespace's sysctl_events and
+ * sysctl_events_retry_timeout fields at per-net init time. */
+#define NF_CT_EVENTS_DEFAULT 1
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
+
+#ifdef CONFIG_SYSCTL
+/* Template sysctl table. It is duplicated per namespace and the .data
+ * pointers of entries 0 and 1 are redirected to that namespace's fields,
+ * so the order of the entries matters. */
+static struct ctl_table event_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_events",
+ .data = &init_net.ct.sysctl_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_events_retry_timeout",
+ .data = &init_net.ct.sysctl_events_retry_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+/* Conntrack extension descriptor for the event cache: size and alignment
+ * of struct nf_conntrack_ecache, registered under NF_CT_EXT_ECACHE. */
+static struct nf_ct_ext_type event_extend __read_mostly = {
+ .len = sizeof(struct nf_conntrack_ecache),
+ .align = __alignof__(struct nf_conntrack_ecache),
+ .id = NF_CT_EXT_ECACHE,
+};
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the event sysctls for @net under "net/netfilter".
+ *
+ * Works on a private copy of event_sysctl_table whose data pointers are
+ * redirected to @net's own fields. Returns 0 on success and -ENOMEM if
+ * the copy or the registration fails.
+ */
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
+		      GFP_KERNEL);
+	if (!tbl)
+		return -ENOMEM;
+
+	tbl[0].data = &net->ct.sysctl_events;
+	tbl[1].data = &net->ct.sysctl_events_retry_timeout;
+
+	/* Don't export sysctls to unprivileged users */
+	if (net->user_ns != &init_user_ns)
+		tbl[0].procname = NULL;
+
+	net->ct.event_sysctl_header =
+		register_net_sysctl(net, "net/netfilter", tbl);
+	if (net->ct.event_sysctl_header)
+		return 0;
+
+	printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+	kfree(tbl);
+	return -ENOMEM;
+}
+
+/* Unregister @net's event sysctls and free the table copy made at init. */
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ /* Fetch the table pointer before the header is unregistered */
+ table = net->ct.event_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.event_sysctl_header);
+ kfree(table);
+}
+#else
+/* Without CONFIG_SYSCTL there is nothing to register; always succeeds. */
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+/* Without CONFIG_SYSCTL there is nothing to unregister. */
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Per-namespace event cache setup: seed @net's event sysctl values from
+ * the module defaults, then register the sysctls.
+ * Returns the result of the sysctl registration.
+ */
+int nf_conntrack_ecache_pernet_init(struct net *net)
+{
+	int err;
+
+	net->ct.sysctl_events = nf_ct_events;
+	net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
+
+	err = nf_conntrack_event_init_sysctl(net);
+	return err;
+}
+
+/* Per-namespace event cache teardown: removes @net's event sysctls. */
+void nf_conntrack_ecache_pernet_fini(struct net *net)
+{
+ nf_conntrack_event_fini_sysctl(net);
+}
+
+/*
+ * Global event cache setup: register the ecache conntrack extension.
+ * Returns the registration result; a failure is also logged.
+ */
+int nf_conntrack_ecache_init(void)
+{
+	int err;
+
+	err = nf_ct_extend_register(&event_extend);
+	if (err >= 0)
+		return err;
+
+	pr_err("nf_ct_event: Unable to register event extension.\n");
+	return err;
+}
+
+/* Global event cache teardown: unregister the ecache extension. */
+void nf_conntrack_ecache_fini(void)
+{
+ nf_ct_extend_unregister(&event_extend);
+}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
new file mode 100644
index 00000000000..f87e8f68ad4
--- /dev/null
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -0,0 +1,656 @@
+/* Expectation handling for nf_conntrack. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (c) 2005-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/* Number of buckets in each namespace's expectation hash table; settable
+ * through the expect_hashsize module parameter, otherwise derived from
+ * the conntrack table size at init. */
+unsigned int nf_ct_expect_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+
+/* Limit on a namespace's expect_count, checked before inserting */
+unsigned int nf_ct_expect_max __read_mostly;
+
+/* Slab cache backing struct nf_conntrack_expect allocations */
+static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+
+/* nf_conntrack_expect helper functions */
+/*
+ * Unlink @exp from the namespace expectation hash and from its master's
+ * expectation list, update both counters, report IPEXP_DESTROY with the
+ * given @portid/@report, and drop one reference on @exp.
+ *
+ * The expectation's timer must not be pending when this is called (see
+ * the assertion below).
+ */
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+ u32 portid, int report)
+{
+ struct nf_conn_help *master_help = nfct_help(exp->master);
+ struct net *net = nf_ct_exp_net(exp);
+
+ NF_CT_ASSERT(master_help);
+ NF_CT_ASSERT(!timer_pending(&exp->timeout));
+
+ /* Remove from the per-namespace hash */
+ hlist_del_rcu(&exp->hnode);
+ net->ct.expect_count--;
+
+ /* Remove from the master's own list */
+ hlist_del(&exp->lnode);
+ master_help->expecting[exp->class]--;
+
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
+ nf_ct_expect_put(exp);
+
+ NF_CT_STAT_INC(net, expect_delete);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
+
+/*
+ * Timer callback for an expectation; @ul_expect is the expectation
+ * pointer passed as the timer data. Unlinks the expectation under
+ * nf_conntrack_expect_lock and then drops a further reference
+ * (presumably the one taken for the timer at insert time).
+ */
+static void nf_ct_expectation_timed_out(unsigned long ul_expect)
+{
+ struct nf_conntrack_expect *exp = (void *)ul_expect;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ nf_ct_unlink_expect(exp);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ nf_ct_expect_put(exp);
+}
+
+/*
+ * Map the destination part of @tuple to a bucket index in
+ * [0, nf_ct_expect_hsize).
+ *
+ * The destination address words are hashed with jhash2(); the protocol
+ * number, l3 family and destination port are folded into the initial
+ * value together with the random conntrack hash seed, which is set up on
+ * first use.
+ */
+static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+{
+	unsigned int initval;
+	unsigned int hash;
+
+	if (unlikely(!nf_conntrack_hash_rnd))
+		init_nf_conntrack_hash_rnd();
+
+	initval = (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
+		   (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd;
+	hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
+		      initval);
+
+	/* Scale the 32-bit hash down to the table size without a modulo */
+	return ((u64)hash * nf_ct_expect_hsize) >> 32;
+}
+
+/*
+ * Look up an expectation in @net matching @tuple (under the
+ * expectation's own mask) whose master conntrack is in @zone.
+ *
+ * The bucket is walked with the RCU list iterator and no reference is
+ * taken on the result. Returns the first match or NULL.
+ */
+struct nf_conntrack_expect *
+__nf_ct_expect_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i;
+ unsigned int h;
+
+ /* Skip hashing when the namespace has no expectations at all */
+ if (!net->ct.expect_count)
+ return NULL;
+
+ h = nf_ct_expect_dst_hash(tuple);
+ hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
+ if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ nf_ct_zone(i->master) == zone)
+ return i;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
+
+/*
+ * Find an expectation corresponding to a tuple and take a reference on
+ * it. Returns NULL when nothing matches or when the match's use count
+ * has already dropped to zero.
+ */
+struct nf_conntrack_expect *
+nf_ct_expect_find_get(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *exp;
+
+	rcu_read_lock();
+	exp = __nf_ct_expect_find(net, zone, tuple);
+	if (exp != NULL && !atomic_inc_not_zero(&exp->use))
+		exp = NULL;
+	rcu_read_unlock();
+
+	return exp;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
+
+/* If an expectation for this connection is found, it gets deleted from
+ * the global list and is then returned.
+ *
+ * Inactive expectations are skipped. On success a reference on the
+ * expectation's master conntrack has been taken. A permanent expectation
+ * stays in the hash and only gains a reference; any other expectation is
+ * unlinked, provided its timer could be stopped. Returns NULL if nothing
+ * usable is found.
+ *
+ * NOTE(review): the bucket is walked with the non-RCU iterator, so the
+ * caller presumably holds nf_conntrack_expect_lock - confirm at call
+ * sites. */
+struct nf_conntrack_expect *
+nf_ct_find_expectation(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i, *exp = NULL;
+ unsigned int h;
+
+ if (!net->ct.expect_count)
+ return NULL;
+
+ h = nf_ct_expect_dst_hash(tuple);
+ hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
+ if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
+ nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ nf_ct_zone(i->master) == zone) {
+ exp = i;
+ break;
+ }
+ }
+ if (!exp)
+ return NULL;
+
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+ master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (!nf_ct_is_confirmed(exp->master))
+ return NULL;
+
+ /* Avoid race with other CPUs, that for exp->master ct, is
+ * about to invoke ->destroy(), or nf_ct_delete() via timeout
+ * or early_drop().
+ *
+ * The atomic_inc_not_zero() check tells: If that fails, we
+ * know that the ct is being destroyed. If it succeeds, we
+ * can be sure the ct cannot disappear underneath.
+ */
+ if (unlikely(nf_ct_is_dying(exp->master) ||
+ !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ return NULL;
+
+ if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+ atomic_inc(&exp->use);
+ return exp;
+ } else if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ return exp;
+ }
+ /* Undo exp->master refcnt increase, if del_timer() failed */
+ nf_ct_put(exp->master);
+
+ return NULL;
+}
+
+/* Delete all expectations for this conntrack.
+ *
+ * Walks @ct's helper expectation list under nf_conntrack_expect_lock.
+ * An expectation is unlinked and released only if its pending timer
+ * could be stopped here; others are left alone. */
+void nf_ct_remove_expectations(struct nf_conn *ct)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *next;
+
+ /* Optimization: most connection never expect any others. */
+ if (!help)
+ return;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ /* _safe iterator: entries are removed while walking */
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
+
+/*
+ * Would two expected things clash?
+ *
+ * Two expectations clash when their tuples are equal on the part covered
+ * by both masks, i.e. when compared under the intersection of the masks.
+ */
+static inline int expect_clash(const struct nf_conntrack_expect *a,
+			       const struct nf_conntrack_expect *b)
+{
+	struct nf_conntrack_tuple_mask both;
+	int word;
+
+	both.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
+
+	for (word = 0; word < NF_CT_TUPLE_L3SIZE; word++)
+		both.src.u3.all[word] = a->mask.src.u3.all[word] &
+					b->mask.src.u3.all[word];
+
+	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &both);
+}
+
+/*
+ * Are @a and @b the same expectation? They must share master and class
+ * and have equal tuples, equal masks and equal master zones.
+ */
+static inline int expect_matches(const struct nf_conntrack_expect *a,
+				 const struct nf_conntrack_expect *b)
+{
+	if (a->master != b->master || a->class != b->class)
+		return 0;
+	if (!nf_ct_tuple_equal(&a->tuple, &b->tuple))
+		return 0;
+	if (!nf_ct_tuple_mask_equal(&a->mask, &b->mask))
+		return 0;
+
+	return nf_ct_zone(a->master) == nf_ct_zone(b->master);
+}
+
+/* Generally a bad idea to call this: could have matched already.
+ *
+ * Withdraws @exp under nf_conntrack_expect_lock. The expectation is only
+ * unlinked and released if its pending timer could be stopped here. */
+void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
+
+/*
+ * Allocate a new expectation whose master is @me, with a use count of 1.
+ * Returns NULL if the atomic allocation fails.
+ *
+ * We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself.
+ */
+struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
+{
+	struct nf_conntrack_expect *exp;
+
+	exp = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
+	if (exp) {
+		exp->master = me;
+		atomic_set(&exp->use, 1);
+	}
+	return exp;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
+
+/*
+ * Fill in the tuple and mask of @exp.
+ *
+ * @family selects the address length (4 bytes for AF_INET, 16 otherwise).
+ * @saddr and @src may be NULL, meaning "any": tuple and mask for that
+ * part are then zeroed. @daddr and @dst are always dereferenced. Flags,
+ * expectfn and helper are reset, and the saved NAT fields are cleared
+ * when NAT support is built in.
+ */
+void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
+		       u_int8_t family,
+		       const union nf_inet_addr *saddr,
+		       const union nf_inet_addr *daddr,
+		       u_int8_t proto, const __be16 *src, const __be16 *dst)
+{
+	int len = (family == AF_INET) ? 4 : 16;
+
+	exp->flags = 0;
+	exp->class = class;
+	exp->expectfn = NULL;
+	exp->helper = NULL;
+	exp->tuple.src.l3num = family;
+	exp->tuple.dst.protonum = proto;
+
+	/* Source address: wildcard when not given */
+	if (!saddr) {
+		memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
+		memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
+	} else {
+		memcpy(&exp->tuple.src.u3, saddr, len);
+		/* address needs to be cleared for nf_ct_tuple_equal */
+		if (sizeof(exp->tuple.src.u3) > len)
+			memset((void *)&exp->tuple.src.u3 + len, 0x00,
+			       sizeof(exp->tuple.src.u3) - len);
+		memset(&exp->mask.src.u3, 0xFF, len);
+		if (sizeof(exp->mask.src.u3) > len)
+			memset((void *)&exp->mask.src.u3 + len, 0x00,
+			       sizeof(exp->mask.src.u3) - len);
+	}
+
+	/* Source port: wildcard when not given */
+	if (!src) {
+		exp->tuple.src.u.all = 0;
+		exp->mask.src.u.all = 0;
+	} else {
+		exp->tuple.src.u.all = *src;
+		exp->mask.src.u.all = htons(0xFFFF);
+	}
+
+	/* Destination address and port are mandatory */
+	memcpy(&exp->tuple.dst.u3, daddr, len);
+	/* address needs to be cleared for nf_ct_tuple_equal */
+	if (sizeof(exp->tuple.dst.u3) > len)
+		memset((void *)&exp->tuple.dst.u3 + len, 0x00,
+		       sizeof(exp->tuple.dst.u3) - len);
+
+	exp->tuple.dst.u.all = *dst;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+	memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
+	memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_init);
+
+/* RCU callback: return the expectation embedding @head to its slab. */
+static void nf_ct_expect_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(nf_ct_expect_cachep,
+			container_of(head, struct nf_conntrack_expect, rcu));
+}
+
+/*
+ * Drop one reference on @exp. When the last reference goes away the
+ * expectation is freed through call_rcu().
+ */
+void nf_ct_expect_put(struct nf_conntrack_expect *exp)
+{
+	if (!atomic_dec_and_test(&exp->use))
+		return;
+
+	call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_put);
+
+/*
+ * Link @exp into its master's expectation list and into the namespace
+ * hash, and start its timeout timer. Always returns 0.
+ *
+ * The lockdep annotation below shows this runs with
+ * nf_conntrack_expect_lock held.
+ */
+static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+{
+ struct nf_conn_help *master_help = nfct_help(exp->master);
+ struct nf_conntrack_helper *helper;
+ struct net *net = nf_ct_exp_net(exp);
+ unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
+
+ /* two references : one for hash insert, one for the timer */
+ atomic_add(2, &exp->use);
+
+ hlist_add_head(&exp->lnode, &master_help->expectations);
+ master_help->expecting[exp->class]++;
+
+ hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+ net->ct.expect_count++;
+
+ setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
+ (unsigned long)exp);
+ helper = rcu_dereference_protected(master_help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
+ /* The expiry is only set here when the master has a helper; without
+ * one, timeout.expires is left as the caller provided it.
+ * NOTE(review): confirm helper-less callers set it beforehand. */
+ if (helper) {
+ exp->timeout.expires = jiffies +
+ helper->expect_policy[exp->class].timeout * HZ;
+ }
+ add_timer(&exp->timeout);
+
+ NF_CT_STAT_INC(net, expect_create);
+ return 0;
+}
+
+/* Race with expectations being used means we could have none to find; OK.
+ *
+ * Evicts the last entry of @master's expectation list that has the same
+ * class as @new. New expectations are added at the head of that list, so
+ * the last matching entry is the oldest one. The entry is only removed
+ * if its pending timer could be stopped here. */
+static void evict_oldest_expect(struct nf_conn *master,
+ struct nf_conntrack_expect *new)
+{
+ struct nf_conn_help *master_help = nfct_help(master);
+ struct nf_conntrack_expect *exp, *last = NULL;
+
+ hlist_for_each_entry(exp, &master_help->expectations, lnode) {
+ if (exp->class == new->class)
+ last = exp;
+ }
+
+ if (last && del_timer(&last->timeout)) {
+ nf_ct_unlink_expect(last);
+ nf_ct_expect_put(last);
+ }
+}
+
+/*
+ * Decide whether @expect may be inserted.
+ *
+ * Returns 1 when insertion may proceed, otherwise a negative errno:
+ * -ESHUTDOWN when the master has no helper extension, -EBUSY when
+ * @expect clashes with an existing expectation, -EMFILE when the
+ * per-class or the global limit is reached.
+ *
+ * Side effects: an existing identical expectation is removed (if its
+ * timer could be stopped), and the oldest expectation of the class may
+ * be evicted to make room.
+ */
+static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
+{
+ const struct nf_conntrack_expect_policy *p;
+ struct nf_conntrack_expect *i;
+ struct nf_conn *master = expect->master;
+ struct nf_conn_help *master_help = nfct_help(master);
+ struct nf_conntrack_helper *helper;
+ struct net *net = nf_ct_exp_net(expect);
+ struct hlist_node *next;
+ unsigned int h;
+ int ret = 1;
+
+ if (!master_help) {
+ ret = -ESHUTDOWN;
+ goto out;
+ }
+ h = nf_ct_expect_dst_hash(&expect->tuple);
+ hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
+ if (expect_matches(i, expect)) {
+ /* Identical expectation: replace it if we can stop
+ * its timer, then stop scanning. */
+ if (del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ nf_ct_expect_put(i);
+ break;
+ }
+ } else if (expect_clash(i, expect)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+ /* Will be over limit? */
+ helper = rcu_dereference_protected(master_help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
+ if (helper) {
+ p = &helper->expect_policy[expect->class];
+ if (p->max_expected &&
+ master_help->expecting[expect->class] >= p->max_expected) {
+ evict_oldest_expect(master, expect);
+ /* Re-check: the eviction may not have freed a slot */
+ if (master_help->expecting[expect->class]
+ >= p->max_expected) {
+ ret = -EMFILE;
+ goto out;
+ }
+ }
+ }
+
+ if (net->ct.expect_count >= nf_ct_expect_max) {
+ net_warn_ratelimited("nf_conntrack: expectation table full\n");
+ ret = -EMFILE;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Register @expect: run the admission check and, if it passes, insert
+ * the expectation, all under nf_conntrack_expect_lock. An IPEXP_NEW
+ * event carrying @portid/@report is sent after the lock is dropped, and
+ * only when the insertion took place.
+ *
+ * Returns the insertion result on success, otherwise the non-positive
+ * result of the check or the negative insertion error.
+ */
+int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+				u32 portid, int report)
+{
+	int inserted = 0;
+	int ret;
+
+	spin_lock_bh(&nf_conntrack_expect_lock);
+	ret = __nf_ct_expect_check(expect);
+	if (ret > 0) {
+		ret = nf_ct_expect_insert(expect);
+		inserted = (ret >= 0);
+	}
+	spin_unlock_bh(&nf_conntrack_expect_lock);
+
+	if (inserted)
+		nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
+
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+/* Private seq_file iterator state for the expectation proc file */
+struct ct_expect_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket; /* hash bucket currently being walked */
+};
+
+/* Return the first node of the first non-empty bucket of the namespace's
+ * expectation hash, or NULL if every bucket is empty. Leaves st->bucket
+ * at the bucket the node was found in. */
+static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+ struct hlist_node *n;
+
+ for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
+ n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ if (n)
+ return n;
+ }
+ return NULL;
+}
+
+/* Return the node following @head, moving on to later buckets when the
+ * current chain ends. Returns NULL once the last bucket is exhausted. */
+static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+ struct hlist_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_next_rcu(head));
+ while (head == NULL) {
+ if (++st->bucket >= nf_ct_expect_hsize)
+ return NULL;
+ head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ }
+ return head;
+}
+
+/*
+ * Return the node at position @pos (counting from 0) in iteration order,
+ * or NULL when the table holds @pos entries or fewer.
+ */
+static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node = ct_expect_get_first(seq);
+
+	/* Step forward once per remaining position, stopping at the end */
+	while (node && pos) {
+		node = ct_expect_get_next(seq, node);
+		if (node)
+			pos--;
+	}
+
+	return pos ? NULL : node;
+}
+
+/* seq_file ->start: enter the RCU read-side section (left again in
+ * exp_seq_stop) and position the iterator at *pos. */
+static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ct_expect_get_idx(seq, *pos);
+}
+
+/* seq_file ->next: bump *pos and advance to the node after @v. */
+static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_expect_get_next(seq, v);
+}
+
+/* seq_file ->stop: leave the RCU read-side section entered in ->start. */
+static void exp_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show: print one expectation on a single line.
+ *
+ * Output order: remaining timeout in seconds (0 if the timer is not
+ * pending, "-" if no timer function is set), l3/l4 protocol numbers, the
+ * tuple, a comma-separated flag list, then the helper name and, when the
+ * policy has one, "/<policy name>". Returns the result of the final
+ * seq_putc().
+ */
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_expect *expect;
+ struct nf_conntrack_helper *helper;
+ struct hlist_node *n = v;
+ char *delim = "";
+
+ expect = hlist_entry(n, struct nf_conntrack_expect, hnode);
+
+ if (expect->timeout.function)
+ seq_printf(s, "%ld ", timer_pending(&expect->timeout)
+ ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
+ else
+ seq_printf(s, "- ");
+ seq_printf(s, "l3proto = %u proto=%u ",
+ expect->tuple.src.l3num,
+ expect->tuple.dst.protonum);
+ print_tuple(s, &expect->tuple,
+ __nf_ct_l3proto_find(expect->tuple.src.l3num),
+ __nf_ct_l4proto_find(expect->tuple.src.l3num,
+ expect->tuple.dst.protonum));
+
+ /* delim becomes "," once the first flag name has been printed */
+ if (expect->flags & NF_CT_EXPECT_PERMANENT) {
+ seq_printf(s, "PERMANENT");
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_INACTIVE) {
+ seq_printf(s, "%sINACTIVE", delim);
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_USERSPACE)
+ seq_printf(s, "%sUSERSPACE", delim);
+
+ helper = rcu_dereference(nfct_help(expect->master)->helper);
+ if (helper) {
+ /* A space separates the helper name from any flags printed */
+ seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
+ if (helper->expect_policy[expect->class].name)
+ seq_printf(s, "/%s",
+ helper->expect_policy[expect->class].name);
+ }
+
+ return seq_putc(s, '\n');
+}
+
+/* seq_file iterator callbacks for the expectation proc file */
+static const struct seq_operations exp_seq_ops = {
+ .start = exp_seq_start,
+ .next = exp_seq_next,
+ .stop = exp_seq_stop,
+ .show = exp_seq_show
+};
+
+/* ->open: set up a per-namespace seq_file using exp_seq_ops with a
+ * struct ct_expect_iter_state as private iterator state. */
+static int exp_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &exp_seq_ops,
+ sizeof(struct ct_expect_iter_state));
+}
+
+/* File operations for the "nf_conntrack_expect" proc entry */
+static const struct file_operations exp_file_ops = {
+ .owner = THIS_MODULE,
+ .open = exp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+
+/*
+ * Create the "nf_conntrack_expect" entry (mode 0440) in @net's proc
+ * directory when procfs support is configured.
+ * Returns 0 on success or when procfs support is off, -ENOMEM otherwise.
+ */
+static int exp_proc_init(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+	if (!proc_create("nf_conntrack_expect", 0440, net->proc_net,
+			 &exp_file_ops))
+		return -ENOMEM;
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+	return 0;
+}
+
+/* Remove @net's "nf_conntrack_expect" proc entry (no-op without procfs
+ * support). */
+static void exp_proc_remove(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+ remove_proc_entry("nf_conntrack_expect", net->proc_net);
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+}
+
+/* Expose nf_ct_expect_hsize as module parameter "expect_hashsize" (mode 0400) */
+module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
+
+/*
+ * Per-namespace expectation setup: reset the counter, allocate the
+ * expectation hash and create the proc entry.
+ *
+ * Returns 0 on success, -ENOMEM if the hash cannot be allocated, or the
+ * proc setup error (the hash is freed again in that case).
+ */
+int nf_conntrack_expect_pernet_init(struct net *net)
+{
+	int err;
+
+	net->ct.expect_count = 0;
+	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
+	if (net->ct.expect_hash == NULL)
+		return -ENOMEM;
+
+	err = exp_proc_init(net);
+	if (err < 0) {
+		nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Per-namespace expectation teardown: remove the proc entry, then free
+ * the expectation hash. */
+void nf_conntrack_expect_pernet_fini(struct net *net)
+{
+ exp_proc_remove(net);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+}
+
+/*
+ * Global expectation setup.
+ *
+ * Unless a hash size was configured, it defaults to 1/256 of the
+ * conntrack table size (at least 1). The expectation limit is four times
+ * the hash size. Also creates the expectation slab cache.
+ * Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+int nf_conntrack_expect_init(void)
+{
+	if (!nf_ct_expect_hsize) {
+		unsigned int derived = nf_conntrack_htable_size / 256;
+
+		nf_ct_expect_hsize = derived ? derived : 1;
+	}
+	nf_ct_expect_max = nf_ct_expect_hsize * 4;
+
+	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+				sizeof(struct nf_conntrack_expect),
+				0, 0, NULL);
+
+	return nf_ct_expect_cachep ? 0 : -ENOMEM;
+}
+
+/* Global expectation teardown: destroy the slab cache once all pending
+ * RCU callbacks, which may still free expectations into it, have run. */
+void nf_conntrack_expect_fini(void)
+{
+ rcu_barrier(); /* Wait for call_rcu() before destroy */
+ kmem_cache_destroy(nf_ct_expect_cachep);
+}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
new file mode 100644
index 00000000000..1a9545965c0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -0,0 +1,191 @@
+/* Structure dynamic extension infrastructure
+ * Copyright (C) 2004 Rusty Russell IBM Corporation
+ * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
+static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+
+void __nf_ct_ext_destroy(struct nf_conn *ct) /* call the destroy hook of every extension present on ct; ct->ext itself is not freed here */
+{
+ unsigned int i;
+ struct nf_ct_ext_type *t;
+ struct nf_ct_ext *ext = ct->ext;
+
+ for (i = 0; i < NF_CT_EXT_NUM; i++) {
+ if (!__nf_ct_ext_exist(ext, i))
+ continue; /* this conntrack does not carry extension i */
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[i]);
+
+ /* Here the nf_ct_ext_type might have been unregistered.
+ * I.e., the type is responsible for cleaning up its private
+ * area in all conntracks when it is unregistered.
+ */
+ if (t && t->destroy)
+ t->destroy(ct);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(__nf_ct_ext_destroy);
+
+static void *
+nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
+ size_t var_alloc_len, gfp_t gfp) /* allocate a new zeroed ext area into *ext with extension 'id' placed first; returns its data pointer or NULL */
+{
+ unsigned int off, len;
+ struct nf_ct_ext_type *t;
+ size_t alloc_size;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[id]);
+ BUG_ON(t == NULL); /* the type must be registered */
+ off = ALIGN(sizeof(struct nf_ct_ext), t->align); /* data starts after the header, aligned for this type */
+ len = off + t->len + var_alloc_len; /* bytes in use once this extension is placed */
+ alloc_size = t->alloc_size + var_alloc_len; /* bytes to allocate, from the type's precomputed alloc_size */
+ rcu_read_unlock();
+
+ *ext = kzalloc(alloc_size, gfp);
+ if (!*ext)
+ return NULL;
+
+ (*ext)->offset[id] = off;
+ (*ext)->len = len;
+
+ return (void *)(*ext) + off;
+}
+
+void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
+ size_t var_alloc_len, gfp_t gfp) /* add extension 'id' to ct; returns its zeroed data area, or NULL if already present or out of memory */
+{
+ struct nf_ct_ext *old, *new;
+ int i, newlen, newoff;
+ struct nf_ct_ext_type *t;
+
+ /* Conntrack must not be confirmed to avoid races on reallocation. */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+
+ old = ct->ext;
+ if (!old)
+ return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp); /* first extension on this conntrack */
+
+ if (__nf_ct_ext_exist(old, id))
+ return NULL; /* 'id' is already present: it is not added twice */
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[id]);
+ BUG_ON(t == NULL); /* the type must be registered */
+
+ newoff = ALIGN(old->len, t->align); /* append after the bytes already in use */
+ newlen = newoff + t->len + var_alloc_len;
+ rcu_read_unlock();
+
+ new = __krealloc(old, newlen, gfp);
+ if (!new)
+ return NULL; /* ct->ext still points at 'old' */
+
+ if (new != old) { /* the area was relocated: let each present type fix up its data */
+ for (i = 0; i < NF_CT_EXT_NUM; i++) {
+ if (!__nf_ct_ext_exist(old, i))
+ continue;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[i]);
+ if (t && t->move)
+ t->move((void *)new + new->offset[i],
+ (void *)old + old->offset[i]);
+ rcu_read_unlock();
+ }
+ kfree_rcu(old, rcu); /* the old area is released through RCU */
+ ct->ext = new;
+ }
+
+ new->offset[id] = newoff;
+ new->len = newlen;
+ memset((void *)new + newoff, 0, newlen - newoff); /* zero only the newly added extension */
+ return (void *)new + newoff;
+}
+EXPORT_SYMBOL(__nf_ct_ext_add_length);
+
+static void update_alloc_size(struct nf_ct_ext_type *type) /* recompute alloc_size after 'type' was (un)registered; expects nf_ct_ext_type_mutex held */
+{
+ int i, j;
+ struct nf_ct_ext_type *t1, *t2;
+ enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1;
+
+ /* unnecessary to update all types */
+ if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) {
+ min = type->id; /* a non-PREALLOC type adds nothing to other types' sizes */
+ max = type->id;
+ }
+
+ /* This assumes that extended areas in conntrack for the types
+ whose NF_CT_EXT_F_PREALLOC bit is set are allocated in order */
+ for (i = min; i <= max; i++) {
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
+ if (!t1)
+ continue; /* empty slot */
+
+ t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+ t1->len; /* base: header plus t1's own data */
+ for (j = 0; j < NF_CT_EXT_NUM; j++) {
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
+ if (t2 == NULL || t2 == t1 ||
+ (t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
+ continue; /* only other registered PREALLOC types add room */
+
+ t1->alloc_size = ALIGN(t1->alloc_size, t2->align)
+ + t2->len;
+ }
+ }
+}
+
+/* Register 'type' in its id slot; returns 0, or -EBUSY if the slot is taken. This MUST be called in process context. */
+int nf_ct_extend_register(struct nf_ct_ext_type *type)
+{
+ int ret = 0;
+
+ mutex_lock(&nf_ct_ext_type_mutex);
+ if (nf_ct_ext_types[type->id]) {
+ ret = -EBUSY; /* another type already occupies this id */
+ goto out;
+ }
+
+ /* This ensures that nf_ct_ext_create() can allocate enough area
+ before updating alloc_size */
+ type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align)
+ + type->len;
+ rcu_assign_pointer(nf_ct_ext_types[type->id], type); /* publish the type to RCU readers */
+ update_alloc_size(type);
+out:
+ mutex_unlock(&nf_ct_ext_type_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+
+/* Clear the id slot of 'type' and recompute sizes. This MUST be called in process context. */
+void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
+{
+ mutex_lock(&nf_ct_ext_type_mutex);
+ RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
+ update_alloc_size(type); /* the slot is already NULL at this point */
+ mutex_unlock(&nf_ct_ext_type_mutex);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 6f210f39976..b8a0924064e 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -3,35 +3,35 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - enable working with Layer 3 protocol independent connection tracking.
- * - track EPRT and EPSV commands with IPv6 address.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
#include <linux/ip.h>
+#include <linux/slab.h>
#include <linux/ipv6.h>
#include <linux/ctype.h>
+#include <linux/inet.h>
#include <net/checksum.h>
#include <net/tcp.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <linux/netfilter/nf_conntrack_ftp.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
+MODULE_ALIAS("ip_conntrack_ftp");
+MODULE_ALIAS_NFCT_HELPER("ftp");
/* This is slow, but it's simple. --RR */
static char *ftp_buffer;
@@ -43,163 +43,83 @@ static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c;
module_param_array(ports, ushort, &ports_c, 0400);
-static int loose;
+static bool loose;
module_param(loose, bool, 0600);
-unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb,
+unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
- enum ip_ct_ftp_type type,
+ enum nf_ct_ftp_type type,
+ unsigned int protoff,
unsigned int matchoff,
unsigned int matchlen,
- struct nf_conntrack_expect *exp,
- u32 *seq);
+ struct nf_conntrack_expect *exp);
EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
-static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_rfc959(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_eprt(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
- char);
+ char, unsigned int *);
static struct ftp_search {
- enum ip_conntrack_dir dir;
const char *pattern;
size_t plen;
char skip;
char term;
- enum ip_ct_ftp_type ftptype;
- int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
-} search[] = {
- {
- IP_CT_DIR_ORIGINAL,
- "PORT", sizeof("PORT") - 1, ' ', '\r',
- IP_CT_FTP_PORT,
- try_rfc959,
- },
- {
- IP_CT_DIR_REPLY,
- "227 ", sizeof("227 ") - 1, '(', ')',
- IP_CT_FTP_PASV,
- try_rfc959,
- },
- {
- IP_CT_DIR_ORIGINAL,
- "EPRT", sizeof("EPRT") - 1, ' ', '\r',
- IP_CT_FTP_EPRT,
- try_eprt,
+ enum nf_ct_ftp_type ftptype;
+ int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *);
+} search[IP_CT_DIR_MAX][2] = {
+ [IP_CT_DIR_ORIGINAL] = {
+ {
+ .pattern = "PORT",
+ .plen = sizeof("PORT") - 1,
+ .skip = ' ',
+ .term = '\r',
+ .ftptype = NF_CT_FTP_PORT,
+ .getnum = try_rfc959,
+ },
+ {
+ .pattern = "EPRT",
+ .plen = sizeof("EPRT") - 1,
+ .skip = ' ',
+ .term = '\r',
+ .ftptype = NF_CT_FTP_EPRT,
+ .getnum = try_eprt,
+ },
},
- {
- IP_CT_DIR_REPLY,
- "229 ", sizeof("229 ") - 1, '(', ')',
- IP_CT_FTP_EPSV,
- try_epsv_response,
+ [IP_CT_DIR_REPLY] = {
+ {
+ .pattern = "227 ",
+ .plen = sizeof("227 ") - 1,
+ .ftptype = NF_CT_FTP_PASV,
+ .getnum = try_rfc1123,
+ },
+ {
+ .pattern = "229 ",
+ .plen = sizeof("229 ") - 1,
+ .skip = '(',
+ .term = ')',
+ .ftptype = NF_CT_FTP_EPSV,
+ .getnum = try_epsv_response,
+ },
},
};
-/* This code is based on inet_pton() in glibc-2.2.4 */
static int
get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term)
{
- static const char xdigits[] = "0123456789abcdef";
- u_int8_t tmp[16], *tp, *endp, *colonp;
- int ch, saw_xdigit;
- u_int32_t val;
- size_t clen = 0;
-
- tp = memset(tmp, '\0', sizeof(tmp));
- endp = tp + sizeof(tmp);
- colonp = NULL;
-
- /* Leading :: requires some special handling. */
- if (*src == ':'){
- if (*++src != ':') {
- DEBUGP("invalid \":\" at the head of addr\n");
- return 0;
- }
- clen++;
- }
-
- saw_xdigit = 0;
- val = 0;
- while ((clen < dlen) && (*src != term)) {
- const char *pch;
-
- ch = tolower(*src++);
- clen++;
-
- pch = strchr(xdigits, ch);
- if (pch != NULL) {
- val <<= 4;
- val |= (pch - xdigits);
- if (val > 0xffff)
- return 0;
-
- saw_xdigit = 1;
- continue;
- }
- if (ch != ':') {
- DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch);
- return 0;
- }
-
- if (!saw_xdigit) {
- if (colonp) {
- DEBUGP("invalid location of \"::\".\n");
- return 0;
- }
- colonp = tp;
- continue;
- } else if (*src == term) {
- DEBUGP("trancated IPv6 addr\n");
- return 0;
- }
-
- if (tp + 2 > endp)
- return 0;
- *tp++ = (u_int8_t) (val >> 8) & 0xff;
- *tp++ = (u_int8_t) val & 0xff;
-
- saw_xdigit = 0;
- val = 0;
- continue;
- }
- if (saw_xdigit) {
- if (tp + 2 > endp)
- return 0;
- *tp++ = (u_int8_t) (val >> 8) & 0xff;
- *tp++ = (u_int8_t) val & 0xff;
- }
- if (colonp != NULL) {
- /*
- * Since some memmove()'s erroneously fail to handle
- * overlapping regions, we'll do the shift by hand.
- */
- const int n = tp - colonp;
- int i;
-
- if (tp == endp)
- return 0;
-
- for (i = 1; i <= n; i++) {
- endp[- i] = colonp[n - i];
- colonp[n - i] = 0;
- }
- tp = endp;
- }
- if (tp != endp || (*src != term))
- return 0;
-
- memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr));
- return clen;
+ const char *end;
+ int ret = in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term, &end);
+ if (ret > 0)
+ return (int)(end - src);
+ return 0;
}
static int try_number(const char *data, size_t dlen, u_int32_t array[],
- int array_size, char sep, char term)
+ int array_size, char sep, char term)
{
u_int32_t i, len;
@@ -214,23 +134,25 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],
i++;
else {
/* Unexpected character; true if it's the
- terminator and we're finished. */
- if (*data == term && i == array_size - 1)
+ terminator (or we don't care about one)
+ and we're finished. */
+ if ((*data == term || !term) && i == array_size - 1)
return len;
- DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
- len, i, *data);
+ pr_debug("Char %u (got %u nums) `%u' unexpected\n",
+ len, i, *data);
return 0;
}
}
- DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
-
+ pr_debug("Failed to fill %u numbers separated by %c\n",
+ array_size, sep);
return 0;
}
/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
static int try_rfc959(const char *data, size_t dlen,
- struct nf_conntrack_man *cmd, char term)
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
{
int length;
u_int32_t array[6];
@@ -245,9 +167,36 @@ static int try_rfc959(const char *data, size_t dlen,
return length;
}
+/*
+ * From RFC 1123:
+ * The format of the 227 reply to a PASV command is not
+ * well standardized. In particular, an FTP client cannot
+ * assume that the parentheses shown on page 40 of RFC-959
+ * will be present (and in fact, Figure 3 on page 43 omits
+ * them). Therefore, a User-FTP program that interprets
+ * the PASV reply must scan the reply for the first digit
+ * of the host and port numbers.
+ */
+static int try_rfc1123(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset) /* skip to the first digit, then parse via try_rfc959(); returns its result, or 0 if no digit */
+{
+ int i;
+ for (i = 0; i < dlen; i++)
+ if (isdigit(data[i]))
+ break; /* i now indexes the first digit */
+
+ if (i == dlen)
+ return 0; /* no digit anywhere in the data */
+
+ *offset += i; /* tell the caller how many leading bytes were skipped */
+
+ return try_rfc959(data + i, dlen - i, cmd, 0, offset); /* passes 0 rather than the incoming 'term' */
+}
+
/* Grab port: number up to delimiter */
static int get_port(const char *data, int start, size_t dlen, char delim,
- u_int16_t *port)
+ __be16 *port)
{
u_int16_t tmp_port = 0;
int i;
@@ -258,13 +207,13 @@ static int get_port(const char *data, int start, size_t dlen, char delim,
if (tmp_port == 0)
break;
*port = htons(tmp_port);
- DEBUGP("get_port: return %d\n", tmp_port);
+ pr_debug("get_port: return %d\n", tmp_port);
return i + 1;
}
else if (data[i] >= '0' && data[i] <= '9')
tmp_port = tmp_port*10 + data[i] - '0';
else { /* Some other crap */
- DEBUGP("get_port: invalid char.\n");
+ pr_debug("get_port: invalid char.\n");
break;
}
}
@@ -273,7 +222,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim,
/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
- char term)
+ char term, unsigned int *offset)
{
char delim;
int length;
@@ -281,22 +230,22 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
/* First character is delimiter, then "1" for IPv4 or "2" for IPv6,
then delimiter again. */
if (dlen <= 3) {
- DEBUGP("EPRT: too short\n");
+ pr_debug("EPRT: too short\n");
return 0;
}
delim = data[0];
if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
- DEBUGP("try_eprt: invalid delimitter.\n");
+ pr_debug("try_eprt: invalid delimitter.\n");
return 0;
}
if ((cmd->l3num == PF_INET && data[1] != '1') ||
(cmd->l3num == PF_INET6 && data[1] != '2')) {
- DEBUGP("EPRT: invalid protocol number.\n");
+ pr_debug("EPRT: invalid protocol number.\n");
return 0;
}
- DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim);
+ pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim);
if (data[1] == '1') {
u_int32_t array[4];
@@ -314,22 +263,23 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
if (length == 0)
return 0;
- DEBUGP("EPRT: Got IP address!\n");
+ pr_debug("EPRT: Got IP address!\n");
/* Start offset includes initial "|1|", and trailing delimiter */
return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port);
}
/* Returns 0, or length of numbers: |||6446| */
static int try_epsv_response(const char *data, size_t dlen,
- struct nf_conntrack_man *cmd, char term)
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
{
char delim;
/* Three delimiters. */
if (dlen <= 3) return 0;
delim = data[0];
- if (isdigit(delim) || delim < 33 || delim > 126
- || data[1] != delim || data[2] != delim)
+ if (isdigit(delim) || delim < 33 || delim > 126 ||
+ data[1] != delim || data[2] != delim)
return 0;
return get_port(data, 3, dlen, delim, &cmd->u.tcp.port);
@@ -343,11 +293,12 @@ static int find_pattern(const char *data, size_t dlen,
unsigned int *numlen,
struct nf_conntrack_man *cmd,
int (*getnum)(const char *, size_t,
- struct nf_conntrack_man *, char))
+ struct nf_conntrack_man *, char,
+ unsigned int *))
{
- size_t i;
+ size_t i = plen;
- DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
+ pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
if (dlen == 0)
return 0;
@@ -362,38 +313,40 @@ static int find_pattern(const char *data, size_t dlen,
#if 0
size_t i;
- DEBUGP("ftp: string mismatch\n");
+ pr_debug("ftp: string mismatch\n");
for (i = 0; i < plen; i++) {
- DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
- i, data[i], data[i],
- pattern[i], pattern[i]);
+ pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+ i, data[i], data[i],
+ pattern[i], pattern[i]);
}
#endif
return 0;
}
- DEBUGP("Pattern matches!\n");
+ pr_debug("Pattern matches!\n");
/* Now we've found the constant string, try to skip
to the 'skip' character */
- for (i = plen; data[i] != skip; i++)
- if (i == dlen - 1) return -1;
+ if (skip) {
+ for (i = plen; data[i] != skip; i++)
+ if (i == dlen - 1) return -1;
- /* Skip over the last character */
- i++;
+ /* Skip over the last character */
+ i++;
+ }
- DEBUGP("Skipped up to `%c'!\n", skip);
+ pr_debug("Skipped up to `%c'!\n", skip);
*numoff = i;
- *numlen = getnum(data + i, dlen - i, cmd, term);
+ *numlen = getnum(data + i, dlen - i, cmd, term, numoff);
if (!*numlen)
return -1;
- DEBUGP("Match succeeded!\n");
+ pr_debug("Match succeeded!\n");
return 1;
}
/* Look up to see if we're just after a \n. */
-static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
+static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir)
{
unsigned int i;
@@ -404,71 +357,74 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
}
/* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+static void update_nl_seq(struct nf_conn *ct, u32 nl_seq,
+ struct nf_ct_ftp_master *info, int dir,
struct sk_buff *skb)
{
- unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
+ unsigned int i, oldest;
/* Look for oldest: if we find exact match, we're done. */
for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
if (info->seq_aft_nl[dir][i] == nl_seq)
return;
-
- if (oldest == info->seq_aft_nl_num[dir]
- || before(info->seq_aft_nl[dir][i], oldest))
- oldest = i;
}
if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
- nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
- } else if (oldest != NUM_SEQ_TO_REMEMBER) {
- info->seq_aft_nl[dir][oldest] = nl_seq;
- nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ } else {
+ if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1]))
+ oldest = 0;
+ else
+ oldest = 1;
+
+ if (after(nl_seq, info->seq_aft_nl[dir][oldest]))
+ info->seq_aft_nl[dir][oldest] = nl_seq;
}
}
-static int help(struct sk_buff **pskb,
+static int help(struct sk_buff *skb,
unsigned int protoff,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
unsigned int dataoff, datalen;
- struct tcphdr _tcph, *th;
- char *fb_ptr;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ const char *fb_ptr;
int ret;
u32 seq;
int dir = CTINFO2DIR(ctinfo);
- unsigned int matchlen, matchoff;
- struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info;
+ unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+ struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
+ union nf_inet_addr *daddr;
struct nf_conntrack_man cmd = {};
-
unsigned int i;
int found = 0, ends_in_nl;
+ typeof(nf_nat_ftp_hook) nf_nat_ftp;
/* Until there's been traffic both ways, don't look in packets. */
- if (ctinfo != IP_CT_ESTABLISHED
- && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
- DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
+ if (ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY) {
+ pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
return NF_ACCEPT;
}
- th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph);
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL)
return NF_ACCEPT;
dataoff = protoff + th->doff * 4;
/* No data? */
- if (dataoff >= (*pskb)->len) {
- DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
- (*pskb)->len);
+ if (dataoff >= skb->len) {
+ pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
+ skb->len);
return NF_ACCEPT;
}
- datalen = (*pskb)->len - dataoff;
+ datalen = skb->len - dataoff;
spin_lock_bh(&nf_ftp_lock);
- fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer);
+ fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
BUG_ON(fb_ptr == NULL);
ends_in_nl = (fb_ptr[datalen - 1] == '\n');
@@ -476,33 +432,38 @@ static int help(struct sk_buff **pskb,
/* Look up to see if we're just after a \n. */
if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+ /* We're picking up this, clear flags and let it continue */
+ if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) {
+ ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP;
+ goto skip_nl_seq;
+ }
+
/* Now if this ends in \n, update ftp info. */
- DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
- ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
- ct_ftp_info->seq_aft_nl[dir][0],
- ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
- ct_ftp_info->seq_aft_nl[dir][1]);
+ pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
+ ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
+ ct_ftp_info->seq_aft_nl[dir][0],
+ ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
+ ct_ftp_info->seq_aft_nl[dir][1]);
ret = NF_ACCEPT;
goto out_update_nl;
}
- /* Initialize IP/IPv6 addr to expected address (it's not mentioned
- in EPSV responses) */
- cmd.l3num = ct->tuplehash[dir].tuple.src.l3num;
+skip_nl_seq:
+ /* Initialize IP/IPv6 addr to expected address (it's not mentioned
+ in EPSV responses) */
+ cmd.l3num = nf_ct_l3num(ct);
memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
sizeof(cmd.u3.all));
- for (i = 0; i < ARRAY_SIZE(search); i++) {
- if (search[i].dir != dir) continue;
-
+ for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
found = find_pattern(fb_ptr, datalen,
- search[i].pattern,
- search[i].plen,
- search[i].skip,
- search[i].term,
+ search[dir][i].pattern,
+ search[dir][i].plen,
+ search[dir][i].skip,
+ search[dir][i].term,
&matchoff, &matchlen,
&cmd,
- search[i].getnum);
+ search[dir][i].getnum);
if (found) break;
}
if (found == -1) {
@@ -510,10 +471,8 @@ static int help(struct sk_buff **pskb,
connection tracking, not packet filtering.
However, it is necessary for accurate tracking in
this case. */
- if (net_ratelimit())
- printk("conntrack_ftp: partial %s %u+%u\n",
- search[i].pattern,
- ntohl(th->seq), datalen);
+ nf_ct_helper_log(skb, ct, "partial matching of `%s'",
+ search[dir][i].pattern);
ret = NF_DROP;
goto out;
} else if (found == 0) { /* No match */
@@ -521,12 +480,13 @@ static int help(struct sk_buff **pskb,
goto out_update_nl;
}
- DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
- (int)matchlen, fb_ptr + matchoff,
- matchlen, ntohl(th->seq) + matchoff);
+ pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
+ matchlen, fb_ptr + matchoff,
+ matchlen, ntohl(th->seq) + matchoff);
- exp = nf_conntrack_expect_alloc(ct);
+ exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
ret = NF_DROP;
goto out;
}
@@ -534,25 +494,24 @@ static int help(struct sk_buff **pskb,
/* We refer to the reverse direction ("!dir") tuples here,
* because we're expecting something in the other direction.
* Doesn't matter unless NAT is happening. */
- exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3;
+ daddr = &ct->tuplehash[!dir].tuple.dst.u3;
/* Update the ftp info */
- if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) &&
+ if ((cmd.l3num == nf_ct_l3num(ct)) &&
memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
sizeof(cmd.u3.all))) {
/* Enrico Scholz's passive FTP to partially RNAT'd ftp
- server: it really wants us to connect to a
- different IP address. Simply don't record it for
- NAT. */
+ server: it really wants us to connect to a
+ different IP address. Simply don't record it for
+ NAT. */
if (cmd.l3num == PF_INET) {
- DEBUGP("conntrack_ftp: NOT RECORDING: " NIPQUAD_FMT " != " NIPQUAD_FMT "\n",
- NIPQUAD(cmd.u3.ip),
- NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
+ pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n",
+ &cmd.u3.ip,
+ &ct->tuplehash[dir].tuple.src.u3.ip);
} else {
- DEBUGP("conntrack_ftp: NOT RECORDING: " NIP6_FMT " != " NIP6_FMT "\n",
- NIP6(*((struct in6_addr *)cmd.u3.ip6)),
- NIP6(*((struct in6_addr *)ct->tuplehash[dir]
- .tuple.src.u3.ip6)));
+ pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n",
+ cmd.u3.ip6,
+ ct->tuplehash[dir].tuple.src.u3.ip6);
}
/* Thanks to Cristiano Lincoln Mattos
@@ -563,68 +522,63 @@ static int help(struct sk_buff **pskb,
ret = NF_ACCEPT;
goto out_put_expect;
}
- memcpy(&exp->tuple.dst.u3, &cmd.u3.all,
- sizeof(exp->tuple.dst.u3));
- }
-
- exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3;
- exp->tuple.src.l3num = cmd.l3num;
- exp->tuple.src.u.tcp.port = 0;
- exp->tuple.dst.u.tcp.port = cmd.u.tcp.port;
- exp->tuple.dst.protonum = IPPROTO_TCP;
-
- exp->mask = (struct nf_conntrack_tuple)
- { .src = { .l3num = 0xFFFF,
- .u = { .tcp = { 0 }},
- },
- .dst = { .protonum = 0xFF,
- .u = { .tcp = { 0xFFFF }},
- },
- };
- if (cmd.l3num == PF_INET) {
- exp->mask.src.u3.ip = 0xFFFFFFFF;
- exp->mask.dst.u3.ip = 0xFFFFFFFF;
- } else {
- memset(exp->mask.src.u3.ip6, 0xFF,
- sizeof(exp->mask.src.u3.ip6));
- memset(exp->mask.dst.u3.ip6, 0xFF,
- sizeof(exp->mask.src.u3.ip6));
+ daddr = &cmd.u3;
}
- exp->expectfn = NULL;
- exp->flags = 0;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
+ &ct->tuplehash[!dir].tuple.src.u3, daddr,
+ IPPROTO_TCP, NULL, &cmd.u.tcp.port);
/* Now, NAT might want to mangle the packet, and register the
* (possibly changed) expectation itself. */
- if (nf_nat_ftp_hook)
- ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
- matchoff, matchlen, exp, &seq);
+ nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
+ if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
+ protoff, matchoff, matchlen, exp);
else {
/* Can't expect this? Best to drop packet now. */
- if (nf_conntrack_expect_related(exp) != 0)
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
- else
+ } else
ret = NF_ACCEPT;
}
out_put_expect:
- nf_conntrack_expect_put(exp);
+ nf_ct_expect_put(exp);
out_update_nl:
/* Now if this ends in \n, update ftp info. Seq may have been
* adjusted by NAT code. */
if (ends_in_nl)
- update_nl_seq(seq, ct_ftp_info, dir, *pskb);
+ update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
out:
spin_unlock_bh(&nf_ftp_lock);
return ret;
}
-static struct nf_conntrack_helper ftp[MAX_PORTS][2];
-static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")];
+static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) /* flag both directions for sequence pickup; 'attr' is not read; always returns 0 */
+{
+ struct nf_ct_ftp_master *ftp = nfct_help_data(ct);
+
+ /* This conntrack has been injected from user-space, so always pick up
+ * sequence tracking. Otherwise, the first FTP command after the
+ * failover breaks.
+ */
+ ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP;
+ ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP;
+ return 0;
+}
+
+static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
+
+static const struct nf_conntrack_expect_policy ftp_exp_policy = { /* one policy object shared by every ftp helper entry */
+ .max_expected = 1, /* same value the removed per-helper max_expected assignment used */
+ .timeout = 5 * 60, /* same value the removed per-helper timeout used, commented there as "5 Minutes" */
+};
/* don't make this __exit, since it's called from __init ! */
-static void fini(void)
+static void nf_conntrack_ftp_fini(void)
{
int i, j;
for (i = 0; i < ports_c; i++) {
@@ -632,9 +586,9 @@ static void fini(void)
if (ftp[i][j].me == NULL)
continue;
- DEBUGP("nf_ct_ftp: unregistering helper for pf: %d "
- "port: %d\n",
- ftp[i][j].tuple.src.l3num, ports[i]);
+ pr_debug("nf_ct_ftp: unregistering helper for pf: %d "
+ "port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_helper_unregister(&ftp[i][j]);
}
}
@@ -642,10 +596,9 @@ static void fini(void)
kfree(ftp_buffer);
}
-static int __init init(void)
+static int __init nf_conntrack_ftp_init(void)
{
int i, j = -1, ret = 0;
- char *tmpname;
ftp_buffer = kmalloc(65536, GFP_KERNEL);
if (!ftp_buffer)
@@ -660,30 +613,27 @@ static int __init init(void)
ftp[i][0].tuple.src.l3num = PF_INET;
ftp[i][1].tuple.src.l3num = PF_INET6;
for (j = 0; j < 2; j++) {
+ ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
- ftp[i][j].mask.src.u.tcp.port = 0xFFFF;
- ftp[i][j].mask.dst.protonum = 0xFF;
- ftp[i][j].max_expected = 1;
- ftp[i][j].timeout = 5 * 60; /* 5 Minutes */
+ ftp[i][j].expect_policy = &ftp_exp_policy;
ftp[i][j].me = THIS_MODULE;
ftp[i][j].help = help;
- tmpname = &ftp_names[i][j][0];
+ ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;
if (ports[i] == FTP_PORT)
- sprintf(tmpname, "ftp");
+ sprintf(ftp[i][j].name, "ftp");
else
- sprintf(tmpname, "ftp-%d", ports[i]);
- ftp[i][j].name = tmpname;
+ sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
- DEBUGP("nf_ct_ftp: registering helper for pf: %d "
- "port: %d\n",
- ftp[i][j].tuple.src.l3num, ports[i]);
+ pr_debug("nf_ct_ftp: registering helper for pf: %d "
+ "port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
ret = nf_conntrack_helper_register(&ftp[i][j]);
if (ret) {
- printk("nf_ct_ftp: failed to register helper "
- " for pf: %d port: %d\n",
+ printk(KERN_ERR "nf_ct_ftp: failed to register"
+ " helper for pf: %d port: %d\n",
ftp[i][j].tuple.src.l3num, ports[i]);
- fini();
+ nf_conntrack_ftp_fini();
return ret;
}
}
@@ -692,5 +642,5 @@ static int __init init(void)
return 0;
}
-module_init(init);
-module_exit(fini);
+module_init(nf_conntrack_ftp_init);
+module_exit(nf_conntrack_ftp_fini);
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
new file mode 100644
index 00000000000..bcd5ed6b713
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -0,0 +1,888 @@
+/****************************************************************************
+ * nf_conntrack_h323_asn1.c - BER and PER decoding library for H.323
+ * conntrack/NAT module.
+ *
+ * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * See linux/netfilter/nf_conntrack_h323_asn1.h for details.
+ *
+ ****************************************************************************/
+
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#else
+#include <stdio.h>
+#endif
+#include <linux/netfilter/nf_conntrack_h323_asn1.h>
+
+/* Trace Flag
+ *
+ * H323_TRACE defaults to 0 unless it was defined before this point.  When it
+ * is non-zero, PRINT maps to printk (kernel) or printf (userspace), IFTHEN
+ * becomes a one-line conditional and FNAME(name) emits the name followed by a
+ * comma, i.e. the leading initializer of a field_t.  When it is zero all three
+ * macros expand to nothing, so their arguments (including TAB_SIZE and
+ * f->name, which only exist when tracing) are dropped by the preprocessor.
+ */
+#ifndef H323_TRACE
+#define H323_TRACE 0
+#endif
+
+#if H323_TRACE
+#define TAB_SIZE 4
+#define IFTHEN(cond, act) if(cond){act;}
+#ifdef __KERNEL__
+#define PRINT printk
+#else
+#define PRINT printf
+#endif
+#define FNAME(name) name,
+#else
+#define IFTHEN(cond, act)
+#define PRINT(fmt, args...)
+#define FNAME(name)
+#endif
+
+/* ASN.1 Types
+ *
+ * Values stored in field_t.type.  The decoders use them directly as the index
+ * into Decoders[], so the numbering must match the order of that table.
+ * Several names share one value because they are handled by the same decoder.
+ */
+#define NUL 0
+#define BOOL 1
+#define OID 2
+#define INT 3
+#define ENUM 4
+#define BITSTR 5
+#define NUMSTR 6
+#define NUMDGT 6
+#define TBCDSTR 6
+#define OCTSTR 7
+#define PRTSTR 7
+#define IA5STR 7
+#define GENSTR 7
+#define BMPSTR 8
+#define SEQ 9
+#define SET 9
+#define SEQOF 10
+#define SETOF 10
+#define CHOICE 11
+
+/* Constraint Types
+ *
+ * Named values of field_t.sz that the scalar and string decoders switch on.
+ * Any value not handled by a decoder's switch falls into its default branch,
+ * where sz is used as a number of bits to read or skip (the commented-out
+ * BITS range below).
+ */
+#define FIXD 0
+/* #define BITS 1-8 */
+#define BYTE 9
+#define WORD 10
+#define CONS 11
+#define SEMI 12
+#define UNCO 13
+
+/* ASN.1 Type Attributes
+ *
+ * Bit flags combined in field_t.attr and tested with '&' by the decoders.
+ * SKIP is 0, i.e. none of the other flags is set.
+ */
+#define SKIP 0
+#define STOP 1 /* decode_seq/decode_choice return H323_ERROR_STOP on reaching it */
+#define DECODE 2 /* results are stored into the caller's structure */
+#define EXT 4 /* an extension bit is read before the field */
+#define OPEN 8 /* the field is preceded by a length read with get_len() */
+#define OPT 16 /* presence is signalled by a bit in the sequence bitmap */
+
+
+/* ASN.1 Field Structure
+ *
+ * One node of the static description tables that drive the decoders.  The
+ * meaning of sz, lb and ub depends on the field type; the notes below list
+ * the uses made by the decoders in this file.
+ */
+typedef struct field_t {
+#if H323_TRACE
+ char *name; /* only present when tracing; filled in through FNAME() */
+#endif
+ unsigned char type; /* ASN.1 type code, index into Decoders[] */
+ unsigned char sz; /* constraint type, or a bit count (bitmap/index/length width) */
+ unsigned char lb; /* added to decoded lengths/counts; root component count for SEQ/CHOICE */
+ unsigned char ub; /* number of components/choices/items the tables know about */
+ unsigned short attr; /* OR of the attribute flags (STOP, DECODE, EXT, OPEN, OPT) */
+ unsigned short offset; /* byte offset added to the output base pointer */
+ const struct field_t *fields; /* child descriptors for SEQ, SEQOF and CHOICE */
+} field_t;
+
+/* Bit Stream
+ *
+ * Read cursor over the encoded message.  The position is the pair (cur, bit).
+ */
+typedef struct {
+ unsigned char *buf; /* start of the whole buffer; stored offsets are cur - buf */
+ unsigned char *beg; /* start of the region being decoded (set by the Decode* entry points) */
+ unsigned char *end; /* first byte past the region; limit used by CHECK_BOUND */
+ unsigned char *cur; /* byte currently being read */
+ unsigned int bit; /* bit offset inside *cur, 0 = most significant bit */
+} bitstr_t;
+
+/* Tool Functions */
+#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;}
+#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;}
+#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;}
+#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND)
+static unsigned int get_len(bitstr_t *bs);
+static unsigned int get_bit(bitstr_t *bs);
+static unsigned int get_bits(bitstr_t *bs, unsigned int b);
+static unsigned int get_bitmap(bitstr_t *bs, unsigned int b);
+static unsigned int get_uint(bitstr_t *bs, int b);
+
+/* Decoder Functions
+ *
+ * One decoder per ASN.1 type code.  All share the same signature:
+ *   bs    - bit stream cursor, advanced past the field
+ *   f     - descriptor of the field being decoded
+ *   base  - output structure, or NULL when nothing is to be stored
+ *   level - nesting depth, used only for trace indentation
+ * The decoders defined in this file return H323_ERROR_NONE on success or
+ * another H323_ERROR_* code (BOUND, RANGE, STOP).
+ */
+static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level);
+
+/* Decoder Functions Vector
+ *
+ * Dispatch table indexed by field_t.type; entry N handles the ASN.1 type
+ * whose code is N (NUL = 0 ... CHOICE = 11), so the order here must stay in
+ * step with the type code definitions.
+ */
+typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int);
+static const decoder_t Decoders[] = {
+ decode_nul,
+ decode_bool,
+ decode_oid,
+ decode_int,
+ decode_enum,
+ decode_bitstr,
+ decode_numstr,
+ decode_octstr,
+ decode_bmpstr,
+ decode_seq,
+ decode_seqof,
+ decode_choice,
+};
+
+/****************************************************************************
+ * H.323 Types
+ ****************************************************************************/
+#include "nf_conntrack_h323_types.c"
+
+/****************************************************************************
+ * Functions
+ ****************************************************************************/
+/*
+ * get_len - read a one- or two-byte length.
+ *
+ * Assumes bs is byte aligned and the encoded value is below 16384.  A first
+ * byte with its top bit clear is the length itself; otherwise its low six
+ * bits form the high part and the following byte the low part.  No bounds
+ * check is made here.
+ */
+static unsigned int get_len(bitstr_t *bs)
+{
+	unsigned int first = *bs->cur++;
+
+	if (!(first & 0x80))
+		return first;
+
+	return ((first & 0x3f) << 8) + *bs->cur++;
+}
+
+/****************************************************************************
+ * get_bit - read the bit at the cursor and advance by one bit.
+ *
+ * The result is the masked byte value (non-zero when the bit is set), not a
+ * normalised 0/1.  No bounds check is made here.
+ ****************************************************************************/
+static unsigned int get_bit(bitstr_t *bs)
+{
+	unsigned int mask = 0x80 >> bs->bit;
+	unsigned int val = *bs->cur & mask;
+
+	INC_BIT(bs);
+
+	return val;
+}
+
+/****************************************************************************
+ * get_bits - read the next b bits and return them right-justified.
+ *
+ * The bits may straddle at most one byte boundary, hence the limit on b
+ * stated below.  No bounds check is made here.
+ ****************************************************************************/
+/* Assume b <= 8 */
+static unsigned int get_bits(bitstr_t *bs, unsigned int b)
+{
+ unsigned int v, l;
+
+ v = (*bs->cur) & (0xffU >> bs->bit); /* unread bits of the current byte */
+ l = b + bs->bit; /* bit position just past the requested bits */
+
+ if (l < 8) {
+ v >>= 8 - l; /* drop the bits that stay unread */
+ bs->bit = l;
+ } else if (l == 8) {
+ bs->cur++;
+ bs->bit = 0;
+ } else { /* l > 8 */
+
+ v <<= 8;
+ v += *(++bs->cur); /* append the next byte, then trim to b bits */
+ v >>= 16 - l;
+ bs->bit = l - 8;
+ }
+
+ return v;
+}
+
+/****************************************************************************
+ * get_bitmap - read the next b bits and return them left-justified in a
+ * 32-bit value: the first bit read ends up in bit 31 and everything below
+ * the b-th bit is cleared by the final mask.  Returns 0 without touching the
+ * stream when b is 0.  No bounds check is made here.
+ *
+ * NOTE(review): when bs->bit + b is a multiple of 8 between 16 and 24, the
+ * "l < 32" branch still reads *bs->cur after the loop, i.e. one byte beyond
+ * the bits requested; the value is masked off afterwards - confirm callers
+ * always have that byte available.
+ ****************************************************************************/
+/* Assume b <= 32 */
+static unsigned int get_bitmap(bitstr_t *bs, unsigned int b)
+{
+ unsigned int v, l, shift, bytes;
+
+ if (!b)
+ return 0;
+
+ l = bs->bit + b; /* bit position just past the requested bits */
+
+ if (l < 8) {
+ v = (unsigned int)(*bs->cur) << (bs->bit + 24);
+ bs->bit = l;
+ } else if (l == 8) {
+ v = (unsigned int)(*bs->cur++) << (bs->bit + 24);
+ bs->bit = 0;
+ } else {
+ for (bytes = l >> 3, shift = 24, v = 0; bytes;
+ bytes--, shift -= 8)
+ v |= (unsigned int)(*bs->cur++) << shift; /* whole bytes, most significant first */
+
+ if (l < 32) {
+ v |= (unsigned int)(*bs->cur) << shift;
+ v <<= bs->bit;
+ } else if (l > 32) {
+ v <<= bs->bit;
+ v |= (*bs->cur) >> (8 - bs->bit); /* top bits of a fifth byte */
+ }
+
+ bs->bit = l & 0x7;
+ }
+
+ v &= 0xffffffff << (32 - b); /* keep only the top b bits */
+
+ return v;
+}
+
+/****************************************************************************
+ * get_uint - read a b-byte big-endian unsigned integer and advance the
+ * cursor by b bytes.
+ *
+ * Assume bs is aligned and sizeof(unsigned int) == 4.  Values of b outside
+ * 1..4 read nothing and yield 0.  No bounds check is made here.
+ ****************************************************************************/
+static unsigned int get_uint(bitstr_t *bs, int b)
+{
+	unsigned int v = 0;
+
+	if (b < 1 || b > 4)
+		return v;
+
+	while (b-- > 0)
+		v = (v << 8) | *bs->cur++;
+
+	return v;
+}
+
+/****************************************************************************
+ * decode_nul - decoder for NUL fields: consumes no input and stores
+ * nothing; it only emits the field name when tracing is enabled.
+ ****************************************************************************/
+static int decode_nul(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_bool - skip the single bit of a BOOLEAN; the value is not stored.
+ * Returns H323_ERROR_BOUND if the cursor ends up beyond bs->end.
+ ****************************************************************************/
+static int decode_bool(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ INC_BIT(bs);
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_oid - skip an OBJECT IDENTIFIER: after byte alignment, one length
+ * byte followed by that many content bytes.  Nothing is stored.
+ * Returns H323_ERROR_BOUND if the length byte or the contents do not fit.
+ ****************************************************************************/
+static int decode_oid(bitstr_t *bs, const struct field_t *f,
+		      char *base, int level)
+{
+	unsigned int body_len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	BYTE_ALIGN(bs);
+	CHECK_BOUND(bs, 1);
+
+	/* Step over the length byte and the body in one move. */
+	body_len = bs->cur[0];
+	bs->cur += 1 + body_len;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_int - skip (or, for CONS fields flagged DECODE, read) an INTEGER.
+ *
+ * f->sz selects the encoding:
+ *   BYTE  - one aligned byte
+ *   WORD  - two aligned bytes
+ *   CONS  - 2-bit length prefix (1..4), then that many aligned bytes; when
+ *           base is set and the field has DECODE, the value plus f->lb is
+ *           stored as an unsigned int at base + f->offset
+ *   UNCO  - get_len() length followed by that many bytes
+ *   other - f->sz bits
+ *
+ * Returns H323_ERROR_NONE, or H323_ERROR_BOUND when the field does not fit
+ * into the buffer.
+ *
+ * In the CONS case the length prefix and the value bytes are bounds-checked
+ * before they are read, and the cursor is advanced exactly once: get_uint()
+ * already consumes the value bytes, so they are only skipped explicitly when
+ * the value is not decoded.
+ ****************************************************************************/
+static int decode_int(bitstr_t *bs, const struct field_t *f,
+		      char *base, int level)
+{
+	unsigned int len;
+
+	PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		bs->cur++;
+		break;
+	case WORD:		/* 257 <= Range <= 64K */
+		BYTE_ALIGN(bs);
+		bs->cur += 2;
+		break;
+	case CONS:		/* 64K < Range < 4G */
+		/* The 2-bit length prefix must lie inside the buffer */
+		CHECK_BOUND(bs, (bs->bit + 2 + 7) >> 3);
+		len = get_bits(bs, 2) + 1;
+		BYTE_ALIGN(bs);
+		if (base && (f->attr & DECODE)) {	/* timeToLive */
+			unsigned int v;
+
+			/* get_uint() reads and consumes len bytes */
+			CHECK_BOUND(bs, len);
+			v = get_uint(bs, len) + f->lb;
+			PRINT(" = %u", v);
+			*((unsigned int *)(base + f->offset)) = v;
+		} else {
+			bs->cur += len;
+		}
+		break;
+	case UNCO:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		bs->cur += len;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		INC_BITS(bs, f->sz);
+		break;
+	}
+
+	PRINT("\n");
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_enum - skip an ENUMERATED value without storing it.
+ *
+ * For extensible enumerations the extension bit is read first; when it is
+ * set, 7 bits are skipped, otherwise (and for non-extensible ones) f->sz
+ * bits.  Returns H323_ERROR_BOUND if the cursor ends up beyond bs->end.
+ ****************************************************************************/
+static int decode_enum(bitstr_t *bs, const struct field_t *f,
+		       char *base, int level)
+{
+	unsigned int skip;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* The extension bit is only consumed for extensible types. */
+	skip = ((f->attr & EXT) && get_bit(bs)) ? 7 : f->sz;
+	INC_BITS(bs, skip);
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_bitstr - skip a BIT STRING; nothing is stored.
+ *
+ * After byte alignment the length in bits is taken from f->lb (FIXD), from
+ * a two-byte big-endian value plus f->lb (WORD) or from get_len() (SEMI);
+ * any other f->sz yields a length of 0.  The cursor is then moved forward by
+ * that many bits.  Returns H323_ERROR_BOUND if the length bytes or the
+ * contents do not fit.
+ ****************************************************************************/
+static int decode_bitstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ BYTE_ALIGN(bs);
+ switch (f->sz) {
+ case FIXD: /* fixed length > 16 */
+ len = f->lb;
+ break;
+ case WORD: /* 2-byte length */
+ CHECK_BOUND(bs, 2);
+ len = (*bs->cur++) << 8;
+ len += (*bs->cur++) + f->lb;
+ break;
+ case SEMI:
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs);
+ break;
+ default:
+ len = 0;
+ break;
+ }
+
+ bs->cur += len >> 3; /* whole bytes */
+ bs->bit = len & 7; /* leftover bits; the stream was aligned above */
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_numstr - skip a numeric string; nothing is stored.
+ *
+ * The character count is read as f->sz bits plus f->lb; after byte
+ * alignment the cursor is advanced by four bits per character.
+ * Returns H323_ERROR_BOUND if the cursor ends up beyond bs->end.
+ ****************************************************************************/
+static int decode_numstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ /* 2 <= Range <= 255 */
+ len = get_bits(bs, f->sz) + f->lb;
+
+ BYTE_ALIGN(bs);
+ INC_BITS(bs, (len << 2)); /* len characters of 4 bits each */
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_octstr - skip an OCTET STRING, optionally recording where it is.
+ *
+ * The length in bytes comes from f->lb (FIXD), one length byte plus f->lb
+ * (BYTE), get_len() plus f->lb (SEMI) or f->sz bits plus f->lb (default).
+ * For fixed-size strings longer than two bytes that are flagged DECODE, the
+ * offset of the string from bs->buf is stored as an unsigned int at
+ * base + f->offset; the bytes themselves are not copied.
+ * Returns H323_ERROR_BOUND if a length byte or the contents do not fit.
+ ****************************************************************************/
+static int decode_octstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+ switch (f->sz) {
+ case FIXD: /* Range == 1 */
+ if (f->lb > 2) {
+ BYTE_ALIGN(bs);
+ if (base && (f->attr & DECODE)) {
+ /* The IP Address: record its offset within the buffer */
+ IFTHEN(f->lb == 4,
+ PRINT(" = %d.%d.%d.%d:%d",
+ bs->cur[0], bs->cur[1],
+ bs->cur[2], bs->cur[3],
+ bs->cur[4] * 256 + bs->cur[5]));
+ *((unsigned int *)(base + f->offset)) =
+ bs->cur - bs->buf;
+ }
+ }
+ len = f->lb;
+ break;
+ case BYTE: /* Range == 256 */
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 1);
+ len = (*bs->cur++) + f->lb;
+ break;
+ case SEMI:
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs) + f->lb;
+ break;
+ default: /* 2 <= Range <= 255 */
+ len = get_bits(bs, f->sz) + f->lb;
+ BYTE_ALIGN(bs);
+ break;
+ }
+
+ bs->cur += len;
+
+ PRINT("\n");
+
+ CHECK_BOUND(bs, 0); /* rejects strings that run past bs->end */
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_bmpstr - skip a BMP string (two bytes per character); nothing is
+ * stored.
+ *
+ * The character count is one aligned length byte plus f->lb when f->sz is
+ * BYTE, otherwise f->sz bits plus f->lb followed by byte alignment.
+ * Returns H323_ERROR_BOUND if the length byte or the contents do not fit.
+ ****************************************************************************/
+static int decode_bmpstr(bitstr_t *bs, const struct field_t *f,
+			 char *base, int level)
+{
+	unsigned int nchars;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	if (f->sz == BYTE) {
+		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		nchars = (*bs->cur++) + f->lb;
+	} else {
+		/* 2 <= Range <= 255 */
+		nchars = get_bits(bs, f->sz) + f->lb;
+		BYTE_ALIGN(bs);
+	}
+
+	/* Each character occupies two bytes. */
+	bs->cur += nchars << 1;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_seq - decode a SEQUENCE/SET described by f.
+ *
+ * Layout handled: optional extension bit (EXT), a bitmap of f->sz bits
+ * marking which OPT root components are present, the f->lb root components,
+ * and - when the extension bit was set - an extension bitmap followed by
+ * the length-prefixed extension components.
+ *
+ * When base is set and f has DECODE, the presence bitmap is stored as an
+ * unsigned int at base + f->offset and children flagged DECODE are decoded
+ * into the same structure; everything else is skipped.
+ *
+ * Returns H323_ERROR_NONE on success, H323_ERROR_STOP when a child flagged
+ * STOP is reached, H323_ERROR_BOUND when data runs past the buffer,
+ * H323_ERROR_RANGE when the extension bitmap is longer than the 32 bits
+ * this decoder can represent, or the error of a child decoder.
+ *
+ * The extension bit, both bitmaps and the extension bitmap length are
+ * bounds-checked before being read.  get_bitmap() and the presence test
+ * "0x80000000 >> opt" only work for up to 32 bits, so longer extension
+ * bitmaps are rejected instead of being shifted out of range.
+ ****************************************************************************/
+static int decode_seq(bitstr_t *bs, const struct field_t *f,
+		      char *base, int level)
+{
+	unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len;
+	int err;
+	const struct field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Extensible? */
+	if (f->attr & EXT) {
+		CHECK_BOUND(bs, 1);
+		ext = get_bit(bs);
+	} else {
+		ext = 0;
+	}
+
+	/* Get fields bitmap */
+	CHECK_BOUND(bs, (bs->bit + f->sz + 7) >> 3);
+	bmp = get_bitmap(bs, f->sz);
+	if (base)
+		*(unsigned int *)base = bmp;
+
+	/* Decode the root components */
+	for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) {
+		if (son->attr & STOP) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			return H323_ERROR_STOP;
+		}
+
+		if (son->attr & OPT) {	/* Optional component */
+			if (!((0x80000000U >> (opt++)) & bmp))	/* Not exist */
+				continue;
+		}
+
+		/* Decode */
+		if (son->attr & OPEN) {	/* Open field */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			if (!base || !(son->attr & DECODE)) {
+				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+				      " ", son->name);
+				bs->cur += len;
+				continue;
+			}
+			beg = bs->cur;
+
+			/* Decode */
+			if ((err = (Decoders[son->type]) (bs, son, base,
+							  level + 1)) <
+			    H323_ERROR_NONE)
+				return err;
+
+			/* Resume right after the open field */
+			bs->cur = beg + len;
+			bs->bit = 0;
+		} else if ((err = (Decoders[son->type]) (bs, son, base,
+							 level + 1)) <
+			   H323_ERROR_NONE)
+			return err;
+	}
+
+	/* No extension? */
+	if (!ext)
+		return H323_ERROR_NONE;
+
+	/* Get the extension bitmap */
+	CHECK_BOUND(bs, (bs->bit + 7 + 7) >> 3);
+	bmp2_len = get_bits(bs, 7) + 1;
+	if (bmp2_len > 32)	/* more bits than bmp2 can hold */
+		return H323_ERROR_RANGE;
+	CHECK_BOUND(bs, (bmp2_len + 7) >> 3);
+	bmp2 = get_bitmap(bs, bmp2_len);
+	bmp |= bmp2 >> f->sz;
+	if (base)
+		*(unsigned int *)base = bmp;
+	BYTE_ALIGN(bs);
+
+	/* Decode the extension components */
+	for (opt = 0; opt < bmp2_len; opt++, i++, son++) {
+		/* Check Range */
+		if (i >= f->ub) {	/* Newer Version? */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			bs->cur += len;
+			continue;
+		}
+
+		if (son->attr & STOP) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			return H323_ERROR_STOP;
+		}
+
+		if (!((0x80000000 >> opt) & bmp2))	/* Not present */
+			continue;
+
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		if (!base || !(son->attr & DECODE)) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			bs->cur += len;
+			continue;
+		}
+		beg = bs->cur;
+
+		if ((err = (Decoders[son->type]) (bs, son, base,
+						  level + 1)) <
+		    H323_ERROR_NONE)
+			return err;
+
+		/* Resume right after the extension component */
+		bs->cur = beg + len;
+		bs->bit = 0;
+	}
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * decode_seqof - decode a SEQUENCE OF / SET OF described by f.
+ *
+ * The item count is read according to f->sz (one byte, two bytes, get_len()
+ * or f->sz bits) and f->lb is added.  When base is set and f has DECODE, the
+ * count, capped at f->ub, is stored as an unsigned int at base + f->offset
+ * and the items follow it in the output; only the first f->ub items are
+ * decoded into the output, the rest are parsed with a NULL base.
+ *
+ * Returns H323_ERROR_NONE, H323_ERROR_BOUND when data runs past the buffer,
+ * or the error of the item decoder.
+ *
+ * The bit-sized count and the length of every OPEN item are bounds-checked
+ * before being read, matching what decode_seq does for its open fields.
+ ****************************************************************************/
+static int decode_seqof(bitstr_t *bs, const struct field_t *f,
+			char *base, int level)
+{
+	unsigned int count, effective_count = 0, i, len = 0;
+	int err;
+	const struct field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Decode item count */
+	switch (f->sz) {
+	case BYTE:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		count = *bs->cur++;
+		break;
+	case WORD:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		count = *bs->cur++;
+		count <<= 8;
+		count += *bs->cur++;
+		break;
+	case SEMI:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		count = get_len(bs);
+		break;
+	default:
+		CHECK_BOUND(bs, (bs->bit + f->sz + 7) >> 3);
+		count = get_bits(bs, f->sz);
+		break;
+	}
+	count += f->lb;
+
+	/* Write Count */
+	if (base) {
+		effective_count = count > f->ub ? f->ub : count;
+		*(unsigned int *)base = effective_count;
+		base += sizeof(unsigned int);
+	}
+
+	/* Decode nested field */
+	son = f->fields;
+	if (base)
+		base -= son->offset;	/* the item decoder adds it back */
+	for (i = 0; i < count; i++) {
+		if (son->attr & OPEN) {
+			BYTE_ALIGN(bs);
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			if (!base || !(son->attr & DECODE)) {
+				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+				      " ", son->name);
+				bs->cur += len;
+				continue;
+			}
+			beg = bs->cur;
+
+			if ((err = (Decoders[son->type]) (bs, son,
+							  i <
+							  effective_count ?
+							  base : NULL,
+							  level + 1)) <
+			    H323_ERROR_NONE)
+				return err;
+
+			/* Resume right after the open item */
+			bs->cur = beg + len;
+			bs->bit = 0;
+		} else
+			if ((err = (Decoders[son->type]) (bs, son,
+							  i <
+							  effective_count ?
+							  base : NULL,
+							  level + 1)) <
+			    H323_ERROR_NONE)
+				return err;
+
+		if (base)
+			base += son->offset;	/* step to the next output slot */
+	}
+
+	return H323_ERROR_NONE;
+}
+
+
+/****************************************************************************
+ * decode_choice - decode a CHOICE described by f.
+ *
+ * The alternative index is read either as f->sz bits (root alternatives,
+ * must be below f->lb) or, when the extension bit is set, as 7 bits plus
+ * f->lb.  When base is set and f has DECODE, the index is stored as an
+ * unsigned int at base + f->offset.  Indices at or above f->ub are unknown
+ * to the tables and are skipped using their length prefix; extension and
+ * OPEN alternatives are length-prefixed as well.
+ *
+ * Returns H323_ERROR_NONE, H323_ERROR_RANGE for an invalid root index,
+ * H323_ERROR_STOP when the alternative is flagged STOP, H323_ERROR_BOUND
+ * when data runs past the buffer, or the error of the child decoder.
+ *
+ * The extension bit, the index bits and every length prefix are
+ * bounds-checked before being read.
+ ****************************************************************************/
+static int decode_choice(bitstr_t *bs, const struct field_t *f,
+			 char *base, int level)
+{
+	unsigned int type, ext = 0, len = 0;
+	int err;
+	const struct field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Decode the choice index number */
+	if (f->attr & EXT) {
+		CHECK_BOUND(bs, 1);
+		if (get_bit(bs))
+			ext = 1;
+	}
+	if (ext) {
+		CHECK_BOUND(bs, (bs->bit + 7 + 7) >> 3);
+		type = get_bits(bs, 7) + f->lb;
+	} else {
+		CHECK_BOUND(bs, (bs->bit + f->sz + 7) >> 3);
+		type = get_bits(bs, f->sz);
+		if (type >= f->lb)
+			return H323_ERROR_RANGE;
+	}
+
+	/* Write Type */
+	if (base)
+		*(unsigned int *)base = type;
+
+	/* Check Range */
+	if (type >= f->ub) {	/* Newer version? */
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		bs->cur += len;
+		return H323_ERROR_NONE;
+	}
+
+	/* Transfer to son level */
+	son = &f->fields[type];
+	if (son->attr & STOP) {
+		PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name);
+		return H323_ERROR_STOP;
+	}
+
+	if (ext || (son->attr & OPEN)) {
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		if (!base || !(son->attr & DECODE)) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			bs->cur += len;
+			return H323_ERROR_NONE;
+		}
+		beg = bs->cur;
+
+		if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) <
+		    H323_ERROR_NONE)
+			return err;
+
+		/* Resume right after the length-prefixed alternative */
+		bs->cur = beg + len;
+		bs->bit = 0;
+	} else if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) <
+		   H323_ERROR_NONE)
+		return err;
+
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************
+ * DecodeRasMessage - decode a RAS message.
+ *
+ * buf/sz delimit the encoded message; the result is written into *ras.
+ * Sets up a bit stream over the whole buffer and decodes it as the
+ * extensible top-level CHOICE described by _RasMessage.
+ * Returns the result of decode_choice().
+ ****************************************************************************/
+int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras)
+{
+ static const struct field_t ras_message = {
+ FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT,
+ 0, _RasMessage
+ };
+ bitstr_t bs;
+
+ bs.buf = bs.beg = bs.cur = buf;
+ bs.end = buf + sz;
+ bs.bit = 0;
+
+ return decode_choice(&bs, &ras_message, (char *) ras, 0);
+}
+
+/****************************************************************************
+ * DecodeH323_UserInformation - decode an H323-UserInformation sequence.
+ *
+ * buf is the start of the enclosing buffer, so that offsets recorded by the
+ * decoders are relative to it; beg/sz delimit the sequence itself.  The
+ * result is written into *uuie.  Returns the result of decode_seq().
+ ****************************************************************************/
+static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg,
+ size_t sz, H323_UserInformation *uuie)
+{
+ static const struct field_t h323_userinformation = {
+ FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT,
+ 0, _H323_UserInformation
+ };
+ bitstr_t bs;
+
+ bs.buf = buf;
+ bs.beg = bs.cur = beg;
+ bs.end = beg + sz;
+ bs.bit = 0;
+
+ return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0);
+}
+
+/****************************************************************************
+ * DecodeMultimediaSystemControlMessage - decode a
+ * MultimediaSystemControlMessage.
+ *
+ * buf/sz delimit the encoded message; the result is written into *mscm.
+ * Sets up a bit stream over the whole buffer and decodes it as the
+ * extensible top-level CHOICE described by _MultimediaSystemControlMessage.
+ * Returns the result of decode_choice().
+ ****************************************************************************/
+int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
+ MultimediaSystemControlMessage *
+ mscm)
+{
+ static const struct field_t multimediasystemcontrolmessage = {
+ FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4,
+ DECODE | EXT, 0, _MultimediaSystemControlMessage
+ };
+ bitstr_t bs;
+
+ bs.buf = bs.beg = bs.cur = buf;
+ bs.end = buf + sz;
+ bs.bit = 0;
+
+ return decode_choice(&bs, &multimediasystemcontrolmessage,
+ (char *) mscm, 0);
+}
+
+/****************************************************************************
+ * DecodeQ931 - parse a Q.931 message and decode its User-User IE.
+ *
+ * buf/sz delimit the message.  The message type is stored in
+ * q931->MessageType; the information elements are then walked until the
+ * User-User IE (0x7e) is found, whose payload is decoded into q931->UUIE.
+ *
+ * Returns the result of DecodeH323_UserInformation() when a User-User IE is
+ * found, H323_ERROR_RANGE for an unknown protocol discriminator, and
+ * H323_ERROR_BOUND when the message is truncated or holds no usable
+ * User-User IE.
+ *
+ * sz always mirrors the number of bytes left at p: it is decremented for
+ * every byte consumed, including the message type and each IE length byte,
+ * and every read of *p is preceded by a check that a byte remains.  An empty
+ * User-User IE is rejected instead of letting its length wrap around.
+ ****************************************************************************/
+int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931)
+{
+	unsigned char *p = buf;
+	int len;
+
+	if (!p || sz < 1)
+		return H323_ERROR_BOUND;
+
+	/* Protocol Discriminator */
+	if (*p != 0x08) {
+		PRINT("Unknown Protocol Discriminator\n");
+		return H323_ERROR_RANGE;
+	}
+	p++;
+	sz--;
+
+	/* CallReferenceValue */
+	if (sz < 1)
+		return H323_ERROR_BOUND;
+	len = *p++;
+	sz--;
+	if (sz < (size_t)len)
+		return H323_ERROR_BOUND;
+	p += len;
+	sz -= len;
+
+	/* Message Type (the byte after it is inspected as well) */
+	if (sz < 2)
+		return H323_ERROR_BOUND;
+	q931->MessageType = *p++;
+	sz--;
+	PRINT("MessageType = %02X\n", q931->MessageType);
+	if (*p & 0x80) {
+		p++;
+		sz--;
+	}
+
+	/* Decode Information Elements */
+	while (sz > 0) {
+		if (*p == 0x7e) {	/* UserUserIE */
+			if (sz < 3)
+				break;
+			p++;
+			len = *p++ << 8;
+			len |= *p++;
+			sz -= 3;
+			/* Needs at least the protocol discriminator byte */
+			if (len < 1 || sz < (size_t)len)
+				break;
+			p++;
+			len--;
+			return DecodeH323_UserInformation(buf, p, len,
+							  &q931->UUIE);
+		}
+		p++;
+		sz--;
+		if (sz < 1)
+			break;
+		len = *p++;
+		sz--;
+		if (sz < (size_t)len)
+			break;
+		p += len;
+		sz -= len;
+	}
+
+	PRINT("Q.931 UUIE not found\n");
+
+	return H323_ERROR_BOUND;
+}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
new file mode 100644
index 00000000000..3a3a60b126e
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -0,0 +1,1904 @@
+/*
+ * H.323 connection tracking helper
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 connection tracking module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * For more information, please see http://nath323.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip6_route.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/* Parameters
+ *
+ * Module parameters, registered with permission 0600.  The descriptions
+ * passed to MODULE_PARM_DESC below document each one.
+ */
+static unsigned int default_rrq_ttl __read_mostly = 300;
+module_param(default_rrq_ttl, uint, 0600);
+MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ");
+
+static int gkrouted_only __read_mostly = 1;
+module_param(gkrouted_only, int, 0600);
+MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
+
+static bool callforward_filter __read_mostly = true;
+module_param(callforward_filter, bool, 0600);
+MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
+ "if both endpoints are on different sides "
+ "(determined by routing information)");
+
+/* Hooks for NAT
+ *
+ * Function pointers that the helper fetches with rcu_dereference() and calls
+ * only when they are non-NULL (see expect_rtp_rtcp() and expect_t120()).
+ * They are not assigned anywhere in this part of the file; presumably the
+ * H.323 NAT module installs them - confirm against that module.
+ */
+int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+ __read_mostly;
+int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+ __read_mostly;
+int (*set_sig_addr_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count) __read_mostly;
+int (*set_ras_addr_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count) __read_mostly;
+int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ __be16 port, __be16 rtp_port,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp) __read_mostly;
+int (*nat_t120_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_h245_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_callforwarding_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_q931_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, TransportAddress *taddr, int idx,
+ __be16 port, struct nf_conntrack_expect *exp)
+ __read_mostly;
+
+static DEFINE_SPINLOCK(nf_h323_lock); /* NOTE(review): presumably serialises use of h323_buffer; its users are outside this part of the file - confirm */
+static char *h323_buffer; /* scratch buffer handed to skb_header_pointer() in get_tpkt_data() */
+
+static struct nf_conntrack_helper nf_conntrack_helper_h245; /* forward declarations; defined later in the file */
+static struct nf_conntrack_helper nf_conntrack_helper_q931[];
+static struct nf_conntrack_helper nf_conntrack_helper_ras[];
+
+/****************************************************************************
+ * get_tpkt_data - locate the next TPKT payload in a TCP segment.
+ *
+ * On the first call for a segment *data must be NULL; the TCP payload is
+ * then fetched through skb_header_pointer() (using h323_buffer as the copy
+ * target) and the first TPKT is looked up.  On later calls *data, *datalen
+ * and *dataoff describe the previous payload and the TPKT following it is
+ * looked up.
+ *
+ * Returns 1 with *data, *datalen and *dataoff describing the payload (the
+ * bytes after the 4-byte TPKT header), or 0 when there is nothing (more) to
+ * process.
+ *
+ * info->tpkt_len[dir] carries state between segments: when a segment holds
+ * only a 4-byte TPKT header, the announced payload length is remembered and
+ * the following segment in that direction is accepted as bare payload.  The
+ * value is reset to 0 on every other exit path that passes the TCP header
+ * lookup.
+ ****************************************************************************/
+static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned char **data, int *datalen, int *dataoff)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ int tcpdatalen;
+ int tcpdataoff;
+ unsigned char *tpkt;
+ int tpktlen;
+ int tpktoff;
+
+ /* Get TCP header */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return 0;
+
+ /* Get TCP data offset */
+ tcpdataoff = protoff + th->doff * 4;
+
+ /* Get TCP data length */
+ tcpdatalen = skb->len - tcpdataoff;
+ if (tcpdatalen <= 0) /* No TCP data */
+ goto clear_out;
+
+ if (*data == NULL) { /* first TPKT */
+ /* Get first TPKT pointer */
+ tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
+ h323_buffer);
+ BUG_ON(tpkt == NULL);
+
+ /* Validate TPKT identifier */
+ if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
+ /* Netmeeting sends TPKT header and data separately */
+ if (info->tpkt_len[dir] > 0) {
+ pr_debug("nf_ct_h323: previous packet "
+ "indicated separate TPKT data of %hu "
+ "bytes\n", info->tpkt_len[dir]);
+ if (info->tpkt_len[dir] <= tcpdatalen) {
+ /* Yes, there was a TPKT header
+ * received */
+ *data = tpkt;
+ *datalen = info->tpkt_len[dir];
+ *dataoff = 0;
+ goto out;
+ }
+
+ /* Fragmented TPKT */
+ pr_debug("nf_ct_h323: fragmented TPKT\n");
+ goto clear_out;
+ }
+
+ /* It is not even a TPKT */
+ return 0;
+ }
+ tpktoff = 0;
+ } else { /* Next TPKT */
+ tpktoff = *dataoff + *datalen; /* offset just past the previous payload */
+ tcpdatalen -= tpktoff;
+ if (tcpdatalen <= 4) /* No more TPKT */
+ goto clear_out;
+ tpkt = *data + *datalen;
+
+ /* Validate TPKT identifier */
+ if (tpkt[0] != 0x03 || tpkt[1] != 0)
+ goto clear_out;
+ }
+
+ /* Validate TPKT length */
+ tpktlen = tpkt[2] * 256 + tpkt[3]; /* big-endian total length, header included */
+ if (tpktlen < 4)
+ goto clear_out;
+ if (tpktlen > tcpdatalen) {
+ if (tcpdatalen == 4) { /* Separate TPKT header */
+ /* Netmeeting sends TPKT header and data separately */
+ pr_debug("nf_ct_h323: separate TPKT header indicates "
+ "there will be TPKT data of %hu bytes\n",
+ tpktlen - 4);
+ info->tpkt_len[dir] = tpktlen - 4; /* remembered for the next segment */
+ return 0;
+ }
+
+ pr_debug("nf_ct_h323: incomplete TPKT (fragmented?)\n");
+ goto clear_out;
+ }
+
+ /* This is the encapsulated data */
+ *data = tpkt + 4;
+ *datalen = tpktlen - 4;
+ *dataoff = tpktoff + 4;
+
+ out:
+ /* Clear TPKT length */
+ info->tpkt_len[dir] = 0;
+ return 1;
+
+ clear_out:
+ info->tpkt_len[dir] = 0;
+ return 0;
+}
+
+/****************************************************************************
+ * get_h245_addr - extract address and port from an H.245 transport address.
+ *
+ * Only unicast IPv4/IPv6 addresses whose family matches the conntrack's
+ * layer 3 protocol are accepted.  The decoded structure holds the offset of
+ * the address inside data; the address bytes are copied into *addr (the
+ * remainder of the union is zeroed) and the two bytes that follow them into
+ * *port.
+ *
+ * Returns 1 when *addr and *port were filled in, 0 otherwise.
+ ****************************************************************************/
+static int get_h245_addr(struct nf_conn *ct, const unsigned char *data,
+			 H245_TransportAddress *taddr,
+			 union nf_inet_addr *addr, __be16 *port)
+{
+	const unsigned char *src;
+	int alen;
+
+	if (taddr->choice != eH245_TransportAddress_unicastAddress)
+		return 0;
+
+	if (taddr->unicastAddress.choice == eUnicastAddress_iPAddress) {
+		if (nf_ct_l3num(ct) != AF_INET)
+			return 0;
+		src = data + taddr->unicastAddress.iPAddress.network;
+		alen = 4;
+	} else if (taddr->unicastAddress.choice ==
+		   eUnicastAddress_iP6Address) {
+		if (nf_ct_l3num(ct) != AF_INET6)
+			return 0;
+		src = data + taddr->unicastAddress.iP6Address.network;
+		alen = 16;
+	} else {
+		return 0;
+	}
+
+	/* Address first, zero padding after it, then the port. */
+	memcpy(addr, src, alen);
+	memset((void *)addr + alen, 0, sizeof(*addr) - alen);
+	memcpy(port, src + alen, sizeof(__be16));
+
+	return 1;
+}
+
+/****************************************************************************
+ * expect_rtp_rtcp - set up expectations for an RTP/RTCP port pair.
+ *
+ * Reads the address announced in taddr.  If it cannot be read, differs from
+ * the source address of this direction, or has port 0, nothing is done and
+ * 0 is returned.  Otherwise two UDP expectations are created for the even
+ * (RTP) and odd (RTCP) port of the pair and either handed to the NAT hook
+ * or registered directly.
+ *
+ * Returns 0 on success or when there is nothing to do, -1 when an
+ * expectation cannot be allocated or registered, or the NAT hook's result.
+ ****************************************************************************/
+static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ __be16 rtp_port, rtcp_port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *rtp_exp;
+ struct nf_conntrack_expect *rtcp_exp;
+ typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
+
+ /* Read RTP or RTCP address */
+ if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
+ memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) ||
+ port == 0)
+ return 0;
+
+ /* RTP port is even */
+ rtp_port = port & ~htons(1);
+ rtcp_port = port | htons(1);
+
+ /* Create expect for RTP */
+ if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(rtp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_UDP, NULL, &rtp_port);
+
+ /* Create expect for RTCP */
+ if ((rtcp_exp = nf_ct_expect_alloc(ct)) == NULL) {
+ nf_ct_expect_put(rtp_exp);
+ return -1;
+ }
+ nf_ct_expect_init(rtcp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_UDP, NULL, &rtcp_port);
+
+ if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
+ (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* NAT needed */
+ ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ taddr, port, rtp_port, rtp_exp, rtcp_exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(rtp_exp) == 0) {
+ if (nf_ct_expect_related(rtcp_exp) == 0) {
+ pr_debug("nf_ct_h323: expect RTP ");
+ nf_ct_dump_tuple(&rtp_exp->tuple);
+ pr_debug("nf_ct_h323: expect RTCP ");
+ nf_ct_dump_tuple(&rtcp_exp->tuple);
+ } else {
+ nf_ct_unexpect_related(rtp_exp); /* undo RTP so the pair is all-or-nothing */
+ ret = -1;
+ }
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(rtp_exp); /* drop the references taken by nf_ct_expect_alloc() */
+ nf_ct_expect_put(rtcp_exp);
+
+ return ret;
+}
+
+/****************************************************************************
+ * expect_t120 - set up an expectation for a T.120 connection.
+ *
+ * Reads the address announced in taddr.  If it cannot be read, differs from
+ * the source address of this direction, or has port 0, nothing is done and
+ * 0 is returned.  Otherwise a TCP expectation flagged NF_CT_EXPECT_PERMANENT
+ * is created for that port and either handed to the NAT hook or registered
+ * directly.
+ *
+ * Returns 0 on success or when there is nothing to do, -1 when the
+ * expectation cannot be allocated or registered, or the NAT hook's result.
+ ****************************************************************************/
+static int expect_t120(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+ typeof(nat_t120_hook) nat_t120;
+
+ /* Read T.120 address */
+ if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
+ memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) ||
+ port == 0)
+ return 0;
+
+ /* Create expect for T.120 connections */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_TCP, NULL, &port);
+ exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */
+
+ if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
+ (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* NAT needed */
+ ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
+ port, exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_h323: expect T.120 ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(exp); /* drop the reference taken by nf_ct_expect_alloc() */
+
+ return ret;
+}
+
+/****************************************************************************
+ * process_h245_channel - set up RTP/RTCP expectations for the media channel
+ * and the media control channel of an H.225.0 logical channel, whichever of
+ * the two are present according to channel->options.
+ *
+ * Returns 0 on success, -1 as soon as expect_rtp_rtcp() reports an error.
+ ****************************************************************************/
+static int process_h245_channel(struct sk_buff *skb,
+				struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned int protoff,
+				unsigned char **data, int dataoff,
+				H2250LogicalChannelParameters *channel)
+{
+	/* RTP */
+	if ((channel->options & eH2250LogicalChannelParameters_mediaChannel) &&
+	    expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+			    &channel->mediaChannel) < 0)
+		return -1;
+
+	/* RTCP */
+	if ((channel->options &
+	     eH2250LogicalChannelParameters_mediaControlChannel) &&
+	    expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+			    &channel->mediaControlChannel) < 0)
+		return -1;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle an OpenLogicalChannel: process the forward and (optional) reverse
+ * H.225.0 logical channel parameters, then the T.120 separate stack if the
+ * message describes one.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, int dataoff,
+		       OpenLogicalChannel *olc)
+{
+	pr_debug("nf_ct_h323: OpenLogicalChannel\n");
+
+	/* Forward channel */
+	if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
+	    eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters &&
+	    process_h245_channel(skb, ct, ctinfo, protoff, data, dataoff,
+				 &olc->forwardLogicalChannelParameters.multiplexParameters.h2250LogicalChannelParameters) < 0)
+		return -1;
+
+	/* Reverse channel, only when present and of the H.225.0 kind */
+	if ((olc->options &
+	     eOpenLogicalChannel_reverseLogicalChannelParameters) &&
+	    (olc->reverseLogicalChannelParameters.options &
+	     eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters) &&
+	    (olc->reverseLogicalChannelParameters.multiplexParameters.choice ==
+	     eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) &&
+	    process_h245_channel(skb, ct, ctinfo, protoff, data, dataoff,
+				 &olc->reverseLogicalChannelParameters.multiplexParameters.h2250LogicalChannelParameters) < 0)
+		return -1;
+
+	/* T.120 over a separate LAN stack with a local area address */
+	if ((olc->options & eOpenLogicalChannel_separateStack) &&
+	    olc->forwardLogicalChannelParameters.dataType.choice ==
+	    eDataType_data &&
+	    olc->forwardLogicalChannelParameters.dataType.data.application.choice ==
+	    eDataApplicationCapability_application_t120 &&
+	    olc->forwardLogicalChannelParameters.dataType.data.application.t120.choice ==
+	    eDataProtocolCapability_separateLANStack &&
+	    olc->separateStack.networkAddress.choice ==
+	    eNetworkAccessParameters_networkAddress_localAreaAddress &&
+	    expect_t120(skb, ct, ctinfo, protoff, data, dataoff,
+			&olc->separateStack.networkAddress.localAreaAddress) < 0)
+		return -1;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle an OpenLogicalChannelAck: process the reverse H.225.0 channel
+ * parameters, the RTP/RTCP addresses of the forward multiplex ack
+ * parameters and the T.120 separate stack, each only when present.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff, unsigned char **data, int dataoff,
+			OpenLogicalChannelAck *olca)
+{
+	H2250LogicalChannelAckParameters *ack;
+
+	pr_debug("nf_ct_h323: OpenLogicalChannelAck\n");
+
+	/* Reverse channel */
+	if ((olca->options &
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
+	    (olca->reverseLogicalChannelParameters.options &
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters) &&
+	    (olca->reverseLogicalChannelParameters.multiplexParameters.choice ==
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) &&
+	    process_h245_channel(skb, ct, ctinfo, protoff, data, dataoff,
+				 &olca->reverseLogicalChannelParameters.multiplexParameters.h2250LogicalChannelParameters) < 0)
+		return -1;
+
+	/* Forward multiplex ack parameters */
+	if ((olca->options &
+	     eOpenLogicalChannelAck_forwardMultiplexAckParameters) &&
+	    (olca->forwardMultiplexAckParameters.choice ==
+	     eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters)) {
+		ack = &olca->forwardMultiplexAckParameters.h2250LogicalChannelAckParameters;
+
+		/* RTP */
+		if ((ack->options &
+		     eH2250LogicalChannelAckParameters_mediaChannel) &&
+		    expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+				    &ack->mediaChannel) < 0)
+			return -1;
+
+		/* RTCP */
+		if ((ack->options &
+		     eH2250LogicalChannelAckParameters_mediaControlChannel) &&
+		    expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+				    &ack->mediaControlChannel) < 0)
+			return -1;
+	}
+
+	/* T.120 separate stack */
+	if ((olca->options & eOpenLogicalChannelAck_separateStack) &&
+	    olca->separateStack.networkAddress.choice ==
+	    eNetworkAccessParameters_networkAddress_localAreaAddress &&
+	    expect_t120(skb, ct, ctinfo, protoff, data, dataoff,
+			&olca->separateStack.networkAddress.localAreaAddress) < 0)
+		return -1;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Dispatch one decoded H.245 message: OpenLogicalChannel requests go to
+ * process_olc(), OpenLogicalChannelAck responses to process_olca().
+ * Everything else is only logged at debug level and yields 0.
+ */
+static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff, unsigned char **data, int dataoff,
+			MultimediaSystemControlMessage *mscm)
+{
+	if (mscm->choice == eMultimediaSystemControlMessage_request) {
+		if (mscm->request.choice == eRequestMessage_openLogicalChannel)
+			return process_olc(skb, ct, ctinfo,
+					   protoff, data, dataoff,
+					   &mscm->request.openLogicalChannel);
+		pr_debug("nf_ct_h323: H.245 Request %d\n",
+			 mscm->request.choice);
+	} else if (mscm->choice == eMultimediaSystemControlMessage_response) {
+		if (mscm->response.choice ==
+		    eResponseMessage_openLogicalChannelAck)
+			return process_olca(skb, ct, ctinfo,
+					    protoff, data, dataoff,
+					    &mscm->response.openLogicalChannelAck);
+		pr_debug("nf_ct_h323: H.245 Response %d\n",
+			 mscm->response.choice);
+	} else {
+		pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Conntrack helper callback for H.245 connections.
+ *
+ * Walks every TPKT found in the packet, decodes it and hands it to
+ * process_h245().  A decoding error stops the walk but the packet is still
+ * accepted; a processing error drops the packet.  The static decode buffer
+ * is only touched while nf_h323_lock is held.
+ */
+static int h245_help(struct sk_buff *skb, unsigned int protoff,
+		     struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	static MultimediaSystemControlMessage mscm;
+	unsigned char *data = NULL;
+	int verdict = NF_ACCEPT;
+	int datalen;
+	int dataoff;
+	int ret;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	pr_debug("nf_ct_h245: skblen = %u\n", skb->len);
+
+	spin_lock_bh(&nf_h323_lock);
+
+	/* Process each TPKT */
+	while (get_tpkt_data(skb, protoff, ct, ctinfo,
+			     &data, &datalen, &dataoff)) {
+		pr_debug("nf_ct_h245: TPKT len=%d ", datalen);
+		nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+		/* Decode H.245 signal */
+		ret = DecodeMultimediaSystemControlMessage(data, datalen,
+							   &mscm);
+		if (ret < 0) {
+			pr_debug("nf_ct_h245: decoding error: %s\n",
+				 ret == H323_ERROR_BOUND ?
+				 "out of bound" : "out of range");
+			/* We don't drop when decoding error */
+			break;
+		}
+
+		/* Process H.245 signal */
+		if (process_h245(skb, ct, ctinfo, protoff,
+				 &data, dataoff, &mscm) < 0) {
+			verdict = NF_DROP;
+			break;
+		}
+	}
+
+	spin_unlock_bh(&nf_h323_lock);
+
+	/* Log only after the lock has been released */
+	if (verdict == NF_DROP)
+		nf_ct_helper_log(skb, ct, "cannot process H.245 message");
+
+	return verdict;
+}
+
+/****************************************************************************/
+/*
+ * Expectation policy referenced by nf_conntrack_helper_h245 below.
+ * max_expected is H323_RTP_CHANNEL_MAX * 4 plus two further slots that the
+ * inline comment attributes to T.120.
+ * NOTE(review): timeout is presumably in seconds — confirm against
+ * struct nf_conntrack_expect_policy.
+ */
+static const struct nf_conntrack_expect_policy h245_exp_policy = {
+ .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */,
+ .timeout = 240,
+};
+
+/*
+ * Helper description for H.245 connections; h245_help() is its packet
+ * callback and expect_h245() attaches it to the expectations it creates.
+ * NOTE(review): the tuple template names IPPROTO_UDP while expect_h245()
+ * builds its expectation with IPPROTO_TCP — confirm whether this template
+ * is ever used for matching.
+ */
+static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
+ .name = "H.245",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_UNSPEC,
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .help = h245_help,
+ .expect_policy = &h245_exp_policy,
+};
+
+/****************************************************************************/
+/*
+ * Extract the address and port that @taddr refers to inside @data.
+ *
+ * The address family of @taddr has to match the layer 3 protocol of @ct.
+ * On success @addr holds the address (unused trailing bytes zeroed), @port
+ * holds the two bytes that follow it, and 1 is returned.  Returns 0 for a
+ * family mismatch or an unsupported address type, leaving the outputs
+ * untouched.
+ */
+int get_h225_addr(struct nf_conn *ct, unsigned char *data,
+		  TransportAddress *taddr,
+		  union nf_inet_addr *addr, __be16 *port)
+{
+	const unsigned char *p;
+	int len;
+
+	if (taddr->choice == eTransportAddress_ipAddress) {
+		if (nf_ct_l3num(ct) != AF_INET)
+			return 0;
+		p = data + taddr->ipAddress.ip;
+		len = 4;
+	} else if (taddr->choice == eTransportAddress_ip6Address) {
+		if (nf_ct_l3num(ct) != AF_INET6)
+			return 0;
+		p = data + taddr->ip6Address.ip;
+		len = 16;
+	} else {
+		return 0;
+	}
+
+	/* Address first, zero the rest of the union, port follows the address */
+	memcpy(addr, p, len);
+	memset((void *)addr + len, 0, sizeof(*addr) - len);
+	memcpy(port, p + len, sizeof(__be16));
+
+	return 1;
+}
+
+/****************************************************************************/
+/*
+ * Register an expectation, handled by the H.245 helper, for the
+ * h245Address carried in @taddr.
+ *
+ * Returns 0 without expecting anything when the address cannot be read,
+ * differs from the source address of the tuple for this packet's
+ * direction, or has port 0.  Returns -1 when the expectation cannot be
+ * allocated or registered; otherwise 0 or whatever the NAT hook returns.
+ */
+static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff, unsigned char **data, int dataoff,
+		       TransportAddress *taddr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	typeof(nat_h245_hook) nat_h245 = NULL;
+	struct nf_conntrack_expect *exp;
+	union nf_inet_addr addr;
+	__be16 port;
+	int ret = 0;
+
+	/* Read h245Address */
+	if (!get_h225_addr(ct, *data, taddr, &addr, &port))
+		return 0;
+	if (memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) != 0)
+		return 0;
+	if (port == 0)
+		return 0;
+
+	/* Create expect for h245 connection */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_TCP, NULL, &port);
+	exp->helper = &nf_conntrack_helper_h245;
+
+	/* The NAT hook is only fetched when the two tuple addresses differ */
+	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+		   &ct->tuplehash[!dir].tuple.dst.u3,
+		   sizeof(ct->tuplehash[dir].tuple.src.u3)) != 0)
+		nat_h245 = rcu_dereference(nat_h245_hook);
+
+	if (nat_h245 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK)) {
+		/* NAT needed */
+		ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
+			       port, exp);
+	} else if (nf_ct_expect_related(exp) == 0) {
+		/* Conntrack only */
+		pr_debug("nf_ct_q931: expect H.245 ");
+		nf_ct_dump_tuple(&exp->tuple);
+	} else {
+		ret = -1;
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/* If the calling party is on the same side of the forward-to party,
+ * we don't need to track the second call.
+ *
+ * Routes both @src and @dst through the per-family afinfo->route() callback
+ * and returns 1 when both routes share the same next hop and the same
+ * output device.  Returns 0 otherwise, including when no afinfo exists for
+ * @family, when either route lookup fails, or when @family is not handled
+ * by the switch below. */
+static int callforward_do_filter(const union nf_inet_addr *src,
+ const union nf_inet_addr *dst,
+ u_int8_t family)
+{
+ const struct nf_afinfo *afinfo;
+ int ret = 0;
+
+ /* rcu_read_lock()ed by nf_hook_slow() */
+ afinfo = nf_get_afinfo(family);
+ if (!afinfo)
+ return 0;
+
+ switch (family) {
+ case AF_INET: {
+ struct flowi4 fl1, fl2;
+ struct rtable *rt1, *rt2;
+
+ /* Only the destination address of each flow key is filled in */
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->ip;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->ip;
+ /* The lookups always use init_net; a zero return is treated as success */
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt1,
+ flowi4_to_flowi(&fl1), false)) {
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
+ flowi4_to_flowi(&fl2), false)) {
+ if (rt_nexthop(rt1, fl1.daddr) ==
+ rt_nexthop(rt2, fl2.daddr) &&
+ rt1->dst.dev == rt2->dst.dev)
+ ret = 1;
+ /* Each successful lookup is paired with one dst_release() */
+ dst_release(&rt2->dst);
+ }
+ dst_release(&rt1->dst);
+ }
+ break;
+ }
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+ case AF_INET6: {
+ struct flowi6 fl1, fl2;
+ struct rt6_info *rt1, *rt2;
+
+ /* Same procedure as the IPv4 case, with IPv6 flow keys and routes */
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->in6;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->in6;
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt1,
+ flowi6_to_flowi(&fl1), false)) {
+ if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
+ flowi6_to_flowi(&fl2), false)) {
+ if (ipv6_addr_equal(rt6_nexthop(rt1),
+ rt6_nexthop(rt2)) &&
+ rt1->dst.dev == rt2->dst.dev)
+ ret = 1;
+ dst_release(&rt2->dst);
+ }
+ dst_release(&rt1->dst);
+ }
+ break;
+ }
+#endif
+ }
+ return ret;
+
+}
+
+/****************************************************************************/
+/*
+ * Register an expectation, handled by the Q.931 helper, for the second leg
+ * of a forwarded call towards the alternativeAddress in @taddr.
+ *
+ * Returns 0 without expecting anything when the address cannot be read,
+ * has port 0, or when callforward_filter is set and
+ * callforward_do_filter() reports that the call need not be tracked.
+ * Returns -1 when the expectation cannot be allocated or registered;
+ * otherwise 0 or whatever the NAT hook returns.
+ */
+static int expect_callforwarding(struct sk_buff *skb,
+				 struct nf_conn *ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned int protoff,
+				 unsigned char **data, int dataoff,
+				 TransportAddress *taddr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	typeof(nat_callforwarding_hook) nat_callforwarding = NULL;
+	struct nf_conntrack_expect *exp;
+	union nf_inet_addr addr;
+	__be16 port;
+	int ret = 0;
+
+	/* Read alternativeAddress */
+	if (!get_h225_addr(ct, *data, taddr, &addr, &port))
+		return 0;
+	if (port == 0)
+		return 0;
+
+	/* If the calling party is on the same side of the forward-to party,
+	 * we don't need to track the second call */
+	if (callforward_filter &&
+	    callforward_do_filter(&addr, &ct->tuplehash[!dir].tuple.src.u3,
+				  nf_ct_l3num(ct))) {
+		pr_debug("nf_ct_q931: Call Forwarding not tracked\n");
+		return 0;
+	}
+
+	/* Create expect for the second call leg */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3, &addr,
+			  IPPROTO_TCP, NULL, &port);
+	exp->helper = nf_conntrack_helper_q931;
+
+	/* The NAT hook is only fetched when the two tuple addresses differ */
+	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+		   &ct->tuplehash[!dir].tuple.dst.u3,
+		   sizeof(ct->tuplehash[dir].tuple.src.u3)) != 0)
+		nat_callforwarding = rcu_dereference(nat_callforwarding_hook);
+
+	if (nat_callforwarding && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK)) {
+		/* Need NAT */
+		ret = nat_callforwarding(skb, ct, ctinfo,
+					 protoff, data, dataoff,
+					 taddr, port, exp);
+	} else if (nf_ct_expect_related(exp) == 0) {
+		/* Conntrack only */
+		pr_debug("nf_ct_q931: expect Call Forwarding ");
+		nf_ct_dump_tuple(&exp->tuple);
+	} else {
+		ret = -1;
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 Setup: expect the H.245 connection if announced, rewrite
+ * the destination/source call signal addresses through the
+ * set_h225_addr hook when NAT applies and they differ from the
+ * opposite-direction tuple, then process every fastStart element.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
+			 unsigned char **data, int dataoff,
+			 Setup_UUIE *setup)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conntrack_tuple *other = &ct->tuplehash[!dir].tuple;
+	typeof(set_h225_addr_hook) set_h225_addr;
+	union nf_inet_addr addr;
+	__be16 port;
+	int i;
+
+	pr_debug("nf_ct_q931: Setup\n");
+
+	if ((setup->options & eSetup_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+			&setup->h245Address) < 0)
+		return -1;
+
+	set_h225_addr = rcu_dereference(set_h225_addr_hook);
+
+	/* destCallSignalAddress is compared with the other side's source */
+	if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
+	    set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK) &&
+	    get_h225_addr(ct, *data, &setup->destCallSignalAddress,
+			  &addr, &port) &&
+	    memcmp(&addr, &other->src.u3, sizeof(addr)) != 0) {
+		pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
+			 &addr, ntohs(port), &other->src.u3,
+			 ntohs(other->src.u.tcp.port));
+		if (set_h225_addr(skb, protoff, data, dataoff,
+				  &setup->destCallSignalAddress,
+				  &other->src.u3,
+				  other->src.u.tcp.port) < 0)
+			return -1;
+	}
+
+	/* sourceCallSignalAddress is compared with the other side's destination */
+	if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
+	    set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK) &&
+	    get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
+			  &addr, &port) &&
+	    memcmp(&addr, &other->dst.u3, sizeof(addr)) != 0) {
+		pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
+			 &addr, ntohs(port), &other->dst.u3,
+			 ntohs(other->dst.u.tcp.port));
+		if (set_h225_addr(skb, protoff, data, dataoff,
+				  &setup->sourceCallSignalAddress,
+				  &other->dst.u3,
+				  other->dst.u.tcp.port) < 0)
+			return -1;
+	}
+
+	if (!(setup->options & eSetup_UUIE_fastStart))
+		return 0;
+
+	for (i = 0; i < setup->fastStart.count; i++) {
+		if (process_olc(skb, ct, ctinfo, protoff, data, dataoff,
+				&setup->fastStart.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 CallProceeding: expect the H.245 connection if announced
+ * and process every fastStart element.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_callproceeding(struct sk_buff *skb,
+				  struct nf_conn *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned int protoff,
+				  unsigned char **data, int dataoff,
+				  CallProceeding_UUIE *callproc)
+{
+	int i;
+
+	pr_debug("nf_ct_q931: CallProceeding\n");
+
+	if ((callproc->options & eCallProceeding_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+			&callproc->h245Address) < 0)
+		return -1;
+
+	if (!(callproc->options & eCallProceeding_UUIE_fastStart))
+		return 0;
+
+	for (i = 0; i < callproc->fastStart.count; i++) {
+		if (process_olc(skb, ct, ctinfo, protoff, data, dataoff,
+				&callproc->fastStart.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 Connect: expect the H.245 connection if announced and
+ * process every fastStart element.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned int protoff,
+			   unsigned char **data, int dataoff,
+			   Connect_UUIE *connect)
+{
+	int i;
+
+	pr_debug("nf_ct_q931: Connect\n");
+
+	if ((connect->options & eConnect_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+			&connect->h245Address) < 0)
+		return -1;
+
+	if (!(connect->options & eConnect_UUIE_fastStart))
+		return 0;
+
+	for (i = 0; i < connect->fastStart.count; i++) {
+		if (process_olc(skb, ct, ctinfo, protoff, data, dataoff,
+				&connect->fastStart.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 Alerting: expect the H.245 connection if announced and
+ * process every fastStart element.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned int protoff,
+			    unsigned char **data, int dataoff,
+			    Alerting_UUIE *alert)
+{
+	int i;
+
+	pr_debug("nf_ct_q931: Alerting\n");
+
+	if ((alert->options & eAlerting_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+			&alert->h245Address) < 0)
+		return -1;
+
+	if (!(alert->options & eAlerting_UUIE_fastStart))
+		return 0;
+
+	for (i = 0; i < alert->fastStart.count; i++) {
+		if (process_olc(skb, ct, ctinfo, protoff, data, dataoff,
+				&alert->fastStart.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 Facility.
+ *
+ * A callForwarded reason is handled exclusively: the alternativeAddress,
+ * when present, is passed to expect_callforwarding() and its result
+ * returned; without one the result is 0.  For any other reason the H.245
+ * address and the fastStart elements are processed, returning 0 on
+ * success and -1 on failure.
+ */
+static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned int protoff,
+			    unsigned char **data, int dataoff,
+			    Facility_UUIE *facility)
+{
+	int i;
+
+	pr_debug("nf_ct_q931: Facility\n");
+
+	if (facility->reason.choice == eFacilityReason_callForwarded) {
+		if (!(facility->options & eFacility_UUIE_alternativeAddress))
+			return 0;
+		return expect_callforwarding(skb, ct, ctinfo,
+					     protoff, data, dataoff,
+					     &facility->alternativeAddress);
+	}
+
+	if ((facility->options & eFacility_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+			&facility->h245Address) < 0)
+		return -1;
+
+	if (!(facility->options & eFacility_UUIE_fastStart))
+		return 0;
+
+	for (i = 0; i < facility->fastStart.count; i++) {
+		if (process_olc(skb, ct, ctinfo, protoff, data, dataoff,
+				&facility->fastStart.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 Progress: expect the H.245 connection if announced and
+ * process every fastStart element.
+ *
+ * Returns 0 on success and -1 when any of the sub-steps fails.
+ */
+static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned int protoff,
+			    unsigned char **data, int dataoff,
+			    Progress_UUIE *progress)
+{
+	int i;
+
+	pr_debug("nf_ct_q931: Progress\n");
+
+	if ((progress->options & eProgress_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+			&progress->h245Address) < 0)
+		return -1;
+
+	if (!(progress->options & eProgress_UUIE_fastStart))
+		return 0;
+
+	for (i = 0; i < progress->fastStart.count; i++) {
+		if (process_olc(skb, ct, ctinfo, protoff, data, dataoff,
+				&progress->fastStart.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Dispatch one decoded Q.931 message to the handler for its message body,
+ * then process any tunnelled H.245 control messages.
+ *
+ * Message bodies without a handler are only logged at debug level.
+ * Returns 0 on success and -1 when a handler or process_h245() fails.
+ */
+static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff, unsigned char **data, int dataoff,
+			Q931 *q931)
+{
+	H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
+	int i;
+
+	switch (pdu->h323_message_body.choice) {
+	case eH323_UU_PDU_h323_message_body_setup:
+		if (process_setup(skb, ct, ctinfo, protoff, data, dataoff,
+				  &pdu->h323_message_body.setup) < 0)
+			return -1;
+		break;
+	case eH323_UU_PDU_h323_message_body_callProceeding:
+		if (process_callproceeding(skb, ct, ctinfo,
+					   protoff, data, dataoff,
+					   &pdu->h323_message_body.callProceeding) < 0)
+			return -1;
+		break;
+	case eH323_UU_PDU_h323_message_body_connect:
+		if (process_connect(skb, ct, ctinfo, protoff, data, dataoff,
+				    &pdu->h323_message_body.connect) < 0)
+			return -1;
+		break;
+	case eH323_UU_PDU_h323_message_body_alerting:
+		if (process_alerting(skb, ct, ctinfo, protoff, data, dataoff,
+				     &pdu->h323_message_body.alerting) < 0)
+			return -1;
+		break;
+	case eH323_UU_PDU_h323_message_body_facility:
+		if (process_facility(skb, ct, ctinfo, protoff, data, dataoff,
+				     &pdu->h323_message_body.facility) < 0)
+			return -1;
+		break;
+	case eH323_UU_PDU_h323_message_body_progress:
+		if (process_progress(skb, ct, ctinfo, protoff, data, dataoff,
+				     &pdu->h323_message_body.progress) < 0)
+			return -1;
+		break;
+	default:
+		pr_debug("nf_ct_q931: Q.931 signal %d\n",
+			 pdu->h323_message_body.choice);
+		break;
+	}
+
+	/* Tunnelled H.245 messages, if any */
+	if (!(pdu->options & eH323_UU_PDU_h245Control))
+		return 0;
+
+	for (i = 0; i < pdu->h245Control.count; i++) {
+		if (process_h245(skb, ct, ctinfo, protoff, data, dataoff,
+				 &pdu->h245Control.item[i]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Conntrack helper callback for Q.931 connections.
+ *
+ * Walks every TPKT found in the packet, decodes it and hands it to
+ * process_q931().  A decoding error stops the walk but the packet is still
+ * accepted; a processing error drops the packet.  The static decode buffer
+ * is only touched while nf_h323_lock is held.
+ */
+static int q931_help(struct sk_buff *skb, unsigned int protoff,
+		     struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	static Q931 q931;
+	unsigned char *data = NULL;
+	int verdict = NF_ACCEPT;
+	int datalen;
+	int dataoff;
+	int ret;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	pr_debug("nf_ct_q931: skblen = %u\n", skb->len);
+
+	spin_lock_bh(&nf_h323_lock);
+
+	/* Process each TPKT */
+	while (get_tpkt_data(skb, protoff, ct, ctinfo,
+			     &data, &datalen, &dataoff)) {
+		pr_debug("nf_ct_q931: TPKT len=%d ", datalen);
+		nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+		/* Decode Q.931 signal */
+		ret = DecodeQ931(data, datalen, &q931);
+		if (ret < 0) {
+			pr_debug("nf_ct_q931: decoding error: %s\n",
+				 ret == H323_ERROR_BOUND ?
+				 "out of bound" : "out of range");
+			/* We don't drop when decoding error */
+			break;
+		}
+
+		/* Process Q.931 signal */
+		if (process_q931(skb, ct, ctinfo, protoff,
+				 &data, dataoff, &q931) < 0) {
+			verdict = NF_DROP;
+			break;
+		}
+	}
+
+	spin_unlock_bh(&nf_h323_lock);
+
+	/* Log only after the lock has been released */
+	if (verdict == NF_DROP)
+		nf_ct_helper_log(skb, ct, "cannot process Q.931 message");
+
+	return verdict;
+}
+
+/****************************************************************************/
+/*
+ * Expectation policy shared by both Q.931 helper entries below.
+ * max_expected is H323_RTP_CHANNEL_MAX * 4 plus four further slots.
+ * NOTE(review): timeout is presumably in seconds — confirm against
+ * struct nf_conntrack_expect_policy.
+ */
+static const struct nf_conntrack_expect_policy q931_exp_policy = {
+ /* T.120 and H.245 */
+ .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4,
+ .timeout = 240,
+};
+
+/*
+ * Q.931 helper descriptions, one per address family (AF_INET first, then
+ * AF_INET6), both matching TCP port Q931_PORT and using q931_help().
+ * Code in this file that assigns the bare array name to exp->helper
+ * therefore selects the AF_INET entry.
+ * NOTE(review): only the AF_INET entry sets .data_len — confirm that the
+ * AF_INET6 entry never needs struct nf_ct_h323_master helper data.
+ */
+static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {
+ {
+ .name = "Q.931",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT),
+ .tuple.dst.protonum = IPPROTO_TCP,
+ .help = q931_help,
+ .expect_policy = &q931_exp_policy,
+ },
+ {
+ .name = "Q.931",
+ .me = THIS_MODULE,
+ .tuple.src.l3num = AF_INET6,
+ .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT),
+ .tuple.dst.protonum = IPPROTO_TCP,
+ .help = q931_help,
+ .expect_policy = &q931_exp_policy,
+ },
+};
+
+/****************************************************************************/
+/*
+ * Return a pointer to the UDP payload of @skb and store its length in
+ * @datalen.  The payload is obtained through skb_header_pointer() with
+ * h323_buffer as the fallback buffer.
+ *
+ * Returns NULL when the UDP header cannot be read or no payload follows
+ * it; @datalen is not written in those cases.
+ */
+static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
+				   int *datalen)
+{
+	struct udphdr hdr_buf;
+	int payload_off;
+
+	/* The header itself is not inspected, it only has to be readable */
+	if (skb_header_pointer(skb, protoff, sizeof(hdr_buf), &hdr_buf) == NULL)
+		return NULL;
+
+	payload_off = protoff + sizeof(hdr_buf);
+	if (payload_off >= skb->len)
+		return NULL;
+
+	*datalen = skb->len - payload_off;
+	return skb_header_pointer(skb, payload_off, *datalen, h323_buffer);
+}
+
+/****************************************************************************/
+/*
+ * Look up the TCP expectation for destination @addr:@port in the zone and
+ * network namespace of @ct.
+ *
+ * Returns the expectation only when its master is @ct, NULL otherwise.
+ * The lookup goes through __nf_ct_expect_find(); process_rcf() in this
+ * file calls this function with nf_conntrack_expect_lock held.
+ */
+static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
+					       union nf_inet_addr *addr,
+					       __be16 port)
+{
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple tuple;
+
+	/*
+	 * Zero the whole tuple so that no uninitialised stack bytes reach
+	 * the lookup, and set the layer 3 protocol explicitly: the
+	 * expectations created in this file are initialised with
+	 * nf_ct_l3num(ct), so the lookup key has to carry the same value.
+	 * The source address and port stay zero.
+	 */
+	memset(&tuple, 0, sizeof(tuple));
+	tuple.src.l3num = nf_ct_l3num(ct);
+	memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3));
+	tuple.dst.u.tcp.port = port;
+	tuple.dst.protonum = IPPROTO_TCP;
+
+	exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
+	if (exp && exp->master == ct)
+		return exp;
+	return NULL;
+}
+
+/****************************************************************************/
+/*
+ * Restart the timer of @exp so that it fires @timeout * HZ jiffies from
+ * now.
+ *
+ * Returns 1 when the timer was restarted.  Returns 0 and changes nothing
+ * when @exp is NULL or del_timer() returns 0 for its timer.
+ */
+static int set_expect_timeout(struct nf_conntrack_expect *exp,
+			      unsigned int timeout)
+{
+	if (exp == NULL)
+		return 0;
+
+	/* Nothing to re-arm when del_timer() reports 0 */
+	if (del_timer(&exp->timeout) == 0)
+		return 0;
+
+	exp->timeout.expires = jiffies + timeout * HZ;
+	add_timer(&exp->timeout);
+
+	return 1;
+}
+
+/****************************************************************************/
+/*
+ * Register a permanent expectation, handled by the Q.931 helper, for the
+ * first of the @count addresses in @taddr that equals the source address
+ * of the tuple for this packet's direction and has a non-zero port.
+ *
+ * With gkrouted_only set, the expected source is restricted to the source
+ * address of the opposite-direction tuple; otherwise any source matches.
+ * On the conntrack-only path the port is stored in the helper data for
+ * later use when an RCF is processed.
+ *
+ * Returns 0 when no address qualifies, -1 when the expectation cannot be
+ * allocated or registered, otherwise 0 or whatever the NAT hook returns.
+ */
+static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff, unsigned char **data,
+		       TransportAddress *taddr, int count)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	typeof(nat_q931_hook) nat_q931;
+	struct nf_conntrack_expect *exp;
+	union nf_inet_addr addr;
+	__be16 port;
+	int ret = 0;
+	int i;
+
+	/* Look for the first related address */
+	for (i = 0; i < count; i++) {
+		if (!get_h225_addr(ct, *data, &taddr[i], &addr, &port))
+			continue;
+		if (memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3,
+			   sizeof(addr)) != 0)
+			continue;
+		if (port != 0)
+			break;
+	}
+
+	if (i >= count)		/* Not found */
+		return 0;
+
+	/* Create expect for Q.931 */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  gkrouted_only ?	/* only accept calls from GK? */
+				&ct->tuplehash[!dir].tuple.src.u3 : NULL,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_TCP, NULL, &port);
+	exp->helper = nf_conntrack_helper_q931;
+	exp->flags = NF_CT_EXPECT_PERMANENT;	/* Accept multiple calls */
+
+	nat_q931 = rcu_dereference(nat_q931_hook);
+	if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK)) {
+		/* Need NAT */
+		ret = nat_q931(skb, ct, ctinfo, protoff, data,
+			       taddr, i, port, exp);
+	} else if (nf_ct_expect_related(exp) == 0) {
+		/* Conntrack only */
+		pr_debug("nf_ct_ras: expect Q.931 ");
+		nf_ct_dump_tuple(&exp->tuple);
+
+		/* Save port for looking up expect in processing RCF */
+		info->sig_port[dir] = port;
+	} else {
+		ret = -1;
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+/*
+ * Handle a RAS GatekeeperRequest: when the set_ras_addr hook is present
+ * and the IPv4 connection is NATed, let the hook rewrite the single
+ * rasAddress and return its result.  Returns 0 otherwise.
+ */
+static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, GatekeeperRequest *grq)
+{
+	typeof(set_ras_addr_hook) set_ras_addr;
+
+	pr_debug("nf_ct_ras: GRQ\n");
+
+	set_ras_addr = rcu_dereference(set_ras_addr_hook);
+	if (!set_ras_addr)
+		return 0;
+	if (nf_ct_l3num(ct) != NFPROTO_IPV4)
+		return 0;
+	if (!(ct->status & IPS_NAT_MASK))
+		return 0;
+
+	/* NATed */
+	return set_ras_addr(skb, ct, ctinfo, protoff, data,
+			    &grq->rasAddress, 1);
+}
+
+/****************************************************************************/
+/*
+ * Handle a RAS GatekeeperConfirm: expect a RAS (UDP) connection to the
+ * rasAddress it announces, unless that address is the one the discovery
+ * was sent from or this connection was itself expected.
+ *
+ * Returns 0 when nothing has to be expected or the expectation was
+ * registered, -1 when it cannot be allocated or registered.
+ */
+static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, GatekeeperConfirm *gcf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conntrack_expect *exp;
+	union nf_inet_addr addr;
+	__be16 port;
+	int ret;
+
+	pr_debug("nf_ct_ras: GCF\n");
+
+	if (!get_h225_addr(ct, *data, &gcf->rasAddress, &addr, &port))
+		return 0;
+
+	/* Registration port is the same as discovery port */
+	if (memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3,
+		   sizeof(addr)) == 0 &&
+	    port == ct->tuplehash[dir].tuple.src.u.udp.port)
+		return 0;
+
+	/* Avoid RAS expectation loops. A GCF is never expected. */
+	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+		return 0;
+
+	/* Need new expect */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3, &addr,
+			  IPPROTO_UDP, NULL, &port);
+	exp->helper = nf_conntrack_helper_ras;
+
+	if (nf_ct_expect_related(exp) != 0) {
+		ret = -1;
+	} else {
+		ret = 0;
+		pr_debug("nf_ct_ras: expect RAS ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+/*
+ * Handle a RAS RegistrationRequest: expect Q.931 calls to the announced
+ * call signal addresses, let the set_ras_addr hook rewrite the RAS
+ * addresses when the IPv4 connection is NATed, and remember the requested
+ * time to live (default_rrq_ttl when the message carries none).
+ *
+ * Returns 0 on success and -1 when expect_q931() or the hook fails; the
+ * stored timeout is left untouched in the failure case.
+ */
+static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, RegistrationRequest *rrq)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	typeof(set_ras_addr_hook) set_ras_addr;
+
+	pr_debug("nf_ct_ras: RRQ\n");
+
+	if (expect_q931(skb, ct, ctinfo, protoff, data,
+			rrq->callSignalAddress.item,
+			rrq->callSignalAddress.count) < 0)
+		return -1;
+
+	set_ras_addr = rcu_dereference(set_ras_addr_hook);
+	if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK) &&
+	    set_ras_addr(skb, ct, ctinfo, protoff, data,
+			 rrq->rasAddress.item,
+			 rrq->rasAddress.count) < 0)
+		return -1;
+
+	if (!(rrq->options & eRegistrationRequest_timeToLive)) {
+		info->timeout = default_rrq_ttl;
+		return 0;
+	}
+
+	pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
+	info->timeout = rrq->timeToLive;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a RAS RegistrationConfirm: let the set_sig_addr hook rewrite the
+ * call signal addresses when the IPv4 connection is NATed, take over the
+ * confirmed time to live if present, and apply the resulting timeout to
+ * both the RAS connection and the Q.931 expectation registered for it.
+ *
+ * Returns 0 on success and -1 when the hook fails.
+ */
+static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, RegistrationConfirm *rcf)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	typeof(set_sig_addr_hook) set_sig_addr;
+	struct nf_conntrack_expect *exp;
+
+	pr_debug("nf_ct_ras: RCF\n");
+
+	set_sig_addr = rcu_dereference(set_sig_addr_hook);
+	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK) &&
+	    set_sig_addr(skb, ct, ctinfo, protoff, data,
+			 rcf->callSignalAddress.item,
+			 rcf->callSignalAddress.count) < 0)
+		return -1;
+
+	if (rcf->options & eRegistrationConfirm_timeToLive) {
+		pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
+		info->timeout = rcf->timeToLive;
+	}
+
+	if (info->timeout > 0) {
+		pr_debug("nf_ct_ras: set RAS connection timeout to %u seconds\n",
+			 info->timeout);
+		nf_ct_refresh(ct, skb, info->timeout * HZ);
+
+		/* Set expect timeout; lookup and update happen under the lock */
+		spin_lock_bh(&nf_conntrack_expect_lock);
+		exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
+				  info->sig_port[!dir]);
+		if (exp != NULL) {
+			pr_debug("nf_ct_ras: set Q.931 expect timeout to %u seconds for",
+				 info->timeout);
+			nf_ct_dump_tuple(&exp->tuple);
+			set_expect_timeout(exp, info->timeout);
+		}
+		spin_unlock_bh(&nf_conntrack_expect_lock);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a RAS UnregistrationRequest: let the set_sig_addr hook rewrite
+ * the call signal addresses when the IPv4 connection is NATed, remove all
+ * expectations of this connection, forget both saved signalling ports and
+ * refresh the connection with a 30 * HZ timeout.
+ *
+ * Returns 0 on success and -1 when the hook fails (nothing is cleared
+ * then).
+ */
+static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, UnregistrationRequest *urq)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	typeof(set_sig_addr_hook) set_sig_addr;
+
+	pr_debug("nf_ct_ras: URQ\n");
+
+	set_sig_addr = rcu_dereference(set_sig_addr_hook);
+	if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK) &&
+	    set_sig_addr(skb, ct, ctinfo, protoff, data,
+			 urq->callSignalAddress.item,
+			 urq->callSignalAddress.count) < 0)
+		return -1;
+
+	/* Clear old expect */
+	nf_ct_remove_expectations(ct);
+	info->sig_port[dir] = 0;
+	info->sig_port[!dir] = 0;
+
+	/* Give it 30 seconds for UCF or URJ */
+	nf_ct_refresh(ct, skb, 30 * HZ);
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle a RAS AdmissionRequest on a NATed IPv4 connection by rewriting
+ * one call signal address through the set_h225_addr hook.
+ *
+ * The destination address is rewritten when it equals this side's source
+ * address and saved signalling port ("answering" ARQ).  Failing that, the
+ * source address is rewritten when it equals this side's source address
+ * ("calling" ARQ).  Returns the hook's result, or 0 when neither case
+ * applies.
+ */
+static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, AdmissionRequest *arq)
+{
+	const struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	typeof(set_h225_addr_hook) set_h225_addr;
+	union nf_inet_addr addr;
+	__be16 port;
+
+	pr_debug("nf_ct_ras: ARQ\n");
+
+	set_h225_addr = rcu_dereference(set_h225_addr_hook);
+
+	if (arq->options & eAdmissionRequest_destCallSignalAddress) {
+		if (get_h225_addr(ct, *data, &arq->destCallSignalAddress,
+				  &addr, &port) &&
+		    memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3,
+			   sizeof(addr)) == 0 &&
+		    port == info->sig_port[dir] &&
+		    nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+		    set_h225_addr && (ct->status & IPS_NAT_MASK)) {
+			/* Answering ARQ */
+			return set_h225_addr(skb, protoff, data, 0,
+					     &arq->destCallSignalAddress,
+					     &ct->tuplehash[!dir].tuple.dst.u3,
+					     info->sig_port[!dir]);
+		}
+	}
+
+	if (arq->options & eAdmissionRequest_srcCallSignalAddress) {
+		if (get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
+				  &addr, &port) &&
+		    memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3,
+			   sizeof(addr)) == 0 &&
+		    set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+		    (ct->status & IPS_NAT_MASK)) {
+			/* Calling ARQ */
+			return set_h225_addr(skb, protoff, data, 0,
+					     &arq->srcCallSignalAddress,
+					     &ct->tuplehash[!dir].tuple.dst.u3,
+					     port);
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle an AdmissionConfirm (ACF).
+ *
+ * Nothing is done (return 0) when destCallSignalAddress cannot be decoded.
+ * If the decoded address equals this direction's destination address, the
+ * message is treated as an "answering" ACF: the set_sig_addr hook, when
+ * installed and the conntrack is IPv4 with an IPS_NAT_MASK bit set, is run
+ * on that single address and its result is returned.
+ *
+ * Otherwise a permanent TCP expectation towards the decoded address/port
+ * is registered with the Q.931 helper attached.
+ *
+ * Returns -1 when the expectation cannot be allocated or registered.
+ */
+static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, AdmissionConfirm *acf)
+{
+	typeof(set_sig_addr_hook) nat_sig_fn;
+	struct nf_conntrack_expect *q931_exp;
+	union nf_inet_addr taddr;
+	__be16 tport;
+	int d = CTINFO2DIR(ctinfo);
+	int rc = 0;
+
+	pr_debug("nf_ct_ras: ACF\n");
+
+	if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress,
+			   &taddr, &tport))
+		return 0;
+
+	if (memcmp(&taddr, &ct->tuplehash[d].tuple.dst.u3,
+		   sizeof(taddr)) == 0) {
+		/* Answering ACF */
+		nat_sig_fn = rcu_dereference(set_sig_addr_hook);
+		if (!nat_sig_fn || nf_ct_l3num(ct) != NFPROTO_IPV4 ||
+		    !(ct->status & IPS_NAT_MASK))
+			return 0;
+		return nat_sig_fn(skb, ct, ctinfo, protoff, data,
+				  &acf->destCallSignalAddress, 1);
+	}
+
+	/* Need new expect */
+	q931_exp = nf_ct_expect_alloc(ct);
+	if (q931_exp == NULL)
+		return -1;
+
+	nf_ct_expect_init(q931_exp, NF_CT_EXPECT_CLASS_DEFAULT,
+			  nf_ct_l3num(ct),
+			  &ct->tuplehash[!d].tuple.src.u3, &taddr,
+			  IPPROTO_TCP, NULL, &tport);
+	q931_exp->flags = NF_CT_EXPECT_PERMANENT;
+	q931_exp->helper = nf_conntrack_helper_q931;
+
+	if (nf_ct_expect_related(q931_exp) != 0) {
+		rc = -1;
+	} else {
+		pr_debug("nf_ct_ras: expect Q.931 ");
+		nf_ct_dump_tuple(&q931_exp->tuple);
+	}
+
+	/* Release the reference taken by nf_ct_expect_alloc() */
+	nf_ct_expect_put(q931_exp);
+
+	return rc;
+}
+
+/****************************************************************************/
+/*
+ * Handle a LocationRequest (LRQ).
+ *
+ * Runs the set_ras_addr hook on the single replyAddress when a hook is
+ * installed, the conntrack is IPv4 and an IPS_NAT_MASK status bit is set;
+ * the hook's return value is passed through.  Returns 0 otherwise.
+ */
+static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, LocationRequest *lrq)
+{
+	typeof(set_ras_addr_hook) nat_ras_fn;
+
+	pr_debug("nf_ct_ras: LRQ\n");
+
+	nat_ras_fn = rcu_dereference(set_ras_addr_hook);
+	if (!nat_ras_fn || nf_ct_l3num(ct) != NFPROTO_IPV4 ||
+	    !(ct->status & IPS_NAT_MASK))
+		return 0;
+
+	return nat_ras_fn(skb, ct, ctinfo, protoff, data,
+			  &lrq->replyAddress, 1);
+}
+
+/****************************************************************************/
+/*
+ * Handle a LocationConfirm (LCF).
+ *
+ * Nothing is done (return 0) when callSignalAddress cannot be decoded.
+ * Otherwise a permanent TCP expectation towards the decoded address/port
+ * is registered with the Q.931 helper attached.  The rasAddress carried
+ * by the message is deliberately ignored.
+ *
+ * Returns -1 when the expectation cannot be allocated or registered.
+ */
+static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, LocationConfirm *lcf)
+{
+	struct nf_conntrack_expect *q931_exp;
+	union nf_inet_addr taddr;
+	__be16 tport;
+	int d = CTINFO2DIR(ctinfo);
+	int rc = 0;
+
+	pr_debug("nf_ct_ras: LCF\n");
+
+	if (!get_h225_addr(ct, *data, &lcf->callSignalAddress,
+			   &taddr, &tport))
+		return 0;
+
+	/* Need new expect for call signal */
+	q931_exp = nf_ct_expect_alloc(ct);
+	if (q931_exp == NULL)
+		return -1;
+
+	nf_ct_expect_init(q931_exp, NF_CT_EXPECT_CLASS_DEFAULT,
+			  nf_ct_l3num(ct),
+			  &ct->tuplehash[!d].tuple.src.u3, &taddr,
+			  IPPROTO_TCP, NULL, &tport);
+	q931_exp->flags = NF_CT_EXPECT_PERMANENT;
+	q931_exp->helper = nf_conntrack_helper_q931;
+
+	if (nf_ct_expect_related(q931_exp) != 0) {
+		rc = -1;
+	} else {
+		pr_debug("nf_ct_ras: expect Q.931 ");
+		nf_ct_dump_tuple(&q931_exp->tuple);
+	}
+
+	/* Release the reference taken by nf_ct_expect_alloc() */
+	nf_ct_expect_put(q931_exp);
+
+	/* Ignore rasAddress */
+
+	return rc;
+}
+
+/****************************************************************************/
+/*
+ * Handle an InfoRequestResponse (IRR).
+ *
+ * For an IPv4 conntrack with an IPS_NAT_MASK status bit set, first the
+ * set_ras_addr hook (if installed) is run on the single rasAddress, then
+ * the set_sig_addr hook (if installed) is run on the callSignalAddress
+ * list.
+ *
+ * Returns 0 on success, -1 as soon as either hook returns a negative
+ * value.
+ */
+static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, InfoRequestResponse *irr)
+{
+	typeof(set_ras_addr_hook) nat_ras_fn;
+	typeof(set_sig_addr_hook) nat_sig_fn;
+
+	pr_debug("nf_ct_ras: IRR\n");
+
+	nat_ras_fn = rcu_dereference(set_ras_addr_hook);
+	if (nat_ras_fn && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK)) {
+		if (nat_ras_fn(skb, ct, ctinfo, protoff, data,
+			       &irr->rasAddress, 1) < 0)
+			return -1;
+	}
+
+	nat_sig_fn = rcu_dereference(set_sig_addr_hook);
+	if (nat_sig_fn && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+	    (ct->status & IPS_NAT_MASK)) {
+		if (nat_sig_fn(skb, ct, ctinfo, protoff, data,
+			       irr->callSignalAddress.item,
+			       irr->callSignalAddress.count) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Dispatch a decoded RAS message to the handler matching ras->choice.
+ *
+ * Returns the handler's return value.  Message types without a handler
+ * are only logged via pr_debug() and yield 0.
+ */
+static int process_ras(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned int protoff,
+		       unsigned char **data, RasMessage *ras)
+{
+	int rc = 0;
+
+	switch (ras->choice) {
+	case eRasMessage_gatekeeperRequest:
+		rc = process_grq(skb, ct, ctinfo, protoff, data,
+				 &ras->gatekeeperRequest);
+		break;
+	case eRasMessage_gatekeeperConfirm:
+		rc = process_gcf(skb, ct, ctinfo, protoff, data,
+				 &ras->gatekeeperConfirm);
+		break;
+	case eRasMessage_registrationRequest:
+		rc = process_rrq(skb, ct, ctinfo, protoff, data,
+				 &ras->registrationRequest);
+		break;
+	case eRasMessage_registrationConfirm:
+		rc = process_rcf(skb, ct, ctinfo, protoff, data,
+				 &ras->registrationConfirm);
+		break;
+	case eRasMessage_unregistrationRequest:
+		rc = process_urq(skb, ct, ctinfo, protoff, data,
+				 &ras->unregistrationRequest);
+		break;
+	case eRasMessage_admissionRequest:
+		rc = process_arq(skb, ct, ctinfo, protoff, data,
+				 &ras->admissionRequest);
+		break;
+	case eRasMessage_admissionConfirm:
+		rc = process_acf(skb, ct, ctinfo, protoff, data,
+				 &ras->admissionConfirm);
+		break;
+	case eRasMessage_locationRequest:
+		rc = process_lrq(skb, ct, ctinfo, protoff, data,
+				 &ras->locationRequest);
+		break;
+	case eRasMessage_locationConfirm:
+		rc = process_lcf(skb, ct, ctinfo, protoff, data,
+				 &ras->locationConfirm);
+		break;
+	case eRasMessage_infoRequestResponse:
+		rc = process_irr(skb, ct, ctinfo, protoff, data,
+				 &ras->infoRequestResponse);
+		break;
+	default:
+		pr_debug("nf_ct_ras: RAS message %d\n", ras->choice);
+		break;
+	}
+
+	return rc;
+}
+
+/****************************************************************************/
+/*
+ * Conntrack helper entry point for RAS packets.
+ *
+ * With nf_h323_lock held, the UDP payload is located, decoded into a
+ * function-static RasMessage and handed to process_ras().  Packets with
+ * no payload or with a decoding error are accepted untouched.
+ *
+ * Returns NF_ACCEPT, or NF_DROP (after logging through
+ * nf_ct_helper_log()) when process_ras() returns a negative value.
+ */
+static int ras_help(struct sk_buff *skb, unsigned int protoff,
+		    struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	/* Shared decode buffer; every access happens under nf_h323_lock */
+	static RasMessage ras;
+	unsigned char *payload;
+	int payload_len = 0;
+	int verdict = NF_ACCEPT;
+	int rc;
+
+	pr_debug("nf_ct_ras: skblen = %u\n", skb->len);
+
+	spin_lock_bh(&nf_h323_lock);
+
+	/* Get UDP data */
+	payload = get_udp_data(skb, protoff, &payload_len);
+	if (!payload)
+		goto out;
+	pr_debug("nf_ct_ras: RAS message len=%d ", payload_len);
+	nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+	/* Decode RAS message */
+	rc = DecodeRasMessage(payload, payload_len, &ras);
+	if (rc < 0) {
+		pr_debug("nf_ct_ras: decoding error: %s\n",
+			 rc == H323_ERROR_BOUND ?
+			 "out of bound" : "out of range");
+		goto out;
+	}
+
+	/* Process RAS message */
+	if (process_ras(skb, ct, ctinfo, protoff, &payload, &ras) < 0)
+		verdict = NF_DROP;
+
+out:
+	spin_unlock_bh(&nf_h323_lock);
+	if (verdict == NF_DROP)
+		nf_ct_helper_log(skb, ct, "cannot process RAS message");
+	return verdict;
+}
+
+/****************************************************************************/
+/*
+ * Expectation policy referenced by both RAS helper entries below.
+ * NOTE(review): .timeout is presumably in seconds - confirm against
+ * struct nf_conntrack_expect_policy.
+ */
+static const struct nf_conntrack_expect_policy ras_exp_policy = {
+ .max_expected = 32,
+ .timeout = 240,
+};
+
+/*
+ * RAS helper descriptors.  The two entries are identical except for
+ * .tuple.src.l3num: index 0 is AF_INET, index 1 is AF_INET6.  Both match
+ * UDP with RAS_PORT in the source part of the tuple, reserve
+ * sizeof(struct nf_ct_h323_master) bytes of helper data and dispatch to
+ * ras_help().
+ */
+static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
+ {
+ .name = "RAS",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .help = ras_help,
+ .expect_policy = &ras_exp_policy,
+ },
+ {
+ .name = "RAS",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_INET6,
+ .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .help = ras_help,
+ .expect_policy = &ras_exp_policy,
+ },
+};
+
+/****************************************************************************/
+/*
+ * Module exit: unregister the RAS, Q.931 and H.245 helpers in the reverse
+ * of the order nf_conntrack_h323_init() registers them, then free
+ * h323_buffer.
+ */
+static void __exit nf_conntrack_h323_fini(void)
+{
+	int idx;
+
+	/* Each helper array has two entries; tear down [1] before [0] */
+	for (idx = 1; idx >= 0; idx--)
+		nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[idx]);
+	for (idx = 1; idx >= 0; idx--)
+		nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[idx]);
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
+
+	kfree(h323_buffer);
+	pr_debug("nf_ct_h323: fini\n");
+}
+
+/****************************************************************************/
+/*
+ * Module init: allocate the 64 KiB h323_buffer and register the H.245,
+ * Q.931 (two entries) and RAS (two entries) helpers, in that order.
+ *
+ * Returns 0 on success, -ENOMEM when the buffer cannot be allocated, or
+ * the negative value returned by the failing registration.  On failure
+ * every helper registered so far is unregistered and the buffer is freed.
+ */
+static int __init nf_conntrack_h323_init(void)
+{
+	int err;
+
+	h323_buffer = kmalloc(65536, GFP_KERNEL);
+	if (h323_buffer == NULL)
+		return -ENOMEM;
+
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_h245);
+	if (err < 0)
+		goto free_buffer;
+
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
+	if (err < 0)
+		goto undo_h245;
+
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
+	if (err < 0)
+		goto undo_q931_0;
+
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
+	if (err < 0)
+		goto undo_q931_1;
+
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+	if (err < 0)
+		goto undo_ras_0;
+
+	pr_debug("nf_ct_h323: init success\n");
+	return 0;
+
+	/* Unwind in reverse registration order; labels fall through */
+undo_ras_0:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
+undo_q931_1:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
+undo_q931_0:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+undo_h245:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
+free_buffer:
+	kfree(h323_buffer);
+	return err;
+}
+
+/****************************************************************************/
+/* Module entry and exit points */
+module_init(nf_conntrack_h323_init);
+module_exit(nf_conntrack_h323_fini);
+
+/*
+ * GPL-only exports: the address decoder plus the hook pointers that the
+ * handlers in this file read with rcu_dereference() before rewriting
+ * addresses.  NOTE(review): the hooks are presumably assigned by the
+ * H.323 NAT module - confirm against the NAT side.
+ */
+EXPORT_SYMBOL_GPL(get_h225_addr);
+EXPORT_SYMBOL_GPL(set_h245_addr_hook);
+EXPORT_SYMBOL_GPL(set_h225_addr_hook);
+EXPORT_SYMBOL_GPL(set_sig_addr_hook);
+EXPORT_SYMBOL_GPL(set_ras_addr_hook);
+EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
+EXPORT_SYMBOL_GPL(nat_t120_hook);
+EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
+EXPORT_SYMBOL_GPL(nat_q931_hook);
+
+/* Module metadata and the aliases under which the module can be loaded */
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_h323");
+MODULE_ALIAS_NFCT_HELPER("RAS");
+MODULE_ALIAS_NFCT_HELPER("Q.931");
+MODULE_ALIAS_NFCT_HELPER("H.245");
diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c
new file mode 100644
index 00000000000..d880f3523c1
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_types.c
@@ -0,0 +1,1922 @@
+/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ */
+
+/*
+ * Field table for the ipAddress SEQUENCE of TransportAddress.  "ip" is
+ * marked DECODE and carries the offset of TransportAddress_ipAddress.ip;
+ * "port" is marked SKIP.
+ */
+static const struct field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */
+ {FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE,
+ offsetof(TransportAddress_ipAddress, ip), NULL},
+ {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+/* Element table for ipSourceRoute.route: one fixed OCTSTR item, SKIP. */
+static const struct field_t _TransportAddress_ipSourceRoute_route[] = { /* SEQUENCE OF */
+ {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+/* Alternatives of ipSourceRoute.routing: strict or loose, both NUL/SKIP. */
+static const struct field_t _TransportAddress_ipSourceRoute_routing[] = { /* CHOICE */
+ {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for the ipSourceRoute SEQUENCE of TransportAddress.  Every
+ * field is marked SKIP; "route" and "routing" point at their sub-tables.
+ */
+static const struct field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */
+ {FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _TransportAddress_ipSourceRoute_route},
+ {FNAME("routing") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _TransportAddress_ipSourceRoute_routing},
+};
+
+/* Field table for the ipxAddress SEQUENCE: three fixed OCTSTRs, all SKIP. */
+static const struct field_t _TransportAddress_ipxAddress[] = { /* SEQUENCE */
+ {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+ {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for the ip6Address SEQUENCE of TransportAddress.  "ip" is
+ * marked DECODE and carries the offset of TransportAddress_ip6Address.ip;
+ * "port" is marked SKIP.
+ */
+static const struct field_t _TransportAddress_ip6Address[] = { /* SEQUENCE */
+ {FNAME("ip") OCTSTR, FIXD, 16, 0, DECODE,
+ offsetof(TransportAddress_ip6Address, ip), NULL},
+ {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+/* Field table for H221NonStandard: three integer fields, all SKIP. */
+static const struct field_t _H221NonStandard[] = { /* SEQUENCE */
+ {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Alternatives of NonStandardIdentifier: an OID or an h221NonStandard
+ * SEQUENCE described by _H221NonStandard.  Both are SKIP.
+ */
+static const struct field_t _NonStandardIdentifier[] = { /* CHOICE */
+ {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0,
+ _H221NonStandard},
+};
+
+/*
+ * Field table for NonStandardParameter: an identifier CHOICE followed by
+ * an OCTSTR "data" field, both SKIP.  Referenced by many tables in this
+ * file as the layout of their nonStandardData/nonStandardAddress field.
+ */
+static const struct field_t _NonStandardParameter[] = { /* SEQUENCE */
+ {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _NonStandardIdentifier},
+ {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Alternatives of the TransportAddress CHOICE.  Only ipAddress and
+ * ip6Address are marked DECODE (stored at their offsets inside
+ * TransportAddress); every other alternative is marked SKIP.
+ */
+static const struct field_t _TransportAddress[] = { /* CHOICE */
+ {FNAME("ipAddress") SEQ, 0, 2, 2, DECODE,
+ offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress},
+ {FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0,
+ _TransportAddress_ipSourceRoute},
+ {FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0,
+ _TransportAddress_ipxAddress},
+ {FNAME("ip6Address") SEQ, 0, 2, 2, DECODE | EXT,
+ offsetof(TransportAddress, ip6Address),
+ _TransportAddress_ip6Address},
+ {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0,
+ _NonStandardParameter},
+};
+
+/*
+ * Alternatives of the AliasAddress CHOICE.  All are SKIP; the nested
+ * CHOICE alternatives have no sub-table (NULL).
+ */
+static const struct field_t _AliasAddress[] = { /* CHOICE */
+ {FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("transportID") CHOICE, 3, 7, 7, SKIP | EXT, 0, NULL},
+ {FNAME("email-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("partyNumber") CHOICE, 3, 5, 5, SKIP | EXT, 0, NULL},
+ {FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+/* Element table for Setup-UUIE.sourceAddress: AliasAddress items, SKIP. */
+static const struct field_t _Setup_UUIE_sourceAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+/*
+ * Field table for VendorIdentifier: "vendor" (layout _H221NonStandard)
+ * plus optional productId/versionId octet strings.  All SKIP.
+ */
+static const struct field_t _VendorIdentifier[] = { /* SEQUENCE */
+ {FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard},
+ {FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+/* Field table for GatekeeperInfo: optional nonStandardData only, SKIP. */
+static const struct field_t _GatekeeperInfo[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+};
+
+/*
+ * Field table for H310Caps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _H310Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H320Caps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _H320Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H321Caps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _H321Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H322Caps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _H322Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H323Caps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _H323Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H324Caps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _H324Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for VoiceCaps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _VoiceCaps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for T120OnlyCaps: optional nonStandardData, optional
+ * dataRatesSupported and supportedPrefixes.  All SKIP; the two SEQOF
+ * fields have no element table (NULL).
+ */
+static const struct field_t _T120OnlyCaps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Alternatives of the SupportedProtocols CHOICE.  All are SKIP; the
+ * h310..t120-only alternatives point at their *Caps tables, while
+ * nonStandardProtocol and t38FaxAnnexbOnly have no sub-table (NULL).
+ */
+static const struct field_t _SupportedProtocols[] = { /* CHOICE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0,
+ _NonStandardParameter},
+ {FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps},
+ {FNAME("h320") SEQ, 1, 1, 3, SKIP | EXT, 0, _H320Caps},
+ {FNAME("h321") SEQ, 1, 1, 3, SKIP | EXT, 0, _H321Caps},
+ {FNAME("h322") SEQ, 1, 1, 3, SKIP | EXT, 0, _H322Caps},
+ {FNAME("h323") SEQ, 1, 1, 3, SKIP | EXT, 0, _H323Caps},
+ {FNAME("h324") SEQ, 1, 1, 3, SKIP | EXT, 0, _H324Caps},
+ {FNAME("voice") SEQ, 1, 1, 3, SKIP | EXT, 0, _VoiceCaps},
+ {FNAME("t120-only") SEQ, 1, 1, 3, SKIP | EXT, 0, _T120OnlyCaps},
+ {FNAME("nonStandardProtocol") SEQ, 2, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL},
+};
+
+/* Element table for GatewayInfo.protocol: SupportedProtocols items, SKIP. */
+static const struct field_t _GatewayInfo_protocol[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols},
+};
+
+/*
+ * Field table for GatewayInfo: optional protocol list and optional
+ * nonStandardData, both SKIP.
+ */
+static const struct field_t _GatewayInfo[] = { /* SEQUENCE */
+ {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _GatewayInfo_protocol},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+};
+
+/*
+ * Field table for McuInfo: optional nonStandardData and an optional
+ * protocol list without an element table (NULL).  Both SKIP.
+ */
+static const struct field_t _McuInfo[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+/* Field table for TerminalInfo: optional nonStandardData only, SKIP. */
+static const struct field_t _TerminalInfo[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+};
+
+/*
+ * Field table for the EndpointType SEQUENCE.  Every field is SKIP; the
+ * optional sub-sequences point at their own tables, and
+ * supportedTunnelledProtocols has no element table (NULL).
+ */
+static const struct field_t _EndpointType[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0,
+ _VendorIdentifier},
+ {FNAME("gatekeeper") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0,
+ _GatekeeperInfo},
+ {FNAME("gateway") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, _GatewayInfo},
+ {FNAME("mcu") SEQ, 1, 1, 2, SKIP | EXT | OPT, 0, _McuInfo},
+ {FNAME("terminal") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, _TerminalInfo},
+ {FNAME("mc") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("undefinedNode") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("set") BITSTR, FIXD, 32, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedTunnelledProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT,
+ 0, NULL},
+};
+
+/* Element table for Setup-UUIE.destinationAddress: AliasAddress items, SKIP. */
+static const struct field_t _Setup_UUIE_destinationAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+/* Element table for Setup-UUIE.destExtraCallInfo: AliasAddress items, SKIP. */
+static const struct field_t _Setup_UUIE_destExtraCallInfo[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+/* Element table for Setup-UUIE.destExtraCRV: WORD integer items, SKIP. */
+static const struct field_t _Setup_UUIE_destExtraCRV[] = { /* SEQUENCE OF */
+ {FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+/* Alternatives of Setup-UUIE.conferenceGoal: five NUL markers, all SKIP. */
+static const struct field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */
+ {FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("capability-negotiation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callIndependentSupplementaryService") NUL, FIXD, 0, 0, SKIP,
+ 0, NULL},
+};
+
+/* Field table for Q954Details: two BOOL flags, both SKIP. */
+static const struct field_t _Q954Details[] = { /* SEQUENCE */
+ {FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for QseriesOptions: seven BOOL flags followed by q954Info
+ * (layout _Q954Details).  All SKIP.
+ */
+static const struct field_t _QseriesOptions[] = { /* SEQUENCE */
+ {FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q953Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q955Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q956Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q957Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details},
+};
+
+/* Alternatives of the CallType CHOICE: four NUL markers, all SKIP. */
+static const struct field_t _CallType[] = { /* CHOICE */
+ {FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * H.245 variant of the h221NonStandard field table: three integer fields,
+ * all SKIP.  The rows are identical to _H221NonStandard.
+ */
+static const struct field_t _H245_NonStandardIdentifier_h221NonStandard[] = { /* SEQUENCE */
+ {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Alternatives of the H.245 NonStandardIdentifier CHOICE: an OID or an
+ * h221NonStandard SEQUENCE.  Both SKIP; unlike _NonStandardIdentifier the
+ * SEQ row does not carry the EXT attribute.
+ */
+static const struct field_t _H245_NonStandardIdentifier[] = { /* CHOICE */
+ {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0,
+ _H245_NonStandardIdentifier_h221NonStandard},
+};
+
+/*
+ * Field table for the H.245 NonStandardParameter: identifier CHOICE plus
+ * an OCTSTR "data" field, both SKIP.  Used by the H.245 capability tables
+ * for their "nonStandard" alternative.
+ */
+static const struct field_t _H245_NonStandardParameter[] = { /* SEQUENCE */
+ {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0,
+ _H245_NonStandardIdentifier},
+ {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+/* Field table for H261VideoCapability: every field is SKIP. */
+static const struct field_t _H261VideoCapability[] = { /* SEQUENCE */
+ {FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+ NULL},
+ {FNAME("maxBitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("stillImageTransmission") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H262VideoCapability: eleven profileAndLevel BOOL flags,
+ * six optional integer fields and videoBadMBsCap.  Every field is SKIP.
+ */
+static const struct field_t _H262VideoCapability[] = { /* SEQUENCE */
+ {FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-SNRatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-SNRatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-SpatialatH-14") BOOL, FIXD, 0, 0, SKIP, 0,
+ NULL},
+ {FNAME("profileAndLevel-HPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-HPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-HPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("framesPerSecond") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for H263VideoCapability.  Every field is SKIP; the two
+ * trailing optional SEQ fields have no sub-table (NULL).
+ */
+static const struct field_t _H263VideoCapability[] = { /* SEQUENCE */
+ {FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cif4MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cif16MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("maxBitRate") INT, CONS, 1, 0, SKIP, 0, NULL},
+ {FNAME("unrestrictedVector") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("arithmeticCoding") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("advancedPrediction") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("pbFrames") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+ NULL},
+ {FNAME("hrd-B") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("bppMaxKb") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowSqcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowQcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowCifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowCif4MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowCif16MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("errorCompensation") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("enhancementLayerInfo") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL},
+};
+
+/* Field table for IS11172VideoCapability: every field is SKIP. */
+static const struct field_t _IS11172VideoCapability[] = { /* SEQUENCE */
+ {FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("pictureRate") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Alternatives of the VideoCapability CHOICE.  All are SKIP; each points
+ * at its codec-specific table except genericVideoCapability (NULL).
+ */
+static const struct field_t _VideoCapability[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0,
+ _H261VideoCapability},
+ {FNAME("h262VideoCapability") SEQ, 6, 17, 18, SKIP | EXT, 0,
+ _H262VideoCapability},
+ {FNAME("h263VideoCapability") SEQ, 7, 13, 21, SKIP | EXT, 0,
+ _H263VideoCapability},
+ {FNAME("is11172VideoCapability") SEQ, 6, 7, 8, SKIP | EXT, 0,
+ _IS11172VideoCapability},
+ {FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+/* Field table for AudioCapability.g7231: two fields, both SKIP. */
+static const struct field_t _AudioCapability_g7231[] = { /* SEQUENCE */
+ {FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for IS11172AudioCapability: eight BOOL flags followed by
+ * bitRate.  Every field is SKIP.
+ */
+static const struct field_t _IS11172AudioCapability[] = { /* SEQUENCE */
+ {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Field table for IS13818AudioCapability: twenty BOOL flags followed by
+ * bitRate.  Every field is SKIP.
+ */
+static const struct field_t _IS13818AudioCapability[] = { /* SEQUENCE */
+ {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling16k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling22k05") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling24k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("threeChannels2-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("threeChannels3-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fourChannels2-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fourChannels2-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fourChannels3-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fiveChannels3-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fiveChannels3-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("lowFrequencyEnhancement") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("multilingual") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+/*
+ * Alternatives of the AudioCapability CHOICE.  All are SKIP.  Only
+ * nonStandard, g7231, is11172AudioCapability and is13818AudioCapability
+ * reference a sub-table; the remaining SEQ alternatives use NULL.
+ */
+static const struct field_t _AudioCapability[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g711Alaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g711Ulaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g711Ulaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g722-64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g722-56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g722-48k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g7231") SEQ, 0, 2, 2, SKIP, 0, _AudioCapability_g7231},
+ {FNAME("g728") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g729") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g729AnnexA") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("is11172AudioCapability") SEQ, 0, 9, 9, SKIP | EXT, 0,
+ _IS11172AudioCapability},
+ {FNAME("is13818AudioCapability") SEQ, 0, 21, 21, SKIP | EXT, 0,
+ _IS13818AudioCapability},
+ {FNAME("g729wAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g729AnnexAwAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g7231AnnexCCapability") SEQ, 1, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("gsmFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("gsmHalfRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("gsmEnhancedFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("genericAudioCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+ {FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL},
+};
+
+/*
+ * Alternatives of the DataProtocolCapability CHOICE.  All are SKIP; apart
+ * from nonStandard and v76wCompression every alternative is a NUL marker.
+ */
+static const struct field_t _DataProtocolCapability[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("v42lapm") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdlcFrameTunnelling") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h310SeparateVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h310SingleVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("transparent") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("segmentationAndReassembly") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdlcFrameTunnelingwSAR") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("v120") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("separateLANStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("v76wCompression") CHOICE, 2, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("tcp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+/* Field table for T84Profile.t84Restricted: nineteen BOOL flags, all SKIP. */
+static const struct field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */
+ {FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("ccir601Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdtvSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdtvProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g3FacsMH200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g3FacsMH200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g4FacsMMR200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g4FacsMMR200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig200x200Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig200x200Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig300x300Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig300x300Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoLow") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoMedSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoMedProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoHighSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T84Profile[] = { /* CHOICE: t84Unrestricted (NUL) or t84Restricted (sub-table _T84Profile_t84Restricted); both flagged SKIP */
+ {FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0,
+ _T84Profile_t84Restricted},
+};
+
+static const struct field_t _DataApplicationCapability_application_t84[] = { /* SEQUENCE: t84Protocol then t84Profile, both flagged SKIP */
+ {FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile},
+};
+
+static const struct field_t _DataApplicationCapability_application_nlpid[] = { /* SEQUENCE: nlpidProtocol then nlpidData, both flagged SKIP */
+ {FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _DataApplicationCapability_application[] = { /* CHOICE: only t120 is flagged DECODE (offsetof into DataApplicationCapability_application); every other alternative is SKIP */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT,
+ offsetof(DataApplicationCapability_application, t120),
+ _DataProtocolCapability},
+ {FNAME("dsm-cc") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("userData") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("t84") SEQ, 0, 2, 2, SKIP, 0,
+ _DataApplicationCapability_application_t84},
+ {FNAME("t434") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("h224") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("nlpid") SEQ, 0, 2, 2, SKIP, 0,
+ _DataApplicationCapability_application_nlpid},
+ {FNAME("dsvdControl") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h222DataPartitioning") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("t30fax") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL}, /* the last four alternatives carry no sub-table */
+ {FNAME("t140") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+ {FNAME("t38fax") SEQ, 0, 2, 2, SKIP, 0, NULL},
+ {FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _DataApplicationCapability[] = { /* SEQUENCE: application is flagged DECODE (offsetof into DataApplicationCapability); maxBitRate is SKIP */
+ {FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT,
+ offsetof(DataApplicationCapability, application),
+ _DataApplicationCapability_application},
+ {FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _EncryptionMode[] = { /* CHOICE: nonStandard or h233Encryption, both flagged SKIP */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _DataType[] = { /* CHOICE: only "data" is flagged DECODE (offsetof(DataType, data)); the other eight alternatives are SKIP */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoData") CHOICE, 3, 5, 6, SKIP | EXT, 0, _VideoCapability},
+ {FNAME("audioData") CHOICE, 4, 14, 22, SKIP | EXT, 0,
+ _AudioCapability},
+ {FNAME("data") SEQ, 0, 2, 2, DECODE | EXT, offsetof(DataType, data),
+ _DataApplicationCapability},
+ {FNAME("encryptionData") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _EncryptionMode},
+ {FNAME("h235Control") SEQ, 0, 2, 2, SKIP, 0, NULL},
+ {FNAME("h235Media") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+ {FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _H222LogicalChannelParameters[] = { /* SEQUENCE: five SKIP fields; the last three are also flagged OPT */
+ {FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("programDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = { /* SEQUENCE: two INT fields, both flagged SKIP */
+ {FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL},
+ {FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE: nine alternatives, all flagged SKIP; al1M/al2M/al3M carry no sub-table */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al1NotFramed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al2WithoutSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al2WithSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al3") SEQ, 0, 2, 2, SKIP, 0,
+ _H223LogicalChannelParameters_adaptationLayerType_al3},
+ {FNAME("al1M") SEQ, 0, 7, 8, SKIP | EXT, 0, NULL},
+ {FNAME("al2M") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+ {FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters[] = { /* SEQUENCE: adaptationLayerType then segmentableFlag, both flagged SKIP */
+ {FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0,
+ _H223LogicalChannelParameters_adaptationLayerType},
+ {FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CRCLength[] = { /* CHOICE: three NUL alternatives, all flagged SKIP */
+ {FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76HDLCParameters[] = { /* SEQUENCE: three fields, all flagged SKIP; crcLength uses _CRCLength */
+ {FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength},
+ {FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_suspendResume[] = { /* CHOICE: three NUL alternatives, all flagged SKIP */
+ {FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = { /* CHOICE: three NUL alternatives, all flagged SKIP */
+ {FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode_eRM[] = { /* SEQUENCE: windowSize then recovery, both flagged SKIP */
+ {FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_mode_eRM_recovery},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode[] = { /* CHOICE: eRM (with sub-table) or uNERM (NUL); both flagged SKIP */
+ {FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_mode_eRM},
+ {FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V75Parameters[] = { /* SEQUENCE: a single BOOL field flagged SKIP */
+ {FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters[] = { /* SEQUENCE: five fields, all flagged SKIP; four of them reference a sub-table */
+ {FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0,
+ _V76HDLCParameters},
+ {FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_suspendResume},
+ {FNAME("uIH") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("mode") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_mode},
+ {FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters},
+};
+
+static const struct field_t _H2250LogicalChannelParameters_nonStandard[] = { /* SEQUENCE OF: element described by _H245_NonStandardParameter, flagged SKIP */
+ {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static const struct field_t _UnicastAddress_iPAddress[] = { /* SEQUENCE: network is flagged DECODE (offsetof into UnicastAddress_iPAddress); tsapIdentifier is SKIP */
+ {FNAME("network") OCTSTR, FIXD, 4, 0, DECODE,
+ offsetof(UnicastAddress_iPAddress, network), NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPXAddress[] = { /* SEQUENCE: three FIXD OCTSTR fields, all flagged SKIP */
+ {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+ {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iP6Address[] = { /* SEQUENCE: network is flagged DECODE (offsetof into UnicastAddress_iP6Address); tsapIdentifier is SKIP */
+ {FNAME("network") OCTSTR, FIXD, 16, 0, DECODE,
+ offsetof(UnicastAddress_iP6Address, network), NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress_routing[] = { /* CHOICE: strict or loose, both NUL and flagged SKIP */
+ {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress_route[] = { /* SEQUENCE OF: FIXD OCTSTR element, flagged SKIP */
+ {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE: four fields, all flagged SKIP (network is not decoded here, unlike _UnicastAddress_iPAddress) */
+ {FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0,
+ _UnicastAddress_iPSourceRouteAddress_routing},
+ {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _UnicastAddress_iPSourceRouteAddress_route},
+};
+
+static const struct field_t _UnicastAddress[] = { /* CHOICE: iPAddress and iP6Address are flagged DECODE (offsetof into UnicastAddress); the other five alternatives are SKIP */
+ {FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT,
+ offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress},
+ {FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0,
+ _UnicastAddress_iPXAddress},
+ {FNAME("iP6Address") SEQ, 0, 2, 2, DECODE | EXT,
+ offsetof(UnicastAddress, iP6Address), _UnicastAddress_iP6Address},
+ {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0,
+ _UnicastAddress_iPSourceRouteAddress},
+ {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress_iPAddress[] = { /* SEQUENCE: network and tsapIdentifier, both flagged SKIP */
+ {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress_iP6Address[] = { /* SEQUENCE: network and tsapIdentifier, both flagged SKIP */
+ {FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress[] = { /* CHOICE: four alternatives, all flagged SKIP (no multicast alternative is decoded) */
+ {FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _MulticastAddress_iPAddress},
+ {FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _MulticastAddress_iP6Address},
+ {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_TransportAddress[] = { /* CHOICE: unicastAddress is flagged DECODE (offsetof into H245_TransportAddress); multicastAddress is SKIP */
+ {FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT,
+ offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress},
+ {FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0,
+ _MulticastAddress},
+};
+
+static const struct field_t _H2250LogicalChannelParameters[] = { /* SEQUENCE: mediaChannel and mediaControlChannel are flagged DECODE (offsetof into H2250LogicalChannelParameters) */
+ {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _H2250LogicalChannelParameters_nonStandard},
+ {FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("associatedSessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelParameters, mediaChannel),
+ _H245_TransportAddress},
+ {FNAME("mediaGuaranteedDelivery") BOOL, FIXD, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelParameters, mediaControlChannel),
+ _H245_TransportAddress},
+ {FNAME("mediaControlGuaranteedDelivery") BOOL, FIXD, 0, 0, STOP | OPT, /* from here through redundancyEncoding the entries are flagged STOP rather than SKIP */
+ 0, NULL},
+ {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destination") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, STOP | OPT, 0, NULL},
+ {FNAME("mediaPacketization") CHOICE, 0, 1, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("transportCapability") SEQ, 3, 3, 3, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("redundancyEncoding") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = { /* CHOICE: only h2250LogicalChannelParameters is flagged DECODE; the other four alternatives are SKIP */
+ {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+ _H222LogicalChannelParameters},
+ {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _H223LogicalChannelParameters},
+ {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+ _V76LogicalChannelParameters},
+ {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+ offsetof
+ (OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters,
+ h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+ {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQUENCE: dataType and multiplexParameters are flagged DECODE; the three INT fields are SKIP | OPT */
+ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT,
+ offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+ dataType), _DataType},
+ {FNAME("multiplexParameters") CHOICE, 2, 3, 5, DECODE | EXT,
+ offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+ multiplexParameters),
+ _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters},
+ {FNAME("forwardLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+ 0, NULL},
+ {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE: only h2250LogicalChannelParameters is flagged DECODE; the h223 and v76 alternatives are SKIP */
+ {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _H223LogicalChannelParameters},
+ {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+ _V76LogicalChannelParameters},
+ {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+ offsetof
+ (OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters,
+ h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQUENCE: only multiplexParameters is flagged DECODE (and OPT); dataType is SKIP here, unlike in the forward table */
+ {FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType},
+ {FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannel_reverseLogicalChannelParameters,
+ multiplexParameters),
+ _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters},
+ {FNAME("reverseLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+ 0, NULL},
+ {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _NetworkAccessParameters_distribution[] = { /* CHOICE: unicast or multicast, both NUL and flagged SKIP */
+ {FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Q2931Address_address[] = { /* CHOICE: internationalNumber (NUMSTR) or nsapAddress (OCTSTR), both flagged SKIP */
+ {FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL},
+ {FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Q2931Address[] = { /* SEQUENCE: address then an OPT subaddress, both flagged SKIP */
+ {FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _Q2931Address_address},
+ {FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE: only localAreaAddress is flagged DECODE (via _H245_TransportAddress); the other two alternatives are SKIP */
+ {FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address},
+ {FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT,
+ offsetof(NetworkAccessParameters_networkAddress, localAreaAddress),
+ _H245_TransportAddress},
+};
+
+static const struct field_t _NetworkAccessParameters[] = { /* SEQUENCE: only networkAddress is flagged DECODE (offsetof into NetworkAccessParameters); the other four fields are SKIP */
+ {FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0,
+ _NetworkAccessParameters_distribution},
+ {FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT,
+ offsetof(NetworkAccessParameters, networkAddress),
+ _NetworkAccessParameters_networkAddress},
+ {FNAME("associateConference") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("externalReference") OCTSTR, 8, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("t120SetupProcedure") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+};
+
+static const struct field_t _OpenLogicalChannel[] = { /* SEQUENCE: forward/reverse channel parameters and separateStack are flagged DECODE; the channel number is SKIP and encryptionSync is STOP */
+ {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT,
+ offsetof(OpenLogicalChannel, forwardLogicalChannelParameters),
+ _OpenLogicalChannel_forwardLogicalChannelParameters},
+ {FNAME("reverseLogicalChannelParameters") SEQ, 1, 2, 4,
+ DECODE | EXT | OPT, offsetof(OpenLogicalChannel,
+ reverseLogicalChannelParameters),
+ _OpenLogicalChannel_reverseLogicalChannelParameters},
+ {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannel, separateStack),
+ _NetworkAccessParameters},
+ {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_fastStart[] = { /* SEQUENCE OF: element is an _OpenLogicalChannel flagged DECODE | OPEN | EXT; the offset column holds sizeof(OpenLogicalChannel) rather than an offsetof() */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Setup_UUIE[] = { /* SEQUENCE: h245Address, destCallSignalAddress, sourceCallSignalAddress and fastStart are flagged DECODE; endpointIdentifier is STOP; every other field is SKIP */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Setup_UUIE, h245Address), _TransportAddress},
+ {FNAME("sourceAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_sourceAddress},
+ {FNAME("sourceInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+ {FNAME("destinationAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_destinationAddress},
+ {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Setup_UUIE, destCallSignalAddress), _TransportAddress},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_destExtraCallInfo},
+ {FNAME("destExtraCRV") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_destExtraCRV},
+ {FNAME("activeMC") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("conferenceGoal") CHOICE, 2, 3, 5, SKIP | EXT, 0,
+ _Setup_UUIE_conferenceGoal},
+ {FNAME("callServices") SEQ, 0, 8, 8, SKIP | EXT | OPT, 0,
+ _QseriesOptions},
+ {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+ {FNAME("sourceCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Setup_UUIE, sourceCallSignalAddress), _TransportAddress},
+ {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityCapability") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Setup_UUIE, fastStart), _Setup_UUIE_fastStart},
+ {FNAME("mediaWaitForConnect") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("canOverlapSend") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, /* the only STOP entry in this table; the entries after it are SKIP again */
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("connectionParameters") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("symmetricOperationRequired") NUL, FIXD, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("neededFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("desiredFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("parallelH245Control") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("additionalSourceAddresses") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ NULL},
+};
+
+static const struct field_t _CallProceeding_UUIE_fastStart[] = { /* SEQUENCE OF: element is an _OpenLogicalChannel flagged DECODE | OPEN | EXT; the offset column holds sizeof(OpenLogicalChannel) rather than an offsetof() */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _CallProceeding_UUIE[] = { /* SEQUENCE: h245Address and fastStart are flagged DECODE (offsetof into CallProceeding_UUIE); every other field is SKIP */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(CallProceeding_UUIE, h245Address), _TransportAddress},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(CallProceeding_UUIE, fastStart),
+ _CallProceeding_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Connect_UUIE_fastStart[] = { /* SEQUENCE OF: element is an _OpenLogicalChannel flagged DECODE | OPEN | EXT; the offset column holds sizeof(OpenLogicalChannel) rather than an offsetof() */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Connect_UUIE[] = { /* SEQUENCE: h245Address and fastStart are flagged DECODE (offsetof into Connect_UUIE); every other field is SKIP */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Connect_UUIE, h245Address), _TransportAddress},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Connect_UUIE, fastStart), _Connect_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("connectedAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Alerting_UUIE_fastStart[] = { /* SEQUENCE OF: element is an _OpenLogicalChannel flagged DECODE | OPEN | EXT; the offset column holds sizeof(OpenLogicalChannel) rather than an offsetof() */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Alerting_UUIE[] = { /* SEQUENCE: h245Address and fastStart are flagged DECODE (offsetof into Alerting_UUIE); every other field is SKIP */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Alerting_UUIE, h245Address), _TransportAddress},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Alerting_UUIE, fastStart), _Alerting_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("alertingAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Information_UUIE[] = { /* SEQUENCE: seven fields, all flagged SKIP; fastStart has no sub-table here, unlike the other *_UUIE tables */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, SKIP | OPT, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _ReleaseCompleteReason[] = { /* CHOICE: 22 alternatives, all flagged SKIP with no sub-table */
+ {FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationRejection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("invalidRevision") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("noPermission") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("unreachableGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("gatewayResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("badFormatAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("adaptiveBusy") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("inConf") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("facilityCallDeflection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("securityDenied") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("calledPartyNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callerNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("newConnectionNeeded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardReason") SEQ, 0, 2, 2, SKIP, 0, NULL},
+ {FNAME("replaceWithConferenceInvite") OCTSTR, FIXD, 16, 0, SKIP, 0,
+ NULL},
+ {FNAME("genericDataReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("neededFeatureNotSupported") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE: eleven fields, all flagged SKIP; only "reason" references a sub-table */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0,
+ _ReleaseCompleteReason},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("busyAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Facility_UUIE_alternativeAliasAddress[] = { /* SEQUENCE OF: element described by _AliasAddress, flagged SKIP */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _FacilityReason[] = { /* CHOICE: eleven NUL alternatives, all flagged SKIP */
+ {FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("conferenceListChoice") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("startH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("noH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("newTokens") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("featureSetUpdate") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("forwardedElements") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF: element is an _OpenLogicalChannel flagged DECODE | OPEN | EXT; the offset column holds sizeof(OpenLogicalChannel) rather than an offsetof() */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Facility_UUIE[] = { /* SEQUENCE: alternativeAddress, reason, h245Address and fastStart are flagged DECODE (offsetof into Facility_UUIE); every other field is SKIP */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
+ {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Facility_UUIE_alternativeAliasAddress},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+ {FNAME("reason") CHOICE, 2, 4, 11, DECODE | EXT,
+ offsetof(Facility_UUIE, reason), _FacilityReason},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("conferences") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Facility_UUIE, h245Address), _TransportAddress},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Facility_UUIE, fastStart), _Facility_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+};
+
+static const struct field_t _CallIdentifier[] = { /* SEQUENCE: a single FIXD OCTSTR "guid" field flagged SKIP */
+ {FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SecurityServiceMode[] = { /* CHOICE: nonStandard, none or default; all flagged SKIP */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+ {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SecurityCapabilities[] = { /* SEQUENCE: four fields, all flagged SKIP; the last three share _SecurityServiceMode */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _SecurityServiceMode},
+ {FNAME("authenticaton") CHOICE, 2, 3, 3, SKIP | EXT, 0, /* NOTE(review): spelled "authenticaton"; presumably mirrors the ASN.1 field name - confirm before changing */
+ _SecurityServiceMode},
+ {FNAME("integrity") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _SecurityServiceMode},
+};
+
+static const struct field_t _H245Security[] = { /* CHOICE: four alternatives, all flagged SKIP; tls and ipsec both use _SecurityCapabilities */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+ {FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+ {FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+};
+
+static const struct field_t _DHset[] = { /* SEQUENCE: three BITSTR fields, all flagged SKIP */
+ {FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TypedCertificate[] = { /* SEQUENCE: type (OID) then certificate (OCTSTR), both flagged SKIP */
+ {FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H235_NonStandardParameter[] = { /* SEQUENCE: nonStandardIdentifier (OID) then data (OCTSTR), both flagged SKIP */
+ {FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _ClearToken[] = { /* SEQUENCE: 11 entries, all flagged SKIP; every entry after tokenOID also carries OPT; child tables _DHset, _TypedCertificate, _H235_NonStandardParameter */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("timeStamp") INT, CONS, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("password") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("dhkey") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, _DHset},
+ {FNAME("challenge") OCTSTR, 7, 8, 0, SKIP | OPT, 0, NULL},
+ {FNAME("random") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("certificate") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0,
+ _TypedCertificate},
+ {FNAME("generalID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _H235_NonStandardParameter},
+ {FNAME("eckasdhkey") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _Progress_UUIE_tokens[] = { /* SEQUENCE OF: one "item" entry with child table _ClearToken, flagged SKIP | EXT */
+ {FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+};
+
+static const struct field_t _Params[] = { /* SEQUENCE: ranInt (INT), iv8 and iv16 (fixed OCTSTR of 8 and 16); all flagged SKIP | OPT */
+ {FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL},
+ {FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdHash_token[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), hash; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdHash[] = { /* SEQUENCE: alias (child table _AliasAddress), timeStamp, token (child table _CryptoH323Token_cryptoEPPwdHash_token); all flagged SKIP */
+ {FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+ {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoEPPwdHash_token},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdHash_token[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), hash; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdHash[] = { /* SEQUENCE: gatekeeperId, timeStamp, token (child table _CryptoH323Token_cryptoGKPwdHash_token); all flagged SKIP */
+ {FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoGKPwdHash_token},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdEncr[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), encryptedData; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdEncr[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), encryptedData; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPCert[] = { /* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, no child table), algorithmOID, paramS (child table _Params), signature; all flagged SKIP */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKCert[] = { /* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, no child table), algorithmOID, paramS (child table _Params), signature; all flagged SKIP */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoFastStart[] = { /* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, no child table), algorithmOID, paramS (child table _Params), signature; all flagged SKIP */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoEncryptedToken_token[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), encryptedData; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoEncryptedToken[] = { /* SEQUENCE: tokenOID, then token (child table _CryptoToken_cryptoEncryptedToken_token); both flagged SKIP */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoEncryptedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoSignedToken_token[] = { /* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, no child table), algorithmOID, paramS (child table _Params), signature; all flagged SKIP */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoSignedToken[] = { /* SEQUENCE: tokenOID, then token (child table _CryptoToken_cryptoSignedToken_token); both flagged SKIP */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoToken_cryptoSignedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoHashedToken_token[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), hash; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoHashedToken[] = { /* SEQUENCE: tokenOID, hashedVals (child table _ClearToken), token (child table _CryptoToken_cryptoHashedToken_token); all flagged SKIP */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoHashedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoPwdEncr[] = { /* SEQUENCE: algorithmOID, paramS (child table _Params), encryptedData; all flagged SKIP */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken[] = { /* CHOICE: four SEQ alternatives, each pointing at its own _CryptoToken_* child table; all flagged SKIP */
+ {FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0,
+ _CryptoToken_cryptoEncryptedToken},
+ {FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0,
+ _CryptoToken_cryptoSignedToken},
+ {FNAME("cryptoHashedToken") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoHashedToken},
+ {FNAME("cryptoPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoPwdEncr},
+};
+
+static const struct field_t _CryptoH323Token[] = { /* CHOICE: seven SEQ alternatives with their own _CryptoH323Token_* child tables, plus nestedcryptoToken (CHOICE, child table _CryptoToken); all flagged SKIP */
+ {FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoEPPwdHash},
+ {FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoGKPwdHash},
+ {FNAME("cryptoEPPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoEPPwdEncr},
+ {FNAME("cryptoGKPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoGKPwdEncr},
+ {FNAME("cryptoEPCert") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoH323Token_cryptoEPCert},
+ {FNAME("cryptoGKCert") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoH323Token_cryptoGKCert},
+ {FNAME("cryptoFastStart") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoH323Token_cryptoFastStart},
+ {FNAME("nestedcryptoToken") CHOICE, 2, 4, 4, SKIP | EXT, 0,
+ _CryptoToken},
+};
+
+static const struct field_t _Progress_UUIE_cryptoTokens[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _CryptoH323Token, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token},
+};
+
+static const struct field_t _Progress_UUIE_fastStart[] = { /* SEQUENCE OF: one "item" entry flagged DECODE | OPEN | EXT; offset column holds sizeof(OpenLogicalChannel); child table _OpenLogicalChannel */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Progress_UUIE[] = { /* SEQUENCE: 11 entries; only h245Address and fastStart are flagged DECODE (offsets into Progress_UUIE), every other entry is flagged SKIP */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Progress_UUIE, h245Address), _TransportAddress},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0,
+ _CallIdentifier},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ _H245Security},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Progress_UUIE_tokens},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Progress_UUIE_cryptoTokens},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Progress_UUIE, fastStart), _Progress_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE: 13 alternatives; setup, callProceeding, connect, alerting, facility and progress are flagged DECODE (offsets into H323_UU_PDU_h323_message_body), the rest are flagged SKIP */
+ {FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE},
+ {FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, callProceeding),
+ _CallProceeding_UUIE},
+ {FNAME("connect") SEQ, 1, 4, 19, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE},
+ {FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE},
+ {FNAME("information") SEQ, 0, 1, 7, SKIP | EXT, 0, _Information_UUIE},
+ {FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0,
+ _ReleaseComplete_UUIE},
+ {FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, facility), _Facility_UUIE},
+ {FNAME("progress") SEQ, 5, 8, 11, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, progress), _Progress_UUIE},
+ {FNAME("empty") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("status") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+ {FNAME("statusInquiry") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+ {FNAME("setupAcknowledge") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+ {FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _RequestMessage[] = { /* CHOICE: 15 alternatives; only openLogicalChannel is flagged DECODE (offset into RequestMessage, child table _OpenLogicalChannel), all others are flagged STOP with no child table */
+ {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("openLogicalChannel") SEQ, 1, 3, 5, DECODE | EXT,
+ offsetof(RequestMessage, openLogicalChannel), _OpenLogicalChannel},
+ {FNAME("closeLogicalChannel") SEQ, 0, 2, 3, STOP | EXT, 0, NULL},
+ {FNAME("requestChannelClose") SEQ, 0, 1, 3, STOP | EXT, 0, NULL},
+ {FNAME("multiplexEntrySend") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("requestMultiplexEntry") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestMode") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("roundTripDelayRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("maintenanceLoopRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("communicationModeRequest") SEQ, 0, 0, 0, STOP | EXT, 0, NULL},
+ {FNAME("conferenceRequest") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+ {FNAME("multilinkRequest") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("logicalChannelRateRequest") SEQ, 0, 3, 3, STOP | EXT, 0,
+ NULL},
+};
+
+static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE: h222LogicalChannelParameters (flagged SKIP) and h2250LogicalChannelParameters (flagged DECODE, child table _H2250LogicalChannelParameters) */
+ {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+ _H222LogicalChannelParameters},
+ {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+ offsetof
+ (OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters,
+ h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* SEQUENCE: four entries; only multiplexParameters is flagged DECODE, the three INT entries are flagged SKIP */
+ {FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannelAck_reverseLogicalChannelParameters,
+ multiplexParameters),
+ _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters},
+ {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H2250LogicalChannelAckParameters_nonStandard[] = { /* SEQUENCE OF: one "item" entry with child table _H245_NonStandardParameter, flagged SKIP */
+ {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static const struct field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE: seven entries; mediaChannel and mediaControlChannel are flagged DECODE (child table _H245_TransportAddress), the rest are flagged SKIP */
+ {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _H2250LogicalChannelAckParameters_nonStandard},
+ {FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelAckParameters, mediaChannel),
+ _H245_TransportAddress},
+ {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelAckParameters, mediaControlChannel),
+ _H245_TransportAddress},
+ {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, SKIP | OPT, 0, NULL},
+ {FNAME("flowControlToZero") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = { /* CHOICE: single alternative h2250LogicalChannelAckParameters, flagged DECODE, child table _H2250LogicalChannelAckParameters */
+ {FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT,
+ offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters,
+ h2250LogicalChannelAckParameters),
+ _H2250LogicalChannelAckParameters},
+};
+
+static const struct field_t _OpenLogicalChannelAck[] = { /* SEQUENCE: five entries; forwardLogicalChannelNumber is SKIP, the next three (reverseLogicalChannelParameters, separateStack, forwardMultiplexAckParameters) are DECODE, encryptionSync is STOP */
+ {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4,
+ DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+ reverseLogicalChannelParameters),
+ _OpenLogicalChannelAck_reverseLogicalChannelParameters},
+ {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannelAck, separateStack),
+ _NetworkAccessParameters},
+ {FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1,
+ DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+ forwardMultiplexAckParameters),
+ _OpenLogicalChannelAck_forwardMultiplexAckParameters},
+ {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _ResponseMessage[] = { /* CHOICE: 24 alternatives; only openLogicalChannelAck is flagged DECODE (offset into ResponseMessage, child table _OpenLogicalChannelAck), all others are flagged STOP with no child table */
+ {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0,
+ NULL},
+ {FNAME("masterSlaveDeterminationReject") SEQ, 0, 1, 1, STOP | EXT, 0,
+ NULL},
+ {FNAME("terminalCapabilitySetAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("terminalCapabilitySetReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+ NULL},
+ {FNAME("openLogicalChannelAck") SEQ, 1, 2, 5, DECODE | EXT,
+ offsetof(ResponseMessage, openLogicalChannelAck),
+ _OpenLogicalChannelAck},
+ {FNAME("openLogicalChannelReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("closeLogicalChannelAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestChannelCloseAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestChannelCloseReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+ NULL},
+ {FNAME("multiplexEntrySendAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("multiplexEntrySendReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("requestMultiplexEntryAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestMultiplexEntryReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+ NULL},
+ {FNAME("requestModeAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("requestModeReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("roundTripDelayResponse") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("maintenanceLoopAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("maintenanceLoopReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("communicationModeResponse") CHOICE, 0, 1, 1, STOP | EXT, 0,
+ NULL},
+ {FNAME("conferenceResponse") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+ {FNAME("multilinkResponse") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("logicalChannelRateAcknowledge") SEQ, 0, 3, 3, STOP | EXT, 0,
+ NULL},
+ {FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL},
+};
+
+static const struct field_t _MultimediaSystemControlMessage[] = { /* CHOICE: request and response are flagged DECODE (child tables _RequestMessage, _ResponseMessage); command and indication are flagged STOP */
+ {FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT,
+ offsetof(MultimediaSystemControlMessage, request), _RequestMessage},
+ {FNAME("response") CHOICE, 5, 19, 24, DECODE | EXT,
+ offsetof(MultimediaSystemControlMessage, response),
+ _ResponseMessage},
+ {FNAME("command") CHOICE, 3, 7, 12, STOP | EXT, 0, NULL},
+ {FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL},
+};
+
+static const struct field_t _H323_UU_PDU_h245Control[] = { /* SEQUENCE OF: one "item" entry flagged DECODE | OPEN | EXT; offset column holds sizeof(MultimediaSystemControlMessage); child table _MultimediaSystemControlMessage */
+ {FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT,
+ sizeof(MultimediaSystemControlMessage),
+ _MultimediaSystemControlMessage}
+ ,
+};
+
+static const struct field_t _H323_UU_PDU[] = { /* SEQUENCE: 11 entries; h323-message-body and h245Control are flagged DECODE (offsets into H323_UU_PDU), the three entries between them are SKIP, everything after h245Control is STOP */
+ {FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT,
+ offsetof(H323_UU_PDU, h323_message_body),
+ _H323_UU_PDU_h323_message_body},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("h4501SupplementaryService") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("h245Tunneling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h245Control") SEQOF, SEMI, 0, 4, DECODE | OPT,
+ offsetof(H323_UU_PDU, h245Control), _H323_UU_PDU_h245Control},
+ {FNAME("nonStandardControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("tunnelledSignallingMessage") SEQ, 2, 4, 4, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("provisionalRespToH245Tunneling") NUL, FIXD, 0, 0, STOP | OPT,
+ 0, NULL},
+ {FNAME("stimulusControl") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _H323_UserInformation[] = { /* SEQUENCE: h323-uu-pdu is flagged DECODE (offset into H323_UserInformation, child table _H323_UU_PDU); user-data is flagged STOP */
+ {FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT,
+ offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU},
+ {FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperRequest[] = { /* SEQUENCE: rasAddress is the only DECODE entry (offset into GatekeeperRequest); the three entries before it are SKIP, every entry after it is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(GatekeeperRequest, rasAddress), _TransportAddress},
+ {FNAME("endpointType") SEQ, 6, 8, 10, STOP | EXT, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+ {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("authenticationCapability") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("algorithmOIDs") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperConfirm[] = { /* SEQUENCE: rasAddress is the only DECODE entry (offset into GatekeeperConfirm); the four entries before it are SKIP, every entry after it is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(GatekeeperConfirm, rasAddress), _TransportAddress},
+ {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("authenticationMode") CHOICE, 3, 7, 8, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RegistrationRequest_callSignalAddress[] = { /* SEQUENCE OF: one "item" entry flagged DECODE | EXT; offset column holds sizeof(TransportAddress); child table _TransportAddress */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _RegistrationRequest_rasAddress[] = { /* SEQUENCE OF: one "item" entry flagged DECODE | EXT; offset column holds sizeof(TransportAddress); child table _TransportAddress */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _RegistrationRequest_terminalAlias[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _AliasAddress, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _RegistrationRequest[] = { /* SEQUENCE: callSignalAddress, rasAddress and timeToLive are flagged DECODE (offsets into RegistrationRequest); other entries up to timeToLive are SKIP, every entry after timeToLive is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("discoveryComplete") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(RegistrationRequest, callSignalAddress),
+ _RegistrationRequest_callSignalAddress},
+ {FNAME("rasAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(RegistrationRequest, rasAddress),
+ _RegistrationRequest_rasAddress},
+ {FNAME("terminalType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+ {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _RegistrationRequest_terminalAlias},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("endpointVendor") SEQ, 2, 3, 3, SKIP | EXT, 0,
+ _VendorIdentifier},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+ offsetof(RegistrationRequest, timeToLive), NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("keepAlive") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("additiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("usageReportingCapability") SEQ, 3, 4, 4, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("supportedH248Packages") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("callCreditCapability") SEQ, 2, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("capacityReportingCapability") SEQ, 0, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RegistrationConfirm_callSignalAddress[] = { /* SEQUENCE OF: one "item" entry flagged DECODE | EXT; offset column holds sizeof(TransportAddress); child table _TransportAddress */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _RegistrationConfirm_terminalAlias[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _AliasAddress, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _RegistrationConfirm[] = { /* SEQUENCE: callSignalAddress and timeToLive are flagged DECODE (offsets into RegistrationConfirm); other entries up to timeToLive are SKIP, every entry after timeToLive is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(RegistrationConfirm, callSignalAddress),
+ _RegistrationConfirm_callSignalAddress},
+ {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _RegistrationConfirm_terminalAlias},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+ offsetof(RegistrationConfirm, timeToLive), NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("preGrantedARQ") SEQ, 0, 4, 8, STOP | EXT | OPT, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("supportsAdditiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureServerAlias") CHOICE, 1, 2, 7, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("capacityReportingSpec") SEQ, 0, 1, 1, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _UnregistrationRequest_callSignalAddress[] = { /* SEQUENCE OF: one "item" entry flagged DECODE | EXT; offset column holds sizeof(TransportAddress); child table _TransportAddress */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _UnregistrationRequest[] = { /* SEQUENCE: requestSeqNum is SKIP, callSignalAddress is DECODE (offset into UnregistrationRequest), every later entry is STOP with no child table */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(UnregistrationRequest, callSignalAddress),
+ _UnregistrationRequest_callSignalAddress},
+ {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("reason") CHOICE, 2, 4, 5, STOP | EXT | OPT, 0, NULL},
+ {FNAME("endpointAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _CallModel[] = { /* CHOICE: two NUL alternatives, direct and gatekeeperRouted, both flagged SKIP */
+ {FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _AdmissionRequest_destinationInfo[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _AliasAddress, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest_destExtraCallInfo[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _AliasAddress, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest_srcInfo[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _AliasAddress, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest[] = { /* SEQUENCE: destCallSignalAddress and srcCallSignalAddress are flagged DECODE (offsets into AdmissionRequest); other entries up to srcCallSignalAddress are SKIP, every entry from bandWidth onward is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+ {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _AdmissionRequest_destinationInfo},
+ {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(AdmissionRequest, destCallSignalAddress),
+ _TransportAddress},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _AdmissionRequest_destExtraCallInfo},
+ {FNAME("srcInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _AdmissionRequest_srcInfo},
+ {FNAME("srcCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(AdmissionRequest, srcCallSignalAddress), _TransportAddress},
+ {FNAME("bandWidth") INT, CONS, 0, 0, STOP, 0, NULL},
+ {FNAME("callReferenceValue") INT, WORD, 0, 0, STOP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, STOP, 0, NULL},
+ {FNAME("activeMC") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("answerCall") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("srcAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("gatewayDataRate") SEQ, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _AdmissionConfirm[] = { /* SEQUENCE: destCallSignalAddress is the only DECODE entry (offset into AdmissionConfirm); the three entries before it are SKIP, every entry after it is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL},
+ {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel},
+ {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(AdmissionConfirm, destCallSignalAddress),
+ _TransportAddress},
+ {FNAME("irrFrequency") INT, WORD, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+ {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("uuiesRequested") SEQ, 0, 9, 13, STOP | EXT, 0, NULL},
+ {FNAME("language") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("useSpecifiedTransport") CHOICE, 1, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _LocationRequest_destinationInfo[] = { /* SEQUENCE OF: one "item" entry (CHOICE) with child table _AliasAddress, flagged SKIP | EXT */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _LocationRequest[] = { /* SEQUENCE: replyAddress is the only DECODE entry (offset into LocationRequest); the four entries before it are SKIP, every entry after it is STOP */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _LocationRequest_destinationInfo},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("replyAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(LocationRequest, replyAddress), _TransportAddress},
+ {FNAME("sourceInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("hopCount") INT, 8, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+};
+
+/* Field table for the LocationConfirm SEQUENCE.
+ *
+ * callSignalAddress and rasAddress are the only DECODE entries; each is
+ * stored at its offsetof() in LocationConfirm and described by
+ * _TransportAddress.  requestSeqNum is SKIP; all remaining fields are STOP
+ * with no sub-table.
+ */
+static const struct field_t _LocationConfirm[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(LocationConfirm, callSignalAddress), _TransportAddress},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(LocationConfirm, rasAddress), _TransportAddress},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+ {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+/* Element descriptor for InfoRequestResponse.callSignalAddress: each item
+ * is a DECODE'd CHOICE described by _TransportAddress.
+ *
+ * NOTE(review): the offset column holds sizeof(TransportAddress) here rather
+ * than an offsetof(); presumably the decoder uses it as the per-element
+ * stride for the SEQUENCE OF - confirm against the decoder.
+ */
+static const struct field_t _InfoRequestResponse_callSignalAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+/* Field table for the InfoRequestResponse SEQUENCE.
+ *
+ * rasAddress and callSignalAddress are the DECODE entries, stored at their
+ * offsetof() in InfoRequestResponse; callSignalAddress is a SEQUENCE OF
+ * described by _InfoRequestResponse_callSignalAddress.  The four leading
+ * fields are SKIP; everything after callSignalAddress is STOP with no
+ * sub-table.
+ */
+static const struct field_t _InfoRequestResponse[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("endpointType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(InfoRequestResponse, rasAddress), _TransportAddress},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(InfoRequestResponse, callSignalAddress),
+ _InfoRequestResponse_callSignalAddress},
+ {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("perCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("needResponse") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("irrStatus") CHOICE, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("unsolicited") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+/* Top-level table for the RasMessage CHOICE: one entry per alternative.
+ *
+ * Alternatives flagged DECODE carry the offsetof() of the matching member
+ * of RasMessage plus the sub-table describing that message.  Alternatives
+ * flagged STOP carry offset 0 and a NULL sub-table.
+ */
+static const struct field_t _RasMessage[] = { /* CHOICE */
+ {FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT,
+ offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest},
+ {FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT,
+ offsetof(RasMessage, gatekeeperConfirm), _GatekeeperConfirm},
+ {FNAME("gatekeeperReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+ {FNAME("registrationRequest") SEQ, 3, 10, 31, DECODE | EXT,
+ offsetof(RasMessage, registrationRequest), _RegistrationRequest},
+ {FNAME("registrationConfirm") SEQ, 3, 7, 24, DECODE | EXT,
+ offsetof(RasMessage, registrationConfirm), _RegistrationConfirm},
+ {FNAME("registrationReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+ {FNAME("unregistrationRequest") SEQ, 3, 5, 15, DECODE | EXT,
+ offsetof(RasMessage, unregistrationRequest), _UnregistrationRequest},
+ {FNAME("unregistrationConfirm") SEQ, 1, 2, 6, STOP | EXT, 0, NULL},
+ {FNAME("unregistrationReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+ {FNAME("admissionRequest") SEQ, 7, 16, 34, DECODE | EXT,
+ offsetof(RasMessage, admissionRequest), _AdmissionRequest},
+ {FNAME("admissionConfirm") SEQ, 2, 6, 27, DECODE | EXT,
+ offsetof(RasMessage, admissionConfirm), _AdmissionConfirm},
+ {FNAME("admissionReject") SEQ, 1, 3, 11, STOP | EXT, 0, NULL},
+ {FNAME("bandwidthRequest") SEQ, 2, 7, 18, STOP | EXT, 0, NULL},
+ {FNAME("bandwidthConfirm") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+ {FNAME("bandwidthReject") SEQ, 1, 4, 9, STOP | EXT, 0, NULL},
+ {FNAME("disengageRequest") SEQ, 1, 6, 19, STOP | EXT, 0, NULL},
+ {FNAME("disengageConfirm") SEQ, 1, 2, 9, STOP | EXT, 0, NULL},
+ {FNAME("disengageReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+ {FNAME("locationRequest") SEQ, 2, 5, 17, DECODE | EXT,
+ offsetof(RasMessage, locationRequest), _LocationRequest},
+ {FNAME("locationConfirm") SEQ, 1, 4, 19, DECODE | EXT,
+ offsetof(RasMessage, locationConfirm), _LocationConfirm},
+ {FNAME("locationReject") SEQ, 1, 3, 10, STOP | EXT, 0, NULL},
+ {FNAME("infoRequest") SEQ, 2, 4, 15, STOP | EXT, 0, NULL},
+ {FNAME("infoRequestResponse") SEQ, 3, 8, 16, DECODE | EXT,
+ offsetof(RasMessage, infoRequestResponse), _InfoRequestResponse},
+ {FNAME("nonStandardMessage") SEQ, 0, 2, 7, STOP | EXT, 0, NULL},
+ {FNAME("unknownMessageResponse") SEQ, 0, 1, 5, STOP | EXT, 0, NULL},
+ {FNAME("requestInProgress") SEQ, 4, 6, 6, STOP | EXT, 0, NULL},
+ {FNAME("resourcesAvailableIndicate") SEQ, 4, 9, 11, STOP | EXT, 0,
+ NULL},
+ {FNAME("resourcesAvailableConfirm") SEQ, 4, 6, 7, STOP | EXT, 0,
+ NULL},
+ {FNAME("infoRequestAck") SEQ, 4, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("infoRequestNak") SEQ, 5, 7, 7, STOP | EXT, 0, NULL},
+ {FNAME("serviceControlIndication") SEQ, 8, 10, 10, STOP | EXT, 0,
+ NULL},
+ {FNAME("serviceControlResponse") SEQ, 7, 8, 8, STOP | EXT, 0, NULL},
+};
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
new file mode 100644
index 00000000000..5b3eae7d4c9
--- /dev/null
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -0,0 +1,502 @@
+/* Helper handling for netfilter. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/random.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_log.h>
+
+/* Serialises helper registration/unregistration (hash list updates and
+ * nf_ct_helper_count). */
+static DEFINE_MUTEX(nf_ct_helper_mutex);
+/* Hash table of registered helpers, indexed by helper_hash(). */
+struct hlist_head *nf_ct_helper_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hash);
+/* Number of buckets in nf_ct_helper_hash. */
+unsigned int nf_ct_helper_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
+/* Number of registered helpers; lets lookups bail out early when zero. */
+static unsigned int nf_ct_helper_count __read_mostly;
+
+/* Module-wide default, copied into each netns'
+ * ct.sysctl_auto_assign_helper by nf_conntrack_helper_pernet_init(). */
+static bool nf_ct_auto_assign_helper __read_mostly = true;
+module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
+MODULE_PARM_DESC(nf_conntrack_helper,
+ "Enable automatic conntrack helper assignment (default 1)");
+
+#ifdef CONFIG_SYSCTL
+/* Template sysctl table; nf_conntrack_helper_init_sysctl() duplicates it per
+ * netns and repoints .data at that netns' sysctl_auto_assign_helper. */
+static struct ctl_table helper_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_helper",
+ .data = &init_net.ct.sysctl_auto_assign_helper,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+/*
+ * Register the per-netns "nf_conntrack_helper" sysctl under net/netfilter.
+ *
+ * A private copy of helper_sysctl_table is made so that the entry can point
+ * at this netns' sysctl_auto_assign_helper.  The resulting header is stored
+ * in net->ct.helper_sysctl_header.
+ *
+ * Returns 0 on success, or -ENOMEM if the copy or the registration fails.
+ */
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+	struct ctl_table *tbl = kmemdup(helper_sysctl_table,
+					sizeof(helper_sysctl_table),
+					GFP_KERNEL);
+
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	tbl[0].data = &net->ct.sysctl_auto_assign_helper;
+
+	/* Don't export sysctls to unprivileged users */
+	if (net->user_ns != &init_user_ns)
+		tbl[0].procname = NULL;
+
+	net->ct.helper_sysctl_header =
+		register_net_sysctl(net, "net/netfilter", tbl);
+	if (net->ct.helper_sysctl_header)
+		return 0;
+
+	pr_err("nf_conntrack_helper: can't register to sysctl.\n");
+	kfree(tbl);
+	return -ENOMEM;
+}
+
+/*
+ * Undo nf_conntrack_helper_init_sysctl(): unregister the netns' sysctl
+ * header and free the table copy that was registered with it.
+ */
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+	struct ctl_table_header *hdr = net->ct.helper_sysctl_header;
+	/* Grab the table pointer before the header goes away. */
+	struct ctl_table *tbl = hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(hdr);
+	kfree(tbl);
+}
+#else
+/* !CONFIG_SYSCTL stub: nothing to register, always succeeds. */
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+/* !CONFIG_SYSCTL stub: nothing was registered, so nothing to tear down. */
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Bucket index for a helper tuple.  Deliberately simple: it mixes the L3
+ * protocol number, the L4 protocol number and the source port, and is
+ * collision free for the default registrations of the helpers currently
+ * in the kernel.
+ */
+static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
+{
+	unsigned int key = (tuple->src.l3num << 8) | tuple->dst.protonum;
+
+	key ^= (__force __u16)tuple->src.u.all;
+	return key % nf_ct_helper_hsize;
+}
+
+/*
+ * Look up the registered helper whose tuple matches @tuple on the source
+ * side (full 16-bit port mask).  Walks a single hash bucket with the RCU
+ * list iterator.  Returns the helper, or NULL if none is registered or none
+ * matches.
+ */
+static struct nf_conntrack_helper *
+__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
+	struct nf_conntrack_helper *cand;
+	struct hlist_head *bucket;
+
+	/* Fast path: no helpers registered at all. */
+	if (nf_ct_helper_count == 0)
+		return NULL;
+
+	bucket = &nf_ct_helper_hash[helper_hash(tuple)];
+	hlist_for_each_entry_rcu(cand, bucket, hnode) {
+		if (nf_ct_tuple_src_mask_cmp(tuple, &cand->tuple, &mask))
+			return cand;
+	}
+	return NULL;
+}
+
+/*
+ * Find a registered helper by name, L3 protocol and L4 protocol.  Scans
+ * every bucket of the helper hash with the RCU list iterator and returns
+ * the first match, or NULL when there is none.
+ */
+struct nf_conntrack_helper *
+__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
+{
+	struct nf_conntrack_helper *cur;
+	unsigned int bucket;
+
+	for (bucket = 0; bucket < nf_ct_helper_hsize; bucket++) {
+		hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[bucket],
+					 hnode) {
+			if (strcmp(cur->name, name) != 0)
+				continue;
+			if (cur->tuple.src.l3num != l3num)
+				continue;
+			if (cur->tuple.dst.protonum != protonum)
+				continue;
+			return cur;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find);
+
+/*
+ * Find a helper by name/l3num/protonum and take a reference on its owning
+ * module.  With CONFIG_MODULES, a miss triggers request_module() for
+ * "nfct-helper-<name>" followed by one more lookup.
+ *
+ * Returns the helper with its module pinned, or NULL if it cannot be found
+ * or the module reference cannot be taken.
+ */
+struct nf_conntrack_helper *
+nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
+{
+	struct nf_conntrack_helper *helper;
+
+	helper = __nf_conntrack_helper_find(name, l3num, protonum);
+#ifdef CONFIG_MODULES
+	if (!helper && request_module("nfct-helper-%s", name) == 0)
+		helper = __nf_conntrack_helper_find(name, l3num, protonum);
+#endif
+	if (!helper)
+		return NULL;
+
+	return try_module_get(helper->me) ? helper : NULL;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
+
+/*
+ * Attach a helper extension area to @ct, with room for @helper->data_len
+ * bytes of helper-private data, allocated with @gfp.
+ *
+ * On success the expectation list head of the new area is initialised and
+ * the area is returned.  On failure a debug message is emitted and NULL is
+ * returned.
+ */
+struct nf_conn_help *
+nf_ct_helper_ext_add(struct nf_conn *ct,
+		     struct nf_conntrack_helper *helper, gfp_t gfp)
+{
+	struct nf_conn_help *help;
+
+	help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER,
+				    helper->data_len, gfp);
+	if (help)
+		INIT_HLIST_HEAD(&help->expectations);
+	else
+		/* Newline-terminate so the message is not merged with the
+		 * next log line. */
+		pr_debug("failed to add helper extension area\n");
+	return help;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
+
+/*
+ * Pick a helper for @ct and store it in the conntrack's helper extension.
+ *
+ * The helper comes from @tmpl's helper extension when the template has one;
+ * otherwise, if the netns has automatic assignment enabled, it is looked up
+ * from the reply-direction tuple.  @flags is the allocation mask used if a
+ * helper extension has to be added to @ct.
+ *
+ * Returns 0 on success (including "no helper applies"), or -ENOMEM if the
+ * helper extension could not be added.
+ */
+int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
+ gfp_t flags)
+{
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conn_help *help;
+ struct net *net = nf_ct_net(ct);
+ int ret = 0;
+
+ /* We already got a helper explicitly attached. The function
+ * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
+ * the helper up again. Since now the user is in full control of
+ * making consistent helper configurations, skip this automatic
+ * re-lookup, otherwise we'll lose the helper.
+ */
+ if (test_bit(IPS_HELPER_BIT, &ct->status))
+ return 0;
+
+ /* A template with a helper extension wins, and marks the conntrack as
+ * having an explicitly attached helper. */
+ if (tmpl != NULL) {
+ help = nfct_help(tmpl);
+ if (help != NULL) {
+ helper = help->helper;
+ set_bit(IPS_HELPER_BIT, &ct->status);
+ }
+ }
+
+ help = nfct_help(ct);
+ /* Fall back to automatic lookup; warn once per netns when it hits. */
+ if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
+ helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
+ pr_info("nf_conntrack: automatic helper "
+ "assignment is deprecated and it will "
+ "be removed soon. Use the iptables CT target "
+ "to attach helpers instead.\n");
+ net->ct.auto_assign_helper_warned = true;
+ }
+ }
+
+ /* No helper applies: clear any helper left in an existing extension. */
+ if (helper == NULL) {
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ goto out;
+ }
+
+ if (help == NULL) {
+ help = nf_ct_helper_ext_add(ct, helper, flags);
+ if (help == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ /* We only allow helper re-assignment of the same sort since
+ * we cannot reallocate the helper extension area.
+ */
+ struct nf_conntrack_helper *tmp = rcu_dereference(help->helper);
+
+ /* "Same sort" is decided by comparing the help callbacks. */
+ if (tmp && tmp->help != helper->help) {
+ RCU_INIT_POINTER(help->helper, NULL);
+ goto out;
+ }
+ }
+
+ rcu_assign_pointer(help->helper, helper);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
+
+/*
+ * Detach helper @me from the conntrack that owns tuple hash @i, if that
+ * conntrack currently uses it: an IPCT_HELPER event is raised and the
+ * helper pointer is cleared.  Always returns 0.
+ *
+ * The appropriate ct lock must be taken by the caller.
+ */
+static inline int unhelp(struct nf_conntrack_tuple_hash *i,
+ const struct nf_conntrack_helper *me)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
+ struct nf_conn_help *help = nfct_help(ct);
+
+ if (help && rcu_dereference_raw(help->helper) == me) {
+ nf_conntrack_event(IPCT_HELPER, ct);
+ RCU_INIT_POINTER(help->helper, NULL);
+ }
+ return 0;
+}
+
+/*
+ * Invoke the destroy callback of the helper attached to @ct, if the
+ * conntrack has a helper extension, a helper, and that helper provides one.
+ * The helper pointer is read and the callback run inside an RCU read-side
+ * section.
+ */
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_helper *helper;
+
+	if (!help)
+		return;
+
+	rcu_read_lock();
+	helper = rcu_dereference(help->helper);
+	if (helper && helper->destroy)
+		helper->destroy(ct);
+	rcu_read_unlock();
+}
+
+/* Registered expectfn descriptors; modified under nf_conntrack_expect_lock,
+ * traversed with the RCU list iterator by the find_by_* lookups. */
+static LIST_HEAD(nf_ct_helper_expectfn_list);
+
+/* Add @n to the expectfn list under nf_conntrack_expect_lock. */
+void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ list_add_rcu(&n->head, &nf_ct_helper_expectfn_list);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register);
+
+/* Remove @n from the expectfn list under nf_conntrack_expect_lock. */
+void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ list_del_rcu(&n->head);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
+
+/*
+ * Return the registered expectfn descriptor whose name equals @name, or
+ * NULL if there is none.  The list is walked inside an RCU read-side
+ * section; the pointer is returned after that section has ended.
+ */
+struct nf_ct_helper_expectfn *
+nf_ct_helper_expectfn_find_by_name(const char *name)
+{
+	struct nf_ct_helper_expectfn *pos;
+	struct nf_ct_helper_expectfn *match = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &nf_ct_helper_expectfn_list, head) {
+		if (strcmp(pos->name, name) == 0) {
+			match = pos;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return match;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
+
+/*
+ * Return the registered expectfn descriptor whose expectfn callback is
+ * @symbol, or NULL if there is none.  The list is walked inside an RCU
+ * read-side section; the pointer is returned after that section has ended.
+ */
+struct nf_ct_helper_expectfn *
+nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
+{
+	struct nf_ct_helper_expectfn *pos;
+	struct nf_ct_helper_expectfn *match = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &nf_ct_helper_expectfn_list, head) {
+		if (pos->expectfn == symbol) {
+			match = pos;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return match;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
+
+/*
+ * Log a "dropping packet" message for @skb through nf_log_packet(),
+ * prefixed with the name of the helper attached to @ct.  @fmt and the
+ * variadic arguments are forwarded via a va_format (%pV).
+ */
+__printf(3, 4)
+void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
+		      const char *fmt, ...)
+{
+	const struct nf_conntrack_helper *helper;
+	const struct nf_conn_help *help;
+	struct va_format vaf;
+	va_list args;
+
+	/* Called from the helper function, this call never fails */
+	help = nfct_help(ct);
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	helper = rcu_dereference(help->helper);
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
+		      "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
+
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_log);
+
+/*
+ * Register helper @me in the helper hash.
+ *
+ * The helper must have an expect policy, an expect_class_max below
+ * NF_CT_MAX_EXPECT_CLASSES and a name that fits NF_CT_HELPER_NAME_LEN;
+ * violations trigger BUG_ON().
+ *
+ * Returns 0 on success, or -EEXIST if the same hash bucket already holds a
+ * helper with the same name, L3 protocol and L4 protocol.
+ */
+int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
+{
+	struct nf_conntrack_helper *other;
+	unsigned int bucket = helper_hash(&me->tuple);
+	int err = -EEXIST;
+
+	BUG_ON(me->expect_policy == NULL);
+	BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
+	BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
+
+	mutex_lock(&nf_ct_helper_mutex);
+	hlist_for_each_entry(other, &nf_ct_helper_hash[bucket], hnode) {
+		if (strncmp(other->name, me->name,
+			    NF_CT_HELPER_NAME_LEN) == 0 &&
+		    other->tuple.src.l3num == me->tuple.src.l3num &&
+		    other->tuple.dst.protonum == me->tuple.dst.protonum)
+			goto unlock;
+	}
+	hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[bucket]);
+	nf_ct_helper_count++;
+	err = 0;
+unlock:
+	mutex_unlock(&nf_ct_helper_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_register);
+
+/*
+ * Remove every use of helper @me inside network namespace @net.
+ *
+ * Three passes:
+ *  1. under nf_conntrack_expect_lock, drop expectations whose master uses
+ *     @me or that reference @me directly;
+ *  2. under each per-cpu list lock, detach @me from unconfirmed conntracks;
+ *  3. with BHs disabled and the bucket's conntrack lock held, detach @me
+ *     from conntracks in the main hash.
+ */
+static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
+ struct net *net)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_expect *exp;
+ const struct hlist_node *next;
+ const struct hlist_nulls_node *nn;
+ unsigned int i;
+ int cpu;
+
+ /* Get rid of expectations */
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ for (i = 0; i < nf_ct_expect_hsize; i++) {
+ hlist_for_each_entry_safe(exp, next,
+ &net->ct.expect_hash[i], hnode) {
+ struct nf_conn_help *help = nfct_help(exp->master);
+ /* Only unlink when del_timer() succeeds. */
+ if ((rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock)
+ ) == me || exp->helper == me) &&
+ del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ /* Get rid of expecteds, set helpers to NULL. */
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
+ unhelp(h, me);
+ spin_unlock_bh(&pcpu->lock);
+ }
+ local_bh_disable();
+ for (i = 0; i < net->ct.htable_size; i++) {
+ spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ /* htable_size is re-checked once the bucket lock is held. */
+ if (i < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ unhelp(h, me);
+ }
+ spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ }
+ local_bh_enable();
+}
+
+/*
+ * Unregister helper @me: unlink it from the helper hash, wait for an RCU
+ * grace period, then strip it from every network namespace while holding
+ * the RTNL lock.
+ */
+void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+{
+ struct net *net;
+
+ mutex_lock(&nf_ct_helper_mutex);
+ hlist_del_rcu(&me->hnode);
+ nf_ct_helper_count--;
+ mutex_unlock(&nf_ct_helper_mutex);
+
+ /* Make sure nothing is still using the helper unless it's a
+ * connection in the hash.
+ */
+ synchronize_rcu();
+
+ rtnl_lock();
+ for_each_net(net)
+ __nf_conntrack_helper_unregister(me, net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
+
+/* Conntrack extension descriptor for the helper area (struct nf_conn_help);
+ * registered in nf_conntrack_helper_init(). */
+static struct nf_ct_ext_type helper_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_help),
+ .align = __alignof__(struct nf_conn_help),
+ .id = NF_CT_EXT_HELPER,
+};
+
+/*
+ * Per-netns setup: reset the "deprecation warning printed" flag, seed the
+ * auto-assign setting from the module parameter and register the sysctl.
+ * Returns the result of nf_conntrack_helper_init_sysctl().
+ */
+int nf_conntrack_helper_pernet_init(struct net *net)
+{
+ net->ct.auto_assign_helper_warned = false;
+ net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
+ return nf_conntrack_helper_init_sysctl(net);
+}
+
+/* Per-netns teardown: remove the sysctl registered by pernet_init. */
+void nf_conntrack_helper_pernet_fini(struct net *net)
+{
+ nf_conntrack_helper_fini_sysctl(net);
+}
+
+/*
+ * Global setup: allocate the helper hash table and register the helper
+ * conntrack extension.
+ *
+ * Returns 0 on success, -ENOMEM if the hash table cannot be allocated, or
+ * the negative error from nf_ct_extend_register() (the table is freed
+ * again in that case).
+ */
+int nf_conntrack_helper_init(void)
+{
+	int err;
+
+	nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
+	nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
+	if (nf_ct_helper_hash == NULL)
+		return -ENOMEM;
+
+	err = nf_ct_extend_register(&helper_extend);
+	if (err >= 0)
+		return 0;
+
+	pr_err("nf_ct_helper: Unable to register helper extension.\n");
+	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+	return err;
+}
+
+/* Global teardown: unregister the helper extension and free the helper
+ * hash table. */
+void nf_conntrack_helper_fini(void)
+{
+ nf_ct_extend_unregister(&helper_extend);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
new file mode 100644
index 00000000000..0fd2976db7e
--- /dev/null
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -0,0 +1,292 @@
+/* IRC extension for IP connection tracking, Version 1.21
+ * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
+ * based on RR's ip_conntrack_ftp.c
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+/* Upper bound on the number of IRC server ports (and helper instances). */
+#define MAX_PORTS 8
+/* Ports to track and how many of them were supplied ("ports" parameter). */
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+/* Copied into irc_exp_policy.max_expected at module init. */
+static unsigned int max_dcc_channels = 8;
+/* Copied into irc_exp_policy.timeout at module init. */
+static unsigned int dcc_timeout __read_mostly = 300;
+/* This is slow, but it's simple. --RR */
+/* Shared scratch buffer for packet payloads, guarded by irc_buffer_lock. */
+static char *irc_buffer;
+static DEFINE_SPINLOCK(irc_buffer_lock);
+
+/* Optional NAT hook; help() calls it instead of nf_ct_expect_related() when
+ * it is set and the connection has a NAT status bit. */
+unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_irc");
+MODULE_ALIAS_NFCT_HELPER("irc");
+
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of IRC servers");
+module_param(max_dcc_channels, uint, 0400);
+MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per "
+ "IRC session");
+module_param(dcc_timeout, uint, 0400);
+MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels");
+
+/* DCC sub-commands recognised by help(); each includes its trailing blank. */
+static const char *const dccprotos[] = {
+ "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT "
+};
+
+/* Length of the shortest entry in dccprotos[]. */
+#define MINMATCHLEN 5
+
+/* tries to get the ip_addr and port out of a dcc command
+ * return value: -1 on failure, 0 on success
+ * data pointer to first byte of DCC command data
+ * data_end pointer to last byte of dcc command data
+ * ip returns parsed ip of dcc command
+ * port returns parsed port of dcc command
+ * ad_beg_p returns pointer to first byte of addr data
+ * ad_end_p returns pointer to last byte of addr data
+ *
+ * The address is parsed as a decimal number and stored big-endian in *ip;
+ * the port is parsed as decimal and stored in host order in *port.
+ *
+ * NOTE(review): the newline scan below dereferences tmp == data_end, while
+ * the caller in this file passes data_limit (buffer start + payload
+ * length).  Confirm whether data_end is meant to be inclusive.
+ */
+static int parse_dcc(char *data, const char *data_end, __be32 *ip,
+ u_int16_t *port, char **ad_beg_p, char **ad_end_p)
+{
+ char *tmp;
+
+ /* at least 12: "AAAAAAAA P\1\n" */
+ /* Skip the token up to and including the next blank. */
+ while (*data++ != ' ')
+ if (data > data_end - 12)
+ return -1;
+
+ /* Make sure we have a newline character within the packet boundaries
+ * because simple_strtoul parses until the first invalid character. */
+ for (tmp = data; tmp <= data_end; tmp++)
+ if (*tmp == '\n')
+ break;
+ if (tmp > data_end || *tmp != '\n')
+ return -1;
+
+ *ad_beg_p = data;
+ *ip = cpu_to_be32(simple_strtoul(data, &data, 10));
+
+ /* skip blanks between ip and port */
+ while (*data == ' ') {
+ if (data >= data_end)
+ return -1;
+ data++;
+ }
+
+ *port = simple_strtoul(data, &data, 10);
+ /* data now points just past the port digits. */
+ *ad_end_p = data;
+
+ return 0;
+}
+
+/*
+ * Conntrack helper callback for IRC: scan the TCP payload of packets in
+ * the original direction for "\1DCC <proto> ... <ip> <port>" commands and
+ * set up an expectation for the announced DCC connection.
+ *
+ * Returns NF_ACCEPT in the normal case, NF_DROP if an expectation could not
+ * be allocated or added, or whatever the NAT hook returns when it is used.
+ * Only the first successfully parsed, non-forged DCC command in a packet is
+ * acted upon.
+ */
+static int help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff;
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ const char *data_limit;
+ char *data, *ib_ptr;
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ __be32 dcc_ip;
+ u_int16_t dcc_port;
+ __be16 port;
+ int i, ret = NF_ACCEPT;
+ char *addr_beg_p, *addr_end_p;
+ typeof(nf_nat_irc_hook) nf_nat_irc;
+
+ /* If packet is coming from IRC server */
+ if (dir == IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* Not a full tcp header? */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ /* No data? */
+ dataoff = protoff + th->doff*4;
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ /* irc_buffer is shared, so the lock is held until the out: label. */
+ spin_lock_bh(&irc_buffer_lock);
+ ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+ irc_buffer);
+ BUG_ON(ib_ptr == NULL);
+
+ data = ib_ptr;
+ data_limit = ib_ptr + skb->len - dataoff;
+
+ /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
+ * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
+ while (data < data_limit - (19 + MINMATCHLEN)) {
+ if (memcmp(data, "\1DCC ", 5)) {
+ data++;
+ continue;
+ }
+ data += 5;
+ /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+
+ iph = ip_hdr(skb);
+ pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest));
+
+ for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
+ if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
+ /* no match */
+ continue;
+ }
+ data += strlen(dccprotos[i]);
+ pr_debug("DCC %s detected\n", dccprotos[i]);
+
+ /* we have at least
+ * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * data left (== 14/13 bytes) */
+ if (parse_dcc(data, data_limit, &dcc_ip,
+ &dcc_port, &addr_beg_p, &addr_end_p)) {
+ pr_debug("unable to parse dcc command\n");
+ continue;
+ }
+
+ pr_debug("DCC bound ip/port: %pI4:%u\n",
+ &dcc_ip, dcc_port);
+
+ /* dcc_ip can be the internal OR external (NAT'ed) IP */
+ tuple = &ct->tuplehash[dir].tuple;
+ if (tuple->src.u3.ip != dcc_ip &&
+ tuple->dst.u3.ip != dcc_ip) {
+ net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
+ &tuple->src.u3.ip,
+ &dcc_ip, dcc_port);
+ continue;
+ }
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct,
+ "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+ /* Expect a TCP connection to the other direction's
+ * destination address on the announced port; the source
+ * address and port are left unspecified (NULL). */
+ tuple = &ct->tuplehash[!dir].tuple;
+ port = htons(dcc_port);
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+ tuple->src.l3num,
+ NULL, &tuple->dst.u3,
+ IPPROTO_TCP, NULL, &port);
+
+ nf_nat_irc = rcu_dereference(nf_nat_irc_hook);
+ if (nf_nat_irc && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_irc(skb, ctinfo, protoff,
+ addr_beg_p - ib_ptr,
+ addr_end_p - addr_beg_p,
+ exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct,
+ "cannot add expectation");
+ ret = NF_DROP;
+ }
+ /* Drop the reference taken by nf_ct_expect_alloc(). */
+ nf_ct_expect_put(exp);
+ goto out;
+ }
+ }
+ out:
+ spin_unlock_bh(&irc_buffer_lock);
+ return ret;
+}
+
+/* One helper instance per configured port; filled in by module init. */
+static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
+/* Expectation policy shared by all instances. */
+static struct nf_conntrack_expect_policy irc_exp_policy;
+
+/* Forward declaration: the init function calls it on its error path. */
+static void nf_conntrack_irc_fini(void);
+
+/*
+ * Module init: validate parameters, allocate the shared payload buffer and
+ * register one conntrack helper per configured IRC port (defaulting to
+ * IRC_PORT when none was given).
+ *
+ * Returns 0 on success, -EINVAL if max_dcc_channels is zero, -ENOMEM if the
+ * buffer cannot be allocated, or the error from
+ * nf_conntrack_helper_register().  On a registration failure the helpers
+ * registered so far are unregistered and the buffer is freed.
+ */
+static int __init nf_conntrack_irc_init(void)
+{
+	int i, ret;
+
+	if (max_dcc_channels < 1) {
+		printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n");
+		return -EINVAL;
+	}
+
+	irc_exp_policy.max_expected = max_dcc_channels;
+	irc_exp_policy.timeout = dcc_timeout;
+
+	irc_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!irc_buffer)
+		return -ENOMEM;
+
+	/* If no port given, default to standard irc port */
+	if (ports_c == 0)
+		ports[ports_c++] = IRC_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		irc[i].tuple.src.l3num = AF_INET;
+		irc[i].tuple.src.u.tcp.port = htons(ports[i]);
+		irc[i].tuple.dst.protonum = IPPROTO_TCP;
+		irc[i].expect_policy = &irc_exp_policy;
+		irc[i].me = THIS_MODULE;
+		irc[i].help = help;
+
+		if (ports[i] == IRC_PORT)
+			sprintf(irc[i].name, "irc");
+		else
+			sprintf(irc[i].name, "irc-%u", i);
+
+		ret = nf_conntrack_helper_register(&irc[i]);
+		if (ret) {
+			printk(KERN_ERR "nf_ct_irc: failed to register helper "
+					"for pf: %u port: %u\n",
+			       irc[i].tuple.src.l3num, ports[i]);
+			/* Only irc[0..i-1] made it into the helper hash.
+			 * Trim ports_c so the cleanup does not unlink
+			 * helpers that were never registered. */
+			ports_c = i;
+			nf_conntrack_irc_fini();
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/* This function is intentionally _NOT_ defined as __exit, because
+ * it is needed by the init function.
+ *
+ * Unregisters helpers irc[0] .. irc[ports_c - 1] and frees the shared
+ * payload buffer. */
+static void nf_conntrack_irc_fini(void)
+{
+ int i;
+
+ for (i = 0; i < ports_c; i++)
+ nf_conntrack_helper_unregister(&irc[i]);
+ kfree(irc_buffer);
+}
+
+module_init(nf_conntrack_irc_init);
+module_exit(nf_conntrack_irc_fini);
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
index 7de4f06c63c..e7eb807fe07 100644
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -15,7 +15,6 @@
* Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
@@ -27,35 +26,27 @@
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat);
-
-static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
+static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
{
memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
- return 1;
+ return true;
}
-static int generic_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
- return 1;
+ return true;
}
static int generic_print_tuple(struct seq_file *s,
@@ -64,35 +55,20 @@ static int generic_print_tuple(struct seq_file *s,
return 0;
}
-static int generic_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
-static int
-generic_prepare(struct sk_buff **pskb, unsigned int hooknum,
- unsigned int *dataoff, u_int8_t *protonum)
+static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ unsigned int *dataoff, u_int8_t *protonum)
{
/* Never track !!! */
return -NF_ACCEPT;
}
-static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple)
-
-{
- return NF_CT_F_BASIC;
-}
-
-struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = {
+struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
.l3proto = PF_UNSPEC,
.name = "unknown",
.pkt_to_tuple = generic_pkt_to_tuple,
.invert_tuple = generic_invert_tuple,
.print_tuple = generic_print_tuple,
- .print_conntrack = generic_print_conntrack,
- .prepare = generic_prepare,
- .get_features = generic_get_features,
- .me = THIS_MODULE,
+ .get_l4proto = generic_get_l4proto,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
new file mode 100644
index 00000000000..bb53f120e79
--- /dev/null
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -0,0 +1,108 @@
+/*
+ * test/set flag bits stored in conntrack extension area.
+ *
+ * (C) 2013 Astaro GmbH & Co KG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+static unsigned int label_bits(const struct nf_conn_labels *l)
+{
+ unsigned int longs = l->words;
+ return longs * BITS_PER_LONG;
+}
+
+bool nf_connlabel_match(const struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return false;
+
+ return bit < label_bits(labels) && test_bit(bit, labels->bits);
+}
+EXPORT_SYMBOL_GPL(nf_connlabel_match);
+
+int nf_connlabel_set(struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels || bit >= label_bits(labels))
+ return -ENOSPC;
+
+ if (test_bit(bit, labels->bits))
+ return 0;
+
+ if (!test_and_set_bit(bit, labels->bits))
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabel_set);
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static void replace_u32(u32 *address, u32 mask, u32 new)
+{
+ u32 old, tmp;
+
+ do {
+ old = *address;
+ tmp = (old & mask) ^ new;
+ } while (cmpxchg(address, old, tmp) != old);
+}
+
+int nf_connlabels_replace(struct nf_conn *ct,
+ const u32 *data,
+ const u32 *mask, unsigned int words32)
+{
+ struct nf_conn_labels *labels;
+ unsigned int size, i;
+ u32 *dst;
+
+ labels = nf_ct_labels_find(ct);
+ if (!labels)
+ return -ENOSPC;
+
+ size = labels->words * sizeof(long);
+ if (size < (words32 * sizeof(u32)))
+ words32 = size / sizeof(u32);
+
+ dst = (u32 *) labels->bits;
+ if (words32) {
+ for (i = 0; i < words32; i++)
+ replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
+ }
+
+ size /= sizeof(u32);
+ for (i = words32; i < size; i++) /* pad */
+ replace_u32(&dst[i], 0, 0);
+
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_replace);
+#endif
+
+static struct nf_ct_ext_type labels_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_labels),
+ .align = __alignof__(struct nf_conn_labels),
+ .id = NF_CT_EXT_LABELS,
+};
+
+int nf_conntrack_labels_init(void)
+{
+ return nf_ct_extend_register(&labels_extend);
+}
+
+void nf_conntrack_labels_fini(void)
+{
+ nf_ct_extend_unregister(&labels_extend);
+}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
new file mode 100644
index 00000000000..4c8f30a3d6d
--- /dev/null
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -0,0 +1,71 @@
+/*
+ * NetBIOS name service broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/*
+ * This helper tracks locally originating NetBIOS name service
+ * requests by issuing permanent expectations (valid until
+ * timing out) matching all reply connections from the
+ * destination network. The only NetBIOS specific thing is
+ * actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define NMBD_PORT 137
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_netbios_ns");
+MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+
+static unsigned int timeout __read_mostly = 3;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "netbios-ns",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = netbios_ns_help,
+ .expect_policy = &exp_policy,
+};
+
+static int __init nf_conntrack_netbios_ns_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_netbios_ns_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_netbios_ns_init);
+module_exit(nf_conntrack_netbios_ns_fini);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9ff3463037e..300ed1eec72 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2,575 +2,930 @@
* protocol helpers and general trouble making from userspace.
*
* (C) 2001 by Jay Schulist <jschlst@samba.org>
- * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
* (C) 2003 by Patrick Mchardy <kaber@trash.net>
- * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>
*
- * I've reworked this stuff to use attributes instead of conntrack
- * structures. 5.44 am. I need more tea. --pablo 05/07/11.
- *
- * Initial connection tracking via netlink development funded and
+ * Initial connection tracking via netlink development funded and
* generally made possible by Network Robots, Inc. (www.networkrobots.com)
*
* Further development of this code funded by Astaro AG (http://www.astaro.com)
*
* This software may be used and distributed according to the terms
* of the GNU General Public License, incorporated herein by reference.
- *
- * Derived from ip_conntrack_netlink.c: Port by Pablo Neira Ayuso (05/11/14)
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
#include <linux/types.h>
#include <linux/timer.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
-#include <linux/notifier.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_helper.h>
+#endif
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
MODULE_LICENSE("GPL");
-static char __initdata version[] = "0.92";
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
+static char __initdata version[] = "0.93";
static inline int
-ctnetlink_dump_tuples_proto(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l4proto *l4proto)
{
- struct nf_conntrack_protocol *proto;
int ret = 0;
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum))
+ goto nla_put_failure;
- NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+ if (likely(l4proto->tuple_to_nlattr))
+ ret = l4proto->tuple_to_nlattr(skb, tuple);
- /* If no protocol helper is found, this function will return the
- * generic protocol helper, so proto won't *ever* be NULL */
- proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum);
- if (likely(proto->tuple_to_nfattr))
- ret = proto->tuple_to_nfattr(skb, tuple);
-
- nf_ct_proto_put(proto);
+ nla_nest_end(skb, nest_parms);
return ret;
-nfattr_failure:
+nla_put_failure:
return -1;
}
static inline int
-ctnetlink_dump_tuples(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
+ctnetlink_dump_tuples_ip(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto)
{
- struct nfattr *nest_parms;
- struct nf_conntrack_l3proto *l3proto;
int ret = 0;
-
- l3proto = nf_ct_l3proto_find_get(tuple->src.l3num);
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
- if (likely(l3proto->tuple_to_nfattr))
- ret = l3proto->tuple_to_nfattr(skb, tuple);
- NFA_NEST_END(skb, nest_parms);
+ struct nlattr *nest_parms;
- nf_ct_l3proto_put(l3proto);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
- if (unlikely(ret < 0))
- return ret;
+ if (likely(l3proto->tuple_to_nlattr))
+ ret = l3proto->tuple_to_nlattr(skb, tuple);
- nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
- ret = ctnetlink_dump_tuples_proto(skb, tuple);
- NFA_NEST_END(skb, nest_parms);
+ nla_nest_end(skb, nest_parms);
return ret;
-nfattr_failure:
+nla_put_failure:
return -1;
}
+static int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ int ret;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+ ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
+
+ if (ret >= 0) {
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
+ tuple->dst.protonum);
+ ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
static inline int
ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
- u_int32_t status = htonl((u_int32_t) ct->status);
- NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+ if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
+ goto nla_put_failure;
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
{
- long timeout_l = ct->timeout.expires - jiffies;
- u_int32_t timeout;
+ long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
- if (timeout_l < 0)
+ if (timeout < 0)
timeout = 0;
- else
- timeout = htonl(timeout_l / HZ);
-
- NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+
+ if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
+ goto nla_put_failure;
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
{
- struct nf_conntrack_protocol *proto = nf_ct_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
- struct nfattr *nest_proto;
+ struct nf_conntrack_l4proto *l4proto;
+ struct nlattr *nest_proto;
int ret;
- if (!proto->to_nfattr) {
- nf_ct_proto_put(proto);
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (!l4proto->to_nlattr)
return 0;
- }
-
- nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
- ret = proto->to_nfattr(skb, nest_proto, ct);
+ nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED);
+ if (!nest_proto)
+ goto nla_put_failure;
- nf_ct_proto_put(proto);
+ ret = l4proto->to_nlattr(skb, nest_proto, ct);
- NFA_NEST_END(skb, nest_proto);
+ nla_nest_end(skb, nest_proto);
return ret;
-nfattr_failure:
+nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
{
- struct nfattr *nest_helper;
+ struct nlattr *nest_helper;
+ const struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_helper *helper;
- if (!ct->helper)
+ if (!help)
return 0;
-
- nest_helper = NFA_NEST(skb, CTA_HELP);
- NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name);
- if (ct->helper->to_nfattr)
- ct->helper->to_nfattr(skb, ct);
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ goto out;
+
+ nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);
+ if (!nest_helper)
+ goto nla_put_failure;
+ if (nla_put_string(skb, CTA_HELP_NAME, helper->name))
+ goto nla_put_failure;
- NFA_NEST_END(skb, nest_helper);
+ if (helper->to_nlattr)
+ helper->to_nlattr(skb, ct);
+ nla_nest_end(skb, nest_helper);
+out:
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
-#ifdef CONFIG_NF_CT_ACCT
-static inline int
-ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
- enum ip_conntrack_dir dir)
+static int
+dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
+ enum ip_conntrack_dir dir, int type)
{
- enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
- struct nfattr *nest_count = NFA_NEST(skb, type);
- u_int32_t tmp;
+ enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nf_conn_counter *counter = acct->counter;
+ struct nlattr *nest_count;
+ u64 pkts, bytes;
+
+ if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&counter[dir].packets, 0);
+ bytes = atomic64_xchg(&counter[dir].bytes, 0);
+ } else {
+ pkts = atomic64_read(&counter[dir].packets);
+ bytes = atomic64_read(&counter[dir].bytes);
+ }
- tmp = htonl(ct->counters[dir].packets);
- NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
+ nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
- tmp = htonl(ct->counters[dir].bytes);
- NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
+ if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) ||
+ nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)))
+ goto nla_put_failure;
- NFA_NEST_END(skb, nest_count);
+ nla_nest_end(skb, nest_count);
return 0;
-nfattr_failure:
+nla_put_failure:
+ return -1;
+}
+
+static int
+ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)
+{
+ struct nf_conn_acct *acct = nf_conn_acct_find(ct);
+
+ if (!acct)
+ return 0;
+
+ if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0)
+ return -1;
+ if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_count;
+ const struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (!tstamp)
+ return 0;
+
+ nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) ||
+ (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP,
+ cpu_to_be64(tstamp->stop))))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_count);
+
+ return 0;
+
+nla_put_failure:
return -1;
}
-#else
-#define ctnetlink_dump_counters(a, b, c) (0)
-#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
static inline int
ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
{
- u_int32_t mark = htonl(ct->mark);
-
- NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+ if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
+ goto nla_put_failure;
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
#else
#define ctnetlink_dump_mark(a, b) (0)
#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static inline int
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_secctx;
+ int len, ret;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return 0;
+
+ ret = -1;
+ nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+ if (!nest_secctx)
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_secctx);
+
+ ret = 0;
+nla_put_failure:
+ security_release_secctx(secctx, len);
+ return ret;
+}
+#else
+#define ctnetlink_dump_secctx(a, b) (0)
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+static int ctnetlink_label_size(const struct nf_conn *ct)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return 0;
+ return nla_total_size(labels->words * sizeof(long));
+}
+
+static int
+ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+ unsigned int len, i;
+
+ if (!labels)
+ return 0;
+
+ len = labels->words * sizeof(long);
+ i = 0;
+ do {
+ if (labels->bits[i] != 0)
+ return nla_put(skb, CTA_LABELS, len, labels->bits);
+ i++;
+ } while (i < labels->words);
+
+ return 0;
+}
+#else
+#define ctnetlink_dump_labels(a, b) (0)
+#define ctnetlink_label_size(a) (0)
+#endif
+
+#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+
+static inline int
+ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ if (!(ct->status & IPS_EXPECTED))
+ return 0;
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
+ htonl(seq->correction_pos)) ||
+ nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
+ htonl(seq->offset_before)) ||
+ nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
+ htonl(seq->offset_after)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *seq;
+
+ if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
+ return 0;
+
+ seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
+ if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
+ return -1;
+
+ seq = &seqadj->seq[IP_CT_DIR_REPLY];
+ if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
+ return -1;
+
+ return 0;
+}
+
static inline int
ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
- u_int32_t id = htonl(ct->id);
- NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+ if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
+ goto nla_put_failure;
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
- u_int32_t use = htonl(atomic_read(&ct->ct_general.use));
-
- NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+ if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ goto nla_put_failure;
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
-#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
-
static int
-ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
- int event, int nowait,
- const struct nf_conn *ct)
+ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ struct nf_conn *ct)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- struct nfattr *nest_parms;
- unsigned char *b;
+ struct nlattr *nest_parms;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
- b = skb->tail;
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
- event |= NFNL_SUBSYS_CTNETLINK << 8;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
-
- nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
- nfmsg->nfgen_family =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = nf_ct_l3num(ct);
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
- nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct) &&
+ nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0 ||
ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_secctx(skb, ct) < 0 ||
+ ctnetlink_dump_labels(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
- ctnetlink_dump_use(skb, ct) < 0)
- goto nfattr_failure;
+ ctnetlink_dump_use(skb, ct) < 0 ||
+ ctnetlink_dump_master(skb, ct) < 0 ||
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
- nlh->nlmsg_len = skb->tail - b;
+ nlmsg_end(skb, nlh);
return skb->len;
nlmsg_failure:
-nfattr_failure:
- skb_trim(skb, b - skb->data);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
return -1;
}
+static inline size_t
+ctnetlink_proto_size(const struct nf_conn *ct)
+{
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ size_t len = 0;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+ len += l3proto->nla_size;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ len += l4proto->nla_size;
+ rcu_read_unlock();
+
+ return len;
+}
+
+static inline size_t
+ctnetlink_acct_size(const struct nf_conn *ct)
+{
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
+ return 0;
+ return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */
+ + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
+ + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
+ ;
+}
+
+static inline int
+ctnetlink_secctx_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ int len, ret;
+
+ ret = security_secid_to_secctx(ct->secmark, NULL, &len);
+ if (ret)
+ return 0;
+
+ return nla_total_size(0) /* CTA_SECCTX */
+ + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+#else
+ return 0;
+#endif
+}
+
+static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ return 0;
+ return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#else
+ return 0;
+#endif
+}
+
+static inline size_t
+ctnetlink_nlmsg_size(const struct nf_conn *ct)
+{
+ return NLMSG_ALIGN(sizeof(struct nfgenmsg))
+ + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ + ctnetlink_acct_size(ct)
+ + ctnetlink_timestamp_size(ct)
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ + nla_total_size(0) /* CTA_PROTOINFO */
+ + nla_total_size(0) /* CTA_HELP */
+ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ + ctnetlink_secctx_size(ct)
+#ifdef CONFIG_NF_NAT_NEEDED
+ + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+ + ctnetlink_proto_size(ct)
+ + ctnetlink_label_size(ct)
+ ;
+}
+
#ifdef CONFIG_NF_CONNTRACK_EVENTS
-static int ctnetlink_conntrack_event(struct notifier_block *this,
- unsigned long events, void *ptr)
+static int
+ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
+ struct net *net;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- struct nfattr *nest_parms;
- struct nf_conn *ct = (struct nf_conn *)ptr;
+ struct nlattr *nest_parms;
+ struct nf_conn *ct = item->ct;
struct sk_buff *skb;
unsigned int type;
- unsigned char *b;
unsigned int flags = 0, group;
+ int err;
/* ignore our fake conntrack entry */
- if (ct == &nf_conntrack_untracked)
- return NOTIFY_DONE;
+ if (nf_ct_is_untracked(ct))
+ return 0;
- if (events & IPCT_DESTROY) {
+ if (events & (1 << IPCT_DESTROY)) {
type = IPCTNL_MSG_CT_DELETE;
group = NFNLGRP_CONNTRACK_DESTROY;
- } else if (events & (IPCT_NEW | IPCT_RELATED)) {
+ } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
type = IPCTNL_MSG_CT_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
- /* dump everything */
- events = ~0UL;
group = NFNLGRP_CONNTRACK_NEW;
- } else if (events & (IPCT_STATUS |
- IPCT_PROTOINFO |
- IPCT_HELPER |
- IPCT_HELPINFO |
- IPCT_NATINFO)) {
+ } else if (events) {
type = IPCTNL_MSG_CT_NEW;
group = NFNLGRP_CONNTRACK_UPDATE;
} else
- return NOTIFY_DONE;
-
- /* FIXME: Check if there are any listeners before, don't hurt performance */
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
- if (!skb)
- return NOTIFY_DONE;
+ return 0;
+
+ net = nf_ct_net(ct);
+ if (!item->report && !nfnetlink_has_listeners(net, group))
+ return 0;
- b = skb->tail;
+ skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
type |= NFNL_SUBSYS_CTNETLINK << 8;
- nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
- nlh->nlmsg_flags = flags;
- nfmsg->nfgen_family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = nf_ct_l3num(ct);
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
- nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
- if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
- goto nfattr_failure;
- NFA_NEST_END(skb, nest_parms);
-
- /* NAT stuff is now a status flag */
- if ((events & IPCT_STATUS || events & IPCT_NATINFO)
- && ctnetlink_dump_status(skb, ct) < 0)
- goto nfattr_failure;
- if (events & IPCT_REFRESH
- && ctnetlink_dump_timeout(skb, ct) < 0)
- goto nfattr_failure;
- if (events & IPCT_PROTOINFO
- && ctnetlink_dump_protoinfo(skb, ct) < 0)
- goto nfattr_failure;
- if (events & IPCT_HELPINFO
- && ctnetlink_dump_helpinfo(skb, ct) < 0)
- goto nfattr_failure;
-
- if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
- goto nfattr_failure;
-
- nlh->nlmsg_len = skb->tail - b;
- nfnetlink_send(skb, 0, group, 0);
- return NOTIFY_DONE;
+ rcu_read_lock();
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct) &&
+ nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_id(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_status(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_DESTROY)) {
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
+ goto nla_put_failure;
+ } else {
+ if (ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_PROTOINFO)
+ && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
+ && ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ if ((events & (1 << IPCT_SECMARK) || ct->secmark)
+ && ctnetlink_dump_secctx(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (events & (1 << IPCT_LABEL) &&
+ ctnetlink_dump_labels(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_RELATED) &&
+ ctnetlink_dump_master(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_SEQADJ) &&
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
+ }
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if ((events & (1 << IPCT_MARK) || ct->mark)
+ && ctnetlink_dump_mark(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ rcu_read_unlock();
+
+ nlmsg_end(skb, nlh);
+ err = nfnetlink_send(skb, net, item->portid, group, item->report,
+ GFP_ATOMIC);
+ if (err == -ENOBUFS || err == -EAGAIN)
+ return -ENOBUFS;
+
+ return 0;
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_cancel(skb, nlh);
nlmsg_failure:
-nfattr_failure:
kfree_skb(skb);
- return NOTIFY_DONE;
+errout:
+ if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
+ return -ENOBUFS;
+
+ return 0;
}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */
static int ctnetlink_done(struct netlink_callback *cb)
{
- DEBUGP("entered %s\n", __FUNCTION__);
+ if (cb->args[1])
+ nf_ct_put((struct nf_conn *)cb->args[1]);
+ if (cb->data)
+ kfree(cb->data);
return 0;
}
-#define L3PROTO(ct) ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num
+struct ctnetlink_dump_filter {
+ struct {
+ u_int32_t val;
+ u_int32_t mask;
+ } mark;
+};
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conn *ct = NULL;
+ struct net *net = sock_net(skb->sk);
+ struct nf_conn *ct, *last;
struct nf_conntrack_tuple_hash *h;
- struct list_head *i;
- u_int32_t *id = (u_int32_t *) &cb->args[1];
- struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh);
+ struct hlist_nulls_node *n;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
+ int res;
+ spinlock_t *lockp;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ const struct ctnetlink_dump_filter *filter = cb->data;
+#endif
- DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
- cb->args[0], *id);
+ last = (struct nf_conn *)cb->args[1];
- read_lock_bh(&nf_conntrack_lock);
- for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) {
- list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) {
- h = (struct nf_conntrack_tuple_hash *) i;
- if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ local_bh_disable();
+ for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+restart:
+ lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+ spin_lock(lockp);
+ if (cb->args[0] >= net->ct.htable_size) {
+ spin_unlock(lockp);
+ goto out;
+ }
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
+ hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
- if (l3proto && L3PROTO(ct) != l3proto)
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
continue;
- if (ct->id <= *id)
+ if (cb->args[1]) {
+ if (ct != last)
+ continue;
+ cb->args[1] = 0;
+ }
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (filter && !((ct->mark & filter->mark.mask) ==
+ filter->mark.val)) {
continue;
- if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW,
- 1, ct) < 0)
+ }
+#endif
+ rcu_read_lock();
+ res =
+ ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general);
+ cb->args[1] = (unsigned long)ct;
+ spin_unlock(lockp);
goto out;
- *id = ct->id;
+ }
}
- }
-out:
- read_unlock_bh(&nf_conntrack_lock);
-
- DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
-
- return skb->len;
-}
-
-#ifdef CONFIG_NF_CT_ACCT
-static int
-ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct nf_conn *ct = NULL;
- struct nf_conntrack_tuple_hash *h;
- struct list_head *i;
- u_int32_t *id = (u_int32_t *) &cb->args[1];
- struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh);
- u_int8_t l3proto = nfmsg->nfgen_family;
-
- DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__,
- cb->args[0], *id);
-
- write_lock_bh(&nf_conntrack_lock);
- for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) {
- list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) {
- h = (struct nf_conntrack_tuple_hash *) i;
- if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (l3proto && L3PROTO(ct) != l3proto)
- continue;
- if (ct->id <= *id)
- continue;
- if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW,
- 1, ct) < 0)
- goto out;
- *id = ct->id;
-
- memset(&ct->counters, 0, sizeof(ct->counters));
+ spin_unlock(lockp);
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
}
}
-out:
- write_unlock_bh(&nf_conntrack_lock);
-
- DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+out:
+ local_bh_enable();
+ if (last)
+ nf_ct_put(last);
return skb->len;
}
-#endif
static inline int
-ctnetlink_parse_tuple_ip(struct nfattr *attr, struct nf_conntrack_tuple *tuple)
+ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
{
- struct nfattr *tb[CTA_IP_MAX];
+ struct nlattr *tb[CTA_IP_MAX+1];
struct nf_conntrack_l3proto *l3proto;
int ret = 0;
- DEBUGP("entered %s\n", __FUNCTION__);
-
- nfattr_parse_nested(tb, CTA_IP_MAX, attr);
-
- l3proto = nf_ct_l3proto_find_get(tuple->src.l3num);
+ ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
+ if (ret < 0)
+ return ret;
- if (likely(l3proto->nfattr_to_tuple))
- ret = l3proto->nfattr_to_tuple(tb, tuple);
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
- nf_ct_l3proto_put(l3proto);
+ if (likely(l3proto->nlattr_to_tuple)) {
+ ret = nla_validate_nested(attr, CTA_IP_MAX,
+ l3proto->nla_policy);
+ if (ret == 0)
+ ret = l3proto->nlattr_to_tuple(tb, tuple);
+ }
- DEBUGP("leaving\n");
+ rcu_read_unlock();
return ret;
}
-static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_NUM-1] = sizeof(u_int8_t),
+static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_NUM] = { .type = NLA_U8 },
};
static inline int
-ctnetlink_parse_tuple_proto(struct nfattr *attr,
+ctnetlink_parse_tuple_proto(struct nlattr *attr,
struct nf_conntrack_tuple *tuple)
{
- struct nfattr *tb[CTA_PROTO_MAX];
- struct nf_conntrack_protocol *proto;
+ struct nlattr *tb[CTA_PROTO_MAX+1];
+ struct nf_conntrack_l4proto *l4proto;
int ret = 0;
- DEBUGP("entered %s\n", __FUNCTION__);
-
- nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
+ ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy);
+ if (ret < 0)
+ return ret;
- if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+ if (!tb[CTA_PROTO_NUM])
return -EINVAL;
+ tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
- if (!tb[CTA_PROTO_NUM-1])
- return -EINVAL;
- tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+ rcu_read_lock();
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
- proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum);
+ if (likely(l4proto->nlattr_to_tuple)) {
+ ret = nla_validate_nested(attr, CTA_PROTO_MAX,
+ l4proto->nla_policy);
+ if (ret == 0)
+ ret = l4proto->nlattr_to_tuple(tb, tuple);
+ }
- if (likely(proto->nfattr_to_tuple))
- ret = proto->nfattr_to_tuple(tb, tuple);
+ rcu_read_unlock();
- nf_ct_proto_put(proto);
-
return ret;
}
-static inline int
-ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple,
- enum ctattr_tuple type, u_int8_t l3num)
+static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
+ [CTA_TUPLE_IP] = { .type = NLA_NESTED },
+ [CTA_TUPLE_PROTO] = { .type = NLA_NESTED },
+};
+
+static int
+ctnetlink_parse_tuple(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple,
+ enum ctattr_type type, u_int8_t l3num)
{
- struct nfattr *tb[CTA_TUPLE_MAX];
+ struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
- DEBUGP("entered %s\n", __FUNCTION__);
-
memset(tuple, 0, sizeof(*tuple));
- nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]);
+ err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
+ if (err < 0)
+ return err;
- if (!tb[CTA_TUPLE_IP-1])
+ if (!tb[CTA_TUPLE_IP])
return -EINVAL;
tuple->src.l3num = l3num;
- err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
if (err < 0)
return err;
- if (!tb[CTA_TUPLE_PROTO-1])
+ if (!tb[CTA_TUPLE_PROTO])
return -EINVAL;
- err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple);
if (err < 0)
return err;
@@ -580,219 +935,173 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple,
else
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- NF_CT_DUMP_TUPLE(tuple);
-
- DEBUGP("leaving\n");
-
return 0;
}
-#ifdef CONFIG_IP_NF_NAT_NEEDED
-static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = {
- [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
- [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
-};
-
-static int ctnetlink_parse_nat_proto(struct nfattr *attr,
- const struct nf_conn *ct,
- struct ip_nat_range *range)
+static int
+ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
{
- struct nfattr *tb[CTA_PROTONAT_MAX];
- struct ip_nat_protocol *npt;
-
- DEBUGP("entered %s\n", __FUNCTION__);
-
- nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr);
-
- if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
- return -EINVAL;
-
- npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
-
- if (!npt->nfattr_to_range) {
- ip_nat_proto_put(npt);
- return 0;
- }
-
- /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
- if (npt->nfattr_to_range(tb, range) > 0)
- range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
-
- ip_nat_proto_put(npt);
+ if (attr)
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ *zone = ntohs(nla_get_be16(attr));
+#else
+ return -EOPNOTSUPP;
+#endif
+ else
+ *zone = 0;
- DEBUGP("leaving\n");
return 0;
}
-static const size_t cta_min_nat[CTA_NAT_MAX] = {
- [CTA_NAT_MINIP-1] = sizeof(u_int32_t),
- [CTA_NAT_MAXIP-1] = sizeof(u_int32_t),
+static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
+ [CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN - 1 },
};
static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
- const struct nf_conn *ct, struct ip_nat_range *range)
+ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
+ struct nlattr **helpinfo)
{
- struct nfattr *tb[CTA_NAT_MAX];
int err;
+ struct nlattr *tb[CTA_HELP_MAX+1];
- DEBUGP("entered %s\n", __FUNCTION__);
-
- memset(range, 0, sizeof(*range));
-
- nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
-
- if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
- return -EINVAL;
-
- if (tb[CTA_NAT_MINIP-1])
- range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
-
- if (!tb[CTA_NAT_MAXIP-1])
- range->max_ip = range->min_ip;
- else
- range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
-
- if (range->min_ip)
- range->flags |= IP_NAT_RANGE_MAP_IPS;
-
- if (!tb[CTA_NAT_PROTO-1])
- return 0;
-
- err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+ err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
if (err < 0)
return err;
- DEBUGP("leaving\n");
- return 0;
-}
-#endif
-
-static inline int
-ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
-{
- struct nfattr *tb[CTA_HELP_MAX];
-
- DEBUGP("entered %s\n", __FUNCTION__);
-
- nfattr_parse_nested(tb, CTA_HELP_MAX, attr);
-
- if (!tb[CTA_HELP_NAME-1])
+ if (!tb[CTA_HELP_NAME])
return -EINVAL;
- *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+ *helper_name = nla_data(tb[CTA_HELP_NAME]);
+
+ if (tb[CTA_HELP_INFO])
+ *helpinfo = tb[CTA_HELP_INFO];
return 0;
}
-static const size_t cta_min[CTA_MAX] = {
- [CTA_STATUS-1] = sizeof(u_int32_t),
- [CTA_TIMEOUT-1] = sizeof(u_int32_t),
- [CTA_MARK-1] = sizeof(u_int32_t),
- [CTA_USE-1] = sizeof(u_int32_t),
- [CTA_ID-1] = sizeof(u_int32_t)
+static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
+ [CTA_TUPLE_ORIG] = { .type = NLA_NESTED },
+ [CTA_TUPLE_REPLY] = { .type = NLA_NESTED },
+ [CTA_STATUS] = { .type = NLA_U32 },
+ [CTA_PROTOINFO] = { .type = NLA_NESTED },
+ [CTA_HELP] = { .type = NLA_NESTED },
+ [CTA_NAT_SRC] = { .type = NLA_NESTED },
+ [CTA_TIMEOUT] = { .type = NLA_U32 },
+ [CTA_MARK] = { .type = NLA_U32 },
+ [CTA_ID] = { .type = NLA_U32 },
+ [CTA_NAT_DST] = { .type = NLA_NESTED },
+ [CTA_TUPLE_MASTER] = { .type = NLA_NESTED },
+ [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED },
+ [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
+ [CTA_ZONE] = { .type = NLA_U16 },
+ [CTA_MARK_MASK] = { .type = NLA_U32 },
+ [CTA_LABELS] = { .type = NLA_BINARY,
+ .len = NF_CT_LABELS_MAX_SIZE },
+ [CTA_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = NF_CT_LABELS_MAX_SIZE },
};
static int
-ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
+ struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- int err = 0;
-
- DEBUGP("entered %s\n", __FUNCTION__);
+ u16 zone;
+ int err;
- if (nfattr_bad_size(cda, CTA_MAX, cta_min))
- return -EINVAL;
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+ if (err < 0)
+ return err;
- if (cda[CTA_TUPLE_ORIG-1])
+ if (cda[CTA_TUPLE_ORIG])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
- else if (cda[CTA_TUPLE_REPLY-1])
+ else if (cda[CTA_TUPLE_REPLY])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
else {
/* Flush the whole table */
- nf_conntrack_flush();
+ nf_conntrack_flush_report(net,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
return 0;
}
if (err < 0)
return err;
- h = nf_conntrack_find_get(&tuple, NULL);
- if (!h) {
- DEBUGP("tuple not found in conntrack hash\n");
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
return -ENOENT;
- }
ct = nf_ct_tuplehash_to_ctrack(h);
-
- if (cda[CTA_ID-1]) {
- u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
- if (ct->id != id) {
+
+ if (cda[CTA_ID]) {
+ u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
+ if (id != (u32)(unsigned long)ct) {
nf_ct_put(ct);
return -ENOENT;
}
- }
+ }
+
if (del_timer(&ct->timeout))
- ct->timeout.function((unsigned long)ct);
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
nf_ct_put(ct);
- DEBUGP("leaving\n");
return 0;
}
static int
-ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
+ struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
struct sk_buff *skb2 = NULL;
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- int err = 0;
-
- DEBUGP("entered %s\n", __FUNCTION__);
+ u16 zone;
+ int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- u32 rlen;
-
- if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
- IPCTNL_MSG_CT_GET_CTRZERO) {
-#ifdef CONFIG_NF_CT_ACCT
- if ((*errp = netlink_dump_start(ctnl, skb, nlh,
- ctnetlink_dump_table_w,
- ctnetlink_done)) != 0)
- return -EINVAL;
-#else
- return -ENOTSUPP;
-#endif
- } else {
- if ((*errp = netlink_dump_start(ctnl, skb, nlh,
- ctnetlink_dump_table,
- ctnetlink_done)) != 0)
- return -EINVAL;
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_table,
+ .done = ctnetlink_done,
+ };
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
+ struct ctnetlink_dump_filter *filter;
+
+ filter = kzalloc(sizeof(struct ctnetlink_dump_filter),
+ GFP_ATOMIC);
+ if (filter == NULL)
+ return -ENOMEM;
+
+ filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
+ filter->mark.mask =
+ ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ c.data = filter;
}
-
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- skb_pull(skb, rlen);
- return 0;
+#endif
+ return netlink_dump_start(ctnl, skb, nlh, &c);
}
- if (nfattr_bad_size(cda, CTA_MAX, cta_min))
- return -EINVAL;
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+ if (err < 0)
+ return err;
- if (cda[CTA_TUPLE_ORIG-1])
+ if (cda[CTA_TUPLE_ORIG])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
- else if (cda[CTA_TUPLE_REPLY-1])
+ else if (cda[CTA_TUPLE_REPLY])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
else
return -EINVAL;
@@ -800,151 +1109,320 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- h = nf_conntrack_find_get(&tuple, NULL);
- if (!h) {
- DEBUGP("tuple not found in conntrack hash");
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
return -ENOENT;
- }
- DEBUGP("tuple found\n");
+
ct = nf_ct_tuplehash_to_ctrack(h);
err = -ENOMEM;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb2) {
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
nf_ct_put(ct);
return -ENOMEM;
}
- NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
- err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
- IPCTNL_MSG_CT_NEW, 1, ct);
+ rcu_read_lock();
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type), ct);
+ rcu_read_unlock();
nf_ct_put(ct);
if (err <= 0)
goto free;
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
if (err < 0)
goto out;
- DEBUGP("leaving\n");
return 0;
free:
kfree_skb(skb2);
out:
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static int ctnetlink_done_list(struct netlink_callback *cb)
+{
+ if (cb->args[1])
+ nf_ct_put((struct nf_conn *)cb->args[1]);
+ return 0;
+}
+
+static int
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+{
+ struct nf_conn *ct, *last;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+ int res;
+ int cpu;
+ struct hlist_nulls_head *list;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct nf_conn *)cb->args[1];
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ struct ct_pcpu *pcpu;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+ spin_lock_bh(&pcpu->lock);
+ list = dying ? &pcpu->dying : &pcpu->unconfirmed;
+restart:
+ hlist_nulls_for_each_entry(h, n, list, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (ct != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ rcu_read_lock();
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+ if (!atomic_inc_not_zero(&ct->ct_general.use))
+ continue;
+ cb->args[0] = cpu;
+ cb->args[1] = (unsigned long)ct;
+ spin_unlock_bh(&pcpu->lock);
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ spin_unlock_bh(&pcpu->lock);
+ }
+ cb->args[2] = 1;
+out:
+ if (last)
+ nf_ct_put(last);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ctnetlink_dump_list(skb, cb, true);
+}
+
+static int
+ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_dying,
+ .done = ctnetlink_done_list,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ctnetlink_dump_list(skb, cb, false);
+}
+
+static int
+ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_unconfirmed,
+ .done = ctnetlink_done_list,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+static int
+ctnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+ int err;
+
+ parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
+ if (!parse_nat_setup) {
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ if (request_module("nf-nat") < 0) {
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ return -EOPNOTSUPP;
+ }
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ if (nfnetlink_parse_nat_setup_hook)
+ return -EAGAIN;
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ err = parse_nat_setup(ct, manip, attr);
+ if (err == -EAGAIN) {
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ return -EOPNOTSUPP;
+ }
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+#else
+ err = -EOPNOTSUPP;
+#endif
+ }
return err;
}
+#endif
-static inline int
-ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[])
+static int
+ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
unsigned long d;
- unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]));
+ unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
d = ct->status ^ status;
if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
/* unchangeable */
- return -EINVAL;
-
+ return -EBUSY;
+
if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
/* SEEN_REPLY bit can only be set */
- return -EINVAL;
+ return -EBUSY;
-
if (d & IPS_ASSURED && !(status & IPS_ASSURED))
/* ASSURED bit can only be set */
- return -EINVAL;
-
- if (cda[CTA_NAT-1]) {
-#ifndef CONFIG_IP_NF_NAT_NEEDED
- return -EINVAL;
-#else
- unsigned int hooknum;
- struct ip_nat_range range;
-
- if (ctnetlink_parse_nat(cda, ct, &range) < 0)
- return -EINVAL;
-
- DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n",
- NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
- htons(range.min.all), htons(range.max.all));
-
- /* This is tricky but it works. ip_nat_setup_info needs the
- * hook number as parameter, so let's do the correct
- * conversion and run away */
- if (status & IPS_SRC_NAT_DONE)
- hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
- else if (status & IPS_DST_NAT_DONE)
- hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */
- else
- return -EINVAL; /* Missing NAT flags */
-
- DEBUGP("NAT status: %lu\n",
- status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-
- if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
- return -EEXIST;
- ip_nat_setup_info(ct, &range, hooknum);
-
- DEBUGP("NAT status after setup_info: %lu\n",
- ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-#endif
- }
+ return -EBUSY;
/* Be careful here, modifying NAT bits can screw up things,
* so don't let users modify them directly if they don't pass
- * ip_nat_range. */
+ * nf_nat_range. */
ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
return 0;
}
+static int
+ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_NAT_NEEDED
+ int ret;
+
+ if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+ return 0;
+
+ ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST,
+ cda[CTA_NAT_DST]);
+ if (ret < 0)
+ return ret;
+
+ ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC,
+ cda[CTA_NAT_SRC]);
+ return ret;
+#else
+ if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+ return 0;
+ return -EOPNOTSUPP;
+#endif
+}
static inline int
-ctnetlink_change_helper(struct nf_conn *ct, struct nfattr *cda[])
+ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
{
struct nf_conntrack_helper *helper;
- char *helpname;
+ struct nf_conn_help *help = nfct_help(ct);
+ char *helpname = NULL;
+ struct nlattr *helpinfo = NULL;
int err;
- DEBUGP("entered %s\n", __FUNCTION__);
-
/* don't change helper of sibling connections */
if (ct->master)
- return -EINVAL;
+ return -EBUSY;
- err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
+ err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
if (err < 0)
return err;
- helper = __nf_conntrack_helper_find_byname(helpname);
- if (!helper) {
- if (!strcmp(helpname, ""))
- helper = NULL;
- else
- return -EINVAL;
- }
-
- if (ct->helper) {
- if (!helper) {
+ if (!strcmp(helpname, "")) {
+ if (help && help->helper) {
/* we had a helper before ... */
nf_ct_remove_expectations(ct);
- ct->helper = NULL;
- } else {
- /* need to zero data of old helper */
- memset(&ct->help, 0, sizeof(ct->help));
+ RCU_INIT_POINTER(help->helper, NULL);
}
+
+ return 0;
}
-
- ct->helper = helper;
- return 0;
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+#ifdef CONFIG_MODULES
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ return -EOPNOTSUPP;
+ }
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper)
+ return -EAGAIN;
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ if (help) {
+ if (help->helper == helper) {
+ /* update private helper data if allowed. */
+ if (helper->from_nlattr)
+ helper->from_nlattr(helpinfo, ct);
+ return 0;
+ } else
+ return -EBUSY;
+ }
+
+ /* we cannot set a helper for an existing conntrack */
+ return -EOPNOTSUPP;
}
static inline int
-ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[])
+ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[])
{
- u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
-
+ u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
+
if (!del_timer(&ct->timeout))
return -ETIME;
@@ -954,376 +1432,1210 @@ ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[])
return 0;
}
+static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
+ [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
+ [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
+ [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
+};
+
static inline int
-ctnetlink_change_protoinfo(struct nf_conn *ct, struct nfattr *cda[])
+ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[])
{
- struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
- struct nf_conntrack_protocol *proto;
- u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
- u_int16_t l3num = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ const struct nlattr *attr = cda[CTA_PROTOINFO];
+ struct nlattr *tb[CTA_PROTOINFO_MAX+1];
+ struct nf_conntrack_l4proto *l4proto;
int err = 0;
- nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
-
- proto = nf_ct_proto_find_get(l3num, npt);
+ err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
+ if (err < 0)
+ return err;
- if (proto->from_nfattr)
- err = proto->from_nfattr(tb, ct);
- nf_ct_proto_put(proto);
+ rcu_read_lock();
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->from_nlattr)
+ err = l4proto->from_nlattr(tb, ct);
+ rcu_read_unlock();
return err;
}
+static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
+ [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 },
+ [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 },
+ [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 },
+};
+
+static inline int
+change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
+{
+ int err;
+ struct nlattr *cda[CTA_SEQADJ_MAX+1];
+
+ err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy);
+ if (err < 0)
+ return err;
+
+ if (!cda[CTA_SEQADJ_CORRECTION_POS])
+ return -EINVAL;
+
+ seq->correction_pos =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));
+
+ if (!cda[CTA_SEQADJ_OFFSET_BEFORE])
+ return -EINVAL;
+
+ seq->offset_before =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));
+
+ if (!cda[CTA_SEQADJ_OFFSET_AFTER])
+ return -EINVAL;
+
+ seq->offset_after =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));
+
+ return 0;
+}
+
static int
-ctnetlink_change_conntrack(struct nf_conn *ct, struct nfattr *cda[])
+ctnetlink_change_seq_adj(struct nf_conn *ct,
+ const struct nlattr * const cda[])
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ int ret = 0;
+
+ if (!seqadj)
+ return 0;
+
+ if (cda[CTA_SEQ_ADJ_ORIG]) {
+ ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
+ cda[CTA_SEQ_ADJ_ORIG]);
+ if (ret < 0)
+ return ret;
+
+ ct->status |= IPS_SEQ_ADJUST;
+ }
+
+ if (cda[CTA_SEQ_ADJ_REPLY]) {
+ ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
+ cda[CTA_SEQ_ADJ_REPLY]);
+ if (ret < 0)
+ return ret;
+
+ ct->status |= IPS_SEQ_ADJUST;
+ }
+
+ return 0;
+}
+
+static int
+ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ size_t len = nla_len(cda[CTA_LABELS]);
+ const void *mask = cda[CTA_LABELS_MASK];
+
+ if (len & (sizeof(u32)-1)) /* must be multiple of u32 */
+ return -EINVAL;
+
+ if (mask) {
+ if (nla_len(cda[CTA_LABELS_MASK]) == 0 ||
+ nla_len(cda[CTA_LABELS_MASK]) != len)
+ return -EINVAL;
+ mask = nla_data(cda[CTA_LABELS_MASK]);
+ }
+
+ len /= sizeof(u32);
+
+ return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int
+ctnetlink_change_conntrack(struct nf_conn *ct,
+ const struct nlattr * const cda[])
{
int err;
- DEBUGP("entered %s\n", __FUNCTION__);
+ /* only allow NAT changes and master assignation for new conntracks */
+ if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER])
+ return -EOPNOTSUPP;
- if (cda[CTA_HELP-1]) {
+ if (cda[CTA_HELP]) {
err = ctnetlink_change_helper(ct, cda);
if (err < 0)
return err;
}
- if (cda[CTA_TIMEOUT-1]) {
+ if (cda[CTA_TIMEOUT]) {
err = ctnetlink_change_timeout(ct, cda);
if (err < 0)
return err;
}
- if (cda[CTA_STATUS-1]) {
+ if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
return err;
}
- if (cda[CTA_PROTOINFO-1]) {
+ if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
return err;
}
-#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (cda[CTA_MARK-1])
- ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK])
+ ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
#endif
- DEBUGP("all done\n");
+ if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+ err = ctnetlink_change_seq_adj(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_LABELS]) {
+ err = ctnetlink_attach_labels(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
return 0;
}
-static int
-ctnetlink_create_conntrack(struct nfattr *cda[],
+static struct nf_conn *
+ctnetlink_create_conntrack(struct net *net, u16 zone,
+ const struct nlattr * const cda[],
struct nf_conntrack_tuple *otuple,
- struct nf_conntrack_tuple *rtuple)
+ struct nf_conntrack_tuple *rtuple,
+ u8 u3)
{
struct nf_conn *ct;
int err = -EINVAL;
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_tstamp *tstamp;
- DEBUGP("entered %s\n", __FUNCTION__);
-
- ct = nf_conntrack_alloc(otuple, rtuple);
- if (ct == NULL || IS_ERR(ct))
- return -ENOMEM;
+ ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
+ if (IS_ERR(ct))
+ return ERR_PTR(-ENOMEM);
- if (!cda[CTA_TIMEOUT-1])
- goto err;
- ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+ if (!cda[CTA_TIMEOUT])
+ goto err1;
+ ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
- ct->status |= IPS_CONFIRMED;
- err = ctnetlink_change_status(ct, cda);
+ rcu_read_lock();
+ if (cda[CTA_HELP]) {
+ char *helpname = NULL;
+ struct nlattr *helpinfo = NULL;
+
+ err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
+ if (err < 0)
+ goto err2;
+
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+ rcu_read_unlock();
+#ifdef CONFIG_MODULES
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ err = -EOPNOTSUPP;
+ goto err1;
+ }
+
+ rcu_read_lock();
+ helper = __nf_conntrack_helper_find(helpname,
+ nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper) {
+ err = -EAGAIN;
+ goto err2;
+ }
+ rcu_read_unlock();
+#endif
+ err = -EOPNOTSUPP;
+ goto err1;
+ } else {
+ struct nf_conn_help *help;
+
+ help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
+ if (help == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+ /* set private helper data if allowed. */
+ if (helper->from_nlattr)
+ helper->from_nlattr(helpinfo, ct);
+
+ /* not in hash table yet so not strictly necessary */
+ RCU_INIT_POINTER(help->helper, helper);
+ }
+ } else {
+ /* try an implicit helper assignation */
+ err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
+ if (err < 0)
+ goto err2;
+ }
+
+ err = ctnetlink_setup_nat(ct, cda);
if (err < 0)
- goto err;
+ goto err2;
+
+ nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+ nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
+ nf_ct_labels_ext_add(ct);
+
+ /* we must add conntrack extensions before confirmation. */
+ ct->status |= IPS_CONFIRMED;
+
+ if (cda[CTA_STATUS]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ goto err2;
+ }
+
+ if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+ err = ctnetlink_change_seq_adj(ct, cda);
+ if (err < 0)
+ goto err2;
+ }
- if (cda[CTA_PROTOINFO-1]) {
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
- return err;
+ goto err2;
}
-#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (cda[CTA_MARK-1])
- ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK])
+ ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
#endif
- ct->helper = nf_ct_helper_find_get(rtuple);
+ /* setup master conntrack: this is a confirmed expectation */
+ if (cda[CTA_TUPLE_MASTER]) {
+ struct nf_conntrack_tuple master;
+ struct nf_conntrack_tuple_hash *master_h;
+ struct nf_conn *master_ct;
- add_timer(&ct->timeout);
- nf_conntrack_hash_insert(ct);
+ err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
+ if (err < 0)
+ goto err2;
- if (ct->helper)
- nf_ct_helper_put(ct->helper);
+ master_h = nf_conntrack_find_get(net, zone, &master);
+ if (master_h == NULL) {
+ err = -ENOENT;
+ goto err2;
+ }
+ master_ct = nf_ct_tuplehash_to_ctrack(master_h);
+ __set_bit(IPS_EXPECTED_BIT, &ct->status);
+ ct->master = master_ct;
+ }
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp)
+ tstamp->start = ktime_to_ns(ktime_get_real());
- DEBUGP("conntrack with id %u inserted\n", ct->id);
- return 0;
+ err = nf_conntrack_hash_check_insert(ct);
+ if (err < 0)
+ goto err2;
+
+ rcu_read_unlock();
+
+ return ct;
-err:
+err2:
+ rcu_read_unlock();
+err1:
nf_conntrack_free(ct);
- return err;
+ return ERR_PTR(err);
}
-static int
-ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
+ struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nf_conn *ct;
u_int8_t u3 = nfmsg->nfgen_family;
- int err = 0;
-
- DEBUGP("entered %s\n", __FUNCTION__);
+ u16 zone;
+ int err;
- if (nfattr_bad_size(cda, CTA_MAX, cta_min))
- return -EINVAL;
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+ if (err < 0)
+ return err;
- if (cda[CTA_TUPLE_ORIG-1]) {
+ if (cda[CTA_TUPLE_ORIG]) {
err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
if (err < 0)
return err;
}
- if (cda[CTA_TUPLE_REPLY-1]) {
+ if (cda[CTA_TUPLE_REPLY]) {
err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
if (err < 0)
return err;
}
- write_lock_bh(&nf_conntrack_lock);
- if (cda[CTA_TUPLE_ORIG-1])
- h = __nf_conntrack_find(&otuple, NULL);
- else if (cda[CTA_TUPLE_REPLY-1])
- h = __nf_conntrack_find(&rtuple, NULL);
+ if (cda[CTA_TUPLE_ORIG])
+ h = nf_conntrack_find_get(net, zone, &otuple);
+ else if (cda[CTA_TUPLE_REPLY])
+ h = nf_conntrack_find_get(net, zone, &rtuple);
if (h == NULL) {
- write_unlock_bh(&nf_conntrack_lock);
- DEBUGP("no such conntrack, create new\n");
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE)
- err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+ if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ enum ip_conntrack_events events;
+
+ if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
+ return -EINVAL;
+
+ ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
+ &rtuple, u3);
+ if (IS_ERR(ct))
+ return PTR_ERR(ct);
+
+ err = 0;
+ if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+ events = IPCT_RELATED;
+ else
+ events = IPCT_NEW;
+
+ if (cda[CTA_LABELS] &&
+ ctnetlink_attach_labels(ct, cda) == 0)
+ events |= (1 << IPCT_LABEL);
+
+ nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
+ (1 << IPCT_ASSURED) |
+ (1 << IPCT_HELPER) |
+ (1 << IPCT_PROTOINFO) |
+ (1 << IPCT_SEQADJ) |
+ (1 << IPCT_MARK) | events,
+ ct, NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_put(ct);
+ }
+
return err;
}
/* implicit 'else' */
- /* we only allow nat config for new conntracks */
- if (cda[CTA_NAT-1]) {
- err = -EINVAL;
- goto out_unlock;
- }
-
- /* We manipulate the conntrack inside the global conntrack table lock,
- * so there's no need to increase the refcount */
- DEBUGP("conntrack found\n");
err = -EEXIST;
- if (!(nlh->nlmsg_flags & NLM_F_EXCL))
- err = ctnetlink_change_conntrack(nf_ct_tuplehash_to_ctrack(h), cda);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ err = ctnetlink_change_conntrack(ct, cda);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ if (err == 0) {
+ nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
+ (1 << IPCT_ASSURED) |
+ (1 << IPCT_HELPER) |
+ (1 << IPCT_LABEL) |
+ (1 << IPCT_PROTOINFO) |
+ (1 << IPCT_SEQADJ) |
+ (1 << IPCT_MARK),
+ ct, NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
+ }
-out_unlock:
- write_unlock_bh(&nf_conntrack_lock);
+ nf_ct_put(ct);
return err;
}
-/***********************************************************************
- * EXPECT
- ***********************************************************************/
+static int
+ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
+ __u16 cpu, const struct ip_conntrack_stat *st)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(cpu);
+
+ if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
+ nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
+ nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+ nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
+ nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
+ nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
+ nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
+ nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
+ nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
+ htonl(st->insert_failed)) ||
+ nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) ||
+ nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
+ nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
+ nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
+ htonl(st->search_restart)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int cpu;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[0] == nr_cpu_ids)
+ return 0;
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ const struct ip_conntrack_stat *st;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ st = per_cpu_ptr(net->ct.stat, cpu);
+ if (ctnetlink_ct_stat_cpu_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ cpu, st) < 0)
+ break;
+ }
+ cb->args[0] = cpu;
+
+ return skb->len;
+}
+
+static int
+ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_ct_stat_cpu_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return 0;
+}
+
+static int
+ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ struct net *net)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct sk_buff *skb2;
+ int err;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ sock_net(skb->sk));
+ if (err <= 0)
+ goto free;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ return 0;
+
+free:
+ kfree_skb(skb2);
+out:
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
+ [CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
+ [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
+ [CTA_EXPECT_MASK] = { .type = NLA_NESTED },
+ [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
+ [CTA_EXPECT_ID] = { .type = NLA_U32 },
+ [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN - 1 },
+ [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
+ [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
+ [CTA_EXPECT_CLASS] = { .type = NLA_U32 },
+ [CTA_EXPECT_NAT] = { .type = NLA_NESTED },
+ [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING },
+};
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
+ struct nf_conntrack_helper *helper,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask);
+
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+static size_t
+ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+{
+ return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ + nla_total_size(0) /* CTA_PROTOINFO */
+ + nla_total_size(0) /* CTA_HELP */
+ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ + ctnetlink_secctx_size(ct)
+#ifdef CONFIG_NF_NAT_NEEDED
+ + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+ + ctnetlink_proto_size(ct)
+ ;
+}
+
+static int
+ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ rcu_read_lock();
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct)) {
+ if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
+ }
+
+ if (ctnetlink_dump_id(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_status(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (ct->master && ctnetlink_dump_master(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if ((ct->status & IPS_SEQ_ADJUST) &&
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (ctnetlink_dump_labels(skb, ct) < 0)
+ goto nla_put_failure;
+ rcu_read_unlock();
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ return -ENOSPC;
+}
+
+static int
+ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
+{
+ int err;
+
+ if (cda[CTA_TIMEOUT]) {
+ err = ctnetlink_change_timeout(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_STATUS]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_HELP]) {
+ err = ctnetlink_change_helper(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_LABELS]) {
+ err = ctnetlink_attach_labels(ct, cda);
+ if (err < 0)
+ return err;
+ }
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK]) {
+ u32 mask = 0, mark, newmark;
+ if (cda[CTA_MARK_MASK])
+ mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+
+ mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+ newmark = (ct->mark & mask) ^ mark;
+ if (newmark != ct->mark)
+ ct->mark = newmark;
+ }
+#endif
+ return 0;
+}
+
+static int
+ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
+{
+ struct nlattr *cda[CTA_MAX+1];
+ int ret;
+
+ ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ return ret;
+}
+
+static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
+ const struct nf_conn *ct,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
+{
+ int err;
+
+ err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
+ nf_ct_l3num(ct));
+ if (err < 0)
+ return err;
+
+ return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
+ nf_ct_l3num(ct));
+}
+
+static int
+ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+ u32 portid, u32 report)
+{
+ struct nlattr *cda[CTA_EXPECT_MAX+1];
+ struct nf_conntrack_tuple tuple, mask;
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conntrack_expect *exp;
+ int err;
+
+ err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy);
+ if (err < 0)
+ return err;
+
+ err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
+ ct, &tuple, &mask);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL)
+ return -EOPNOTSUPP;
+ }
+
+ exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
+ helper, &tuple, &mask);
+ if (IS_ERR(exp))
+ return PTR_ERR(exp);
+
+ err = nf_ct_expect_related_report(exp, portid, report);
+ if (err < 0) {
+ nf_ct_expect_put(exp);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
+ .build_size = ctnetlink_nfqueue_build_size,
+ .build = ctnetlink_nfqueue_build,
+ .parse = ctnetlink_nfqueue_parse,
+ .attach_expect = ctnetlink_nfqueue_attach_expect,
+ .seq_adjust = nf_ct_tcp_seqadj_set,
+};
+#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
+
+/***********************************************************************
+ * EXPECT
+ ***********************************************************************/
static inline int
ctnetlink_exp_dump_tuple(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
enum ctattr_expect type)
{
- struct nfattr *nest_parms = NFA_NEST(skb, type);
-
- if (ctnetlink_dump_tuples(skb, tuple) < 0)
- goto nfattr_failure;
+ struct nlattr *nest_parms;
- NFA_NEST_END(skb, nest_parms);
+ nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, tuple) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
-}
+}
static inline int
+ctnetlink_exp_dump_mask(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple_mask *mask)
+{
+ int ret;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ struct nf_conntrack_tuple m;
+ struct nlattr *nest_parms;
+
+ memset(&m, 0xFF, sizeof(m));
+ memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
+ m.src.u.all = mask->src.u.all;
+ m.dst.protonum = tuple->dst.protonum;
+
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+ ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
+ if (ret >= 0) {
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
+ tuple->dst.protonum);
+ ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
+ }
+ rcu_read_unlock();
+
+ if (unlikely(ret < 0))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const union nf_inet_addr any_addr;
+
+static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
- const struct nf_conntrack_expect *exp)
+ const struct nf_conntrack_expect *exp)
{
struct nf_conn *master = exp->master;
- u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
- u_int32_t id = htonl(exp->id);
+ long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
+ struct nf_conn_help *help;
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nlattr *nest_parms;
+ struct nf_conntrack_tuple nat_tuple = {};
+#endif
+ struct nf_ct_helper_expectfn *expfn;
+
+ if (timeout < 0)
+ timeout = 0;
if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
- goto nfattr_failure;
- if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
- goto nfattr_failure;
+ goto nla_put_failure;
+ if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
+ goto nla_put_failure;
if (ctnetlink_exp_dump_tuple(skb,
&master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
CTA_EXPECT_MASTER) < 0)
- goto nfattr_failure;
-
- NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
- NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
+ exp->saved_proto.all) {
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)))
+ goto nla_put_failure;
+
+ nat_tuple.src.l3num = nf_ct_l3num(master);
+ nat_tuple.src.u3 = exp->saved_addr;
+ nat_tuple.dst.protonum = nf_ct_protonum(master);
+ nat_tuple.src.u = exp->saved_proto;
+
+ if (ctnetlink_exp_dump_tuple(skb, &nat_tuple,
+ CTA_EXPECT_NAT_TUPLE) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+ }
+#endif
+ if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
+ nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
+ nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
+ nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
+ goto nla_put_failure;
+ help = nfct_help(master);
+ if (help) {
+ struct nf_conntrack_helper *helper;
+
+ helper = rcu_dereference(help->helper);
+ if (helper &&
+ nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name))
+ goto nla_put_failure;
+ }
+ expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn);
+ if (expfn != NULL &&
+ nla_put_string(skb, CTA_EXPECT_FN, expfn->name))
+ goto nla_put_failure;
return 0;
-
-nfattr_failure:
+
+nla_put_failure:
return -1;
}
static int
-ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
- int event,
- int nowait,
- const struct nf_conntrack_expect *exp)
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, const struct nf_conntrack_expect *exp)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- unsigned char *b;
-
- b = skb->tail;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
- nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = exp->tuple.src.l3num;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
- goto nfattr_failure;
+ goto nla_put_failure;
- nlh->nlmsg_len = skb->tail - b;
+ nlmsg_end(skb, nlh);
return skb->len;
nlmsg_failure:
-nfattr_failure:
- skb_trim(skb, b - skb->data);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
return -1;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
-static int ctnetlink_expect_event(struct notifier_block *this,
- unsigned long events, void *ptr)
+static int
+ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
{
+ struct nf_conntrack_expect *exp = item->exp;
+ struct net *net = nf_ct_exp_net(exp);
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- struct nf_conntrack_expect *exp = (struct nf_conntrack_expect *)ptr;
struct sk_buff *skb;
- unsigned int type;
- unsigned char *b;
+ unsigned int type, group;
int flags = 0;
- if (events & IPEXP_NEW) {
+ if (events & (1 << IPEXP_DESTROY)) {
+ type = IPCTNL_MSG_EXP_DELETE;
+ group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+ } else if (events & (1 << IPEXP_NEW)) {
type = IPCTNL_MSG_EXP_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
+ group = NFNLGRP_CONNTRACK_EXP_NEW;
} else
- return NOTIFY_DONE;
+ return 0;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
- if (!skb)
- return NOTIFY_DONE;
+ if (!item->report && !nfnetlink_has_listeners(net, group))
+ return 0;
- b = skb->tail;
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
- nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
- nlh->nlmsg_flags = flags;
+ nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = exp->tuple.src.l3num;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
+ rcu_read_lock();
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
- goto nfattr_failure;
+ goto nla_put_failure;
+ rcu_read_unlock();
- nlh->nlmsg_len = skb->tail - b;
- nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
- return NOTIFY_DONE;
+ nlmsg_end(skb, nlh);
+ nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC);
+ return 0;
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_cancel(skb, nlh);
nlmsg_failure:
-nfattr_failure:
kfree_skb(skb);
- return NOTIFY_DONE;
+errout:
+ nfnetlink_set_err(net, 0, 0, -ENOBUFS);
+ return 0;
}
#endif
+static int ctnetlink_exp_done(struct netlink_callback *cb)
+{
+ if (cb->args[1])
+ nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
+ return 0;
+}
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conntrack_expect *exp = NULL;
- struct list_head *i;
- u_int32_t *id = (u_int32_t *) &cb->args[0];
- struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_expect *)cb->args[1];
+ for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
+restart:
+ hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
+ hnode) {
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ }
+out:
+ rcu_read_unlock();
+ if (last)
+ nf_ct_expect_put(last);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nf_conn *ct = cb->data;
+ struct nf_conn_help *help = nfct_help(ct);
u_int8_t l3proto = nfmsg->nfgen_family;
- DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+ if (cb->args[0])
+ return 0;
- read_lock_bh(&nf_conntrack_lock);
- list_for_each_prev(i, &nf_conntrack_expect_list) {
- exp = (struct nf_conntrack_expect *) i;
+ rcu_read_lock();
+ last = (struct nf_conntrack_expect *)cb->args[1];
+restart:
+ hlist_for_each_entry(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
- if (exp->id <= *id)
- continue;
- if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
- 1, exp) < 0)
+ exp) < 0) {
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
goto out;
- *id = exp->id;
+ }
}
-out:
- read_unlock_bh(&nf_conntrack_lock);
-
- DEBUGP("leaving, last id=%llu\n", *id);
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ cb->args[0] = 1;
+out:
+ rcu_read_unlock();
+ if (last)
+ nf_ct_expect_put(last);
return skb->len;
}
-static const size_t cta_min_exp[CTA_EXPECT_MAX] = {
- [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t),
- [CTA_EXPECT_ID-1] = sizeof(u_int32_t)
-};
+static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ int err;
+ struct net *net = sock_net(ctnl);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ u16 zone = 0;
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_ct_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+ }
+
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return -ENOENT;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ c.data = ct;
+
+ err = netlink_dump_start(ctnl, skb, nlh, &c);
+ nf_ct_put(ct);
+
+ return err;
+}
static int
-ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
+ struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
struct sk_buff *skb2;
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- int err = 0;
-
- DEBUGP("entered %s\n", __FUNCTION__);
-
- if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
- return -EINVAL;
+ u16 zone;
+ int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- u32 rlen;
-
- if ((*errp = netlink_dump_start(ctnl, skb, nlh,
- ctnetlink_exp_dump_table,
- ctnetlink_done)) != 0)
- return -EINVAL;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- skb_pull(skb, rlen);
- return 0;
+ if (cda[CTA_EXPECT_MASTER])
+ return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
+ else {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
}
- if (cda[CTA_EXPECT_MASTER-1])
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_TUPLE])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ else if (cda[CTA_EXPECT_MASTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
else
return -EINVAL;
@@ -1331,130 +2643,272 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- exp = nf_conntrack_expect_find(&tuple);
+ exp = nf_ct_expect_find_get(net, zone, &tuple);
if (!exp)
return -ENOENT;
- if (cda[CTA_EXPECT_ID-1]) {
- u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
- if (exp->id != ntohl(id)) {
- nf_conntrack_expect_put(exp);
+ if (cda[CTA_EXPECT_ID]) {
+ __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+ if (ntohl(id) != (u32)(unsigned long)exp) {
+ nf_ct_expect_put(exp);
return -ENOENT;
}
- }
+ }
err = -ENOMEM;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb2)
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ nf_ct_expect_put(exp);
goto out;
- NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
-
- err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
- nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
- 1, exp);
+ }
+
+ rcu_read_lock();
+ err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+ rcu_read_unlock();
+ nf_ct_expect_put(exp);
if (err <= 0)
goto free;
- nf_conntrack_expect_put(exp);
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
- return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ return 0;
free:
kfree_skb(skb2);
out:
- nf_conntrack_expect_put(exp);
- return err;
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
}
static int
-ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct nf_conntrack_expect *exp, *tmp;
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
- struct nf_conntrack_helper *h;
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct hlist_node *next;
u_int8_t u3 = nfmsg->nfgen_family;
+ unsigned int i;
+ u16 zone;
int err;
- if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
- return -EINVAL;
-
- if (cda[CTA_EXPECT_TUPLE-1]) {
+ if (cda[CTA_EXPECT_TUPLE]) {
/* delete a single expect by tuple */
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
if (err < 0)
return err;
/* bump usage count to 2 */
- exp = nf_conntrack_expect_find(&tuple);
+ exp = nf_ct_expect_find_get(net, zone, &tuple);
if (!exp)
return -ENOENT;
- if (cda[CTA_EXPECT_ID-1]) {
- u_int32_t id =
- *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
- if (exp->id != ntohl(id)) {
- nf_conntrack_expect_put(exp);
+ if (cda[CTA_EXPECT_ID]) {
+ __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+ if (ntohl(id) != (u32)(unsigned long)exp) {
+ nf_ct_expect_put(exp);
return -ENOENT;
}
}
/* after list removal, usage count == 1 */
- nf_conntrack_unexpect_related(exp);
- /* have to put what we 'get' above.
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ /* have to put what we 'get' above.
* after this line usage count == 0 */
- nf_conntrack_expect_put(exp);
- } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
- char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+ nf_ct_expect_put(exp);
+ } else if (cda[CTA_EXPECT_HELP_NAME]) {
+ char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+ struct nf_conn_help *m_help;
/* delete all expectations for this helper */
- write_lock_bh(&nf_conntrack_lock);
- h = __nf_conntrack_helper_find_byname(name);
- if (!h) {
- write_unlock_bh(&nf_conntrack_lock);
- return -EINVAL;
- }
- list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list,
- list) {
- if (exp->master->helper == h
- && del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
- nf_conntrack_expect_put(exp);
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ for (i = 0; i < nf_ct_expect_hsize; i++) {
+ hlist_for_each_entry_safe(exp, next,
+ &net->ct.expect_hash[i],
+ hnode) {
+ m_help = nfct_help(exp->master);
+ if (!strcmp(m_help->helper->name, name) &&
+ del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
}
}
- write_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
} else {
/* This basically means we have to flush everything*/
- write_lock_bh(&nf_conntrack_lock);
- list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list,
- list) {
- if (del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
- nf_conntrack_expect_put(exp);
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ for (i = 0; i < nf_ct_expect_hsize; i++) {
+ hlist_for_each_entry_safe(exp, next,
+ &net->ct.expect_hash[i],
+ hnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
}
}
- write_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
return 0;
}
static int
-ctnetlink_change_expect(struct nf_conntrack_expect *x, struct nfattr *cda[])
+ctnetlink_change_expect(struct nf_conntrack_expect *x,
+ const struct nlattr * const cda[])
{
+ if (cda[CTA_EXPECT_TIMEOUT]) {
+ if (!del_timer(&x->timeout))
+ return -ETIME;
+
+ x->timeout.expires = jiffies +
+ ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+ add_timer(&x->timeout);
+ }
+ return 0;
+}
+
+static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
+ [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
+ [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
+};
+
+static int
+ctnetlink_parse_expect_nat(const struct nlattr *attr,
+ struct nf_conntrack_expect *exp,
+ u_int8_t u3)
+{
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nlattr *tb[CTA_EXPECT_NAT_MAX+1];
+ struct nf_conntrack_tuple nat_tuple = {};
+ int err;
+
+ err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
+ &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
+ if (err < 0)
+ return err;
+
+ exp->saved_addr = nat_tuple.src.u3;
+ exp->saved_proto = nat_tuple.src.u;
+ exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR]));
+
+ return 0;
+#else
return -EOPNOTSUPP;
+#endif
+}
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
+ struct nf_conntrack_helper *helper,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
+{
+ u_int32_t class = 0;
+ struct nf_conntrack_expect *exp;
+ struct nf_conn_help *help;
+ int err;
+
+ if (cda[CTA_EXPECT_CLASS] && helper) {
+ class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
+ if (class > helper->expect_class_max)
+ return ERR_PTR(-EINVAL);
+ }
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return ERR_PTR(-ENOMEM);
+
+ help = nfct_help(ct);
+ if (!help) {
+ if (!cda[CTA_EXPECT_TIMEOUT]) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ exp->timeout.expires =
+ jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+ exp->flags = NF_CT_EXPECT_USERSPACE;
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags |=
+ ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ }
+ } else {
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+ } else
+ exp->flags = 0;
+ }
+ if (cda[CTA_EXPECT_FN]) {
+ const char *name = nla_data(cda[CTA_EXPECT_FN]);
+ struct nf_ct_helper_expectfn *expfn;
+
+ expfn = nf_ct_helper_expectfn_find_by_name(name);
+ if (expfn == NULL) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ exp->expectfn = expfn->expectfn;
+ } else
+ exp->expectfn = NULL;
+
+ exp->class = class;
+ exp->master = ct;
+ exp->helper = helper;
+ exp->tuple = *tuple;
+ exp->mask.src.u3 = mask->src.u3;
+ exp->mask.src.u.all = mask->src.u.all;
+
+ if (cda[CTA_EXPECT_NAT]) {
+ err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
+ exp, nf_ct_l3num(ct));
+ if (err < 0)
+ goto err_out;
+ }
+ return exp;
+err_out:
+ nf_ct_expect_put(exp);
+ return ERR_PTR(err);
}
+/*
+ * Create a new expectation from a netlink request: look up the master
+ * conntrack, optionally resolve the named helper, then allocate and
+ * register the expectation. Returns 0 on success or a negative errno.
+ */
static int
-ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3)
+ctnetlink_create_expect(struct net *net, u16 zone,
+ const struct nlattr * const cda[],
+ u_int8_t u3, u32 portid, int report)
{
struct nf_conntrack_tuple tuple, mask, master_tuple;
struct nf_conntrack_tuple_hash *h = NULL;
+ struct nf_conntrack_helper *helper = NULL;
struct nf_conntrack_expect *exp;
struct nf_conn *ct;
- int err = 0;
-
- DEBUGP("entered %s\n", __FUNCTION__);
+ int err;
/* caller guarantees that those three CTA_EXPECT_* exist */
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
@@ -1468,168 +2922,312 @@ ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3)
return err;
/* Look for master conntrack of this expectation */
- h = nf_conntrack_find_get(&master_tuple, NULL);
+ h = nf_conntrack_find_get(net, zone, &master_tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (!ct->helper) {
- /* such conntrack hasn't got any helper, abort */
- err = -EINVAL;
- goto out;
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ helper = __nf_conntrack_helper_find(helpname, u3,
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+#ifdef CONFIG_MODULES
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ err = -EOPNOTSUPP;
+ goto err_ct;
+ }
+ helper = __nf_conntrack_helper_find(helpname, u3,
+ nf_ct_protonum(ct));
+ if (helper) {
+ err = -EAGAIN;
+ goto err_ct;
+ }
+#endif
+ err = -EOPNOTSUPP;
+ goto err_ct;
+ }
}
- exp = nf_conntrack_expect_alloc(ct);
- if (!exp) {
- err = -ENOMEM;
- goto out;
+ exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
+ if (IS_ERR(exp)) {
+ err = PTR_ERR(exp);
+ goto err_ct;
}
-
- exp->expectfn = NULL;
- exp->flags = 0;
- exp->master = ct;
- memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
- memcpy(&exp->mask, &mask, sizeof(struct nf_conntrack_tuple));
- err = nf_conntrack_expect_related(exp);
- nf_conntrack_expect_put(exp);
+ err = nf_ct_expect_related_report(exp, portid, report);
+ /*
+ * Drop our allocation reference whether or not registration
+ * succeeded, as the pre-patch code did; returning early on success
+ * would leak both the expectation and the master conntrack reference.
+ */
+ nf_ct_expect_put(exp);
-out:
- nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
+err_ct:
+ /* release the reference obtained by nf_conntrack_find_get() */
+ nf_ct_put(ct);
return err;
}
static int
ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
+ struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- int err = 0;
-
- DEBUGP("entered %s\n", __FUNCTION__);
+ u16 zone;
+ int err;
- if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ if (!cda[CTA_EXPECT_TUPLE]
+ || !cda[CTA_EXPECT_MASK]
+ || !cda[CTA_EXPECT_MASTER])
return -EINVAL;
- if (!cda[CTA_EXPECT_TUPLE-1]
- || !cda[CTA_EXPECT_MASK-1]
- || !cda[CTA_EXPECT_MASTER-1])
- return -EINVAL;
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
if (err < 0)
return err;
- write_lock_bh(&nf_conntrack_lock);
- exp = __nf_conntrack_expect_find(&tuple);
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ exp = __nf_ct_expect_find(net, zone, &tuple);
if (!exp) {
- write_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE)
- err = ctnetlink_create_expect(cda, u3);
+ if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ err = ctnetlink_create_expect(net, zone, cda,
+ u3,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
return err;
}
err = -EEXIST;
if (!(nlh->nlmsg_flags & NLM_F_EXCL))
err = ctnetlink_change_expect(exp, cda);
- write_unlock_bh(&nf_conntrack_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
- DEBUGP("leaving\n");
-
return err;
}
+/*
+ * Append one per-CPU expectation statistics message to @skb.
+ *
+ * Returns skb->len on success, or -1 after cancelling the partially
+ * built message when the skb has no room left.
+ */
+static int
+ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
+			     const struct ip_conntrack_stat *st)
+{
+	unsigned int event, flags = 0;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+
+	/* a non-zero portid marks the message as part of a multipart reply */
+	if (portid)
+		flags = NLM_F_MULTI;
+
+	event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (!nlh)
+		goto cancel;
+
+	/* the CPU number travels in the nfgenmsg res_id field */
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(cpu);
+
+	if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)))
+		goto cancel;
+	if (nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)))
+		goto cancel;
+	if (nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
+		goto cancel;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+/*
+ * Netlink dump callback: emit one expectation statistics message per
+ * possible CPU. cb->args[0] holds the next CPU to dump so the walk can
+ * resume when the skb fills up.
+ */
+static int
+ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int cpu = cb->args[0];
+
+	/* every CPU has already been reported */
+	if (cb->args[0] == nr_cpu_ids)
+		return 0;
+
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			const struct ip_conntrack_stat *st;
+
+			st = per_cpu_ptr(net->ct.stat, cpu);
+			/* out of room: stop here and resume from this CPU */
+			if (ctnetlink_exp_stat_fill_info(skb,
+						NETLINK_CB(cb->skb).portid,
+						cb->nlh->nlmsg_seq,
+						cpu, st) < 0)
+				break;
+		}
+		cpu++;
+	}
+	cb->args[0] = cpu;
+
+	return skb->len;
+}
+
+/*
+ * IPCTNL_MSG_EXP_GET_STATS_CPU handler: start a per-CPU statistics dump
+ * when NLM_F_DUMP is set; any other request is accepted and ignored.
+ */
+static int
+ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
+		       const struct nlmsghdr *nlh,
+		       const struct nlattr * const cda[])
+{
+	struct netlink_dump_control c = {
+		.dump = ctnetlink_exp_stat_cpu_dump,
+	};
+
+	if (!(nlh->nlmsg_flags & NLM_F_DUMP))
+		return 0;
+
+	return netlink_dump_start(ctnl, skb, nlh, &c);
+}
+
#ifdef CONFIG_NF_CONNTRACK_EVENTS
-static struct notifier_block ctnl_notifier = {
- .notifier_call = ctnetlink_conntrack_event,
+static struct nf_ct_event_notifier ctnl_notifier = {
+ .fcn = ctnetlink_conntrack_event,
};
-static struct notifier_block ctnl_notifier_exp = {
- .notifier_call = ctnetlink_expect_event,
+static struct nf_exp_event_notifier ctnl_notifier_exp = {
+ .fcn = ctnetlink_expect_event,
};
#endif
-static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX, },
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
[IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX, },
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
[IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX, },
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
[IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX, },
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
+ [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
+ [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
+ [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
+ [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
};
-static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
[IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX, },
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy },
[IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX, },
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy },
[IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX, },
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy },
+ [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
};
-static struct nfnetlink_subsystem ctnl_subsys = {
+static const struct nfnetlink_subsystem ctnl_subsys = {
.name = "conntrack",
.subsys_id = NFNL_SUBSYS_CTNETLINK,
.cb_count = IPCTNL_MSG_MAX,
.cb = ctnl_cb,
};
-static struct nfnetlink_subsystem ctnl_exp_subsys = {
+static const struct nfnetlink_subsystem ctnl_exp_subsys = {
.name = "conntrack_expect",
.subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
.cb_count = IPCTNL_MSG_EXP_MAX,
.cb = ctnl_exp_cb,
};
+MODULE_ALIAS("ip_conntrack_netlink");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
+/*
+ * Per-namespace setup: register the conntrack and expectation event
+ * notifiers when event support is built in. Returns 0 on success or the
+ * negative error from the failing registration.
+ */
+static int __net_init ctnetlink_net_init(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	int ret;
+
+	ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot register notifier.\n");
+		return ret;
+	}
+
+	ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot expect register notifier.\n");
+		/* undo the first registration before bailing out */
+		nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+		return ret;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Per-namespace teardown: unregister both event notifiers in the reverse
+ * order of ctnetlink_net_init(). Does nothing without event support.
+ */
+static void ctnetlink_net_exit(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
+ nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+#endif
+}
+
+/* Batched exit hook: run ctnetlink_net_exit() for every namespace on the list. */
+static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ctnetlink_net_exit(net);
+}
+
+/* Per-namespace hooks, registered from ctnetlink_init(). */
+static struct pernet_operations ctnetlink_net_ops = {
+ .init = ctnetlink_net_init,
+ .exit_batch = ctnetlink_net_exit_batch,
+};
+
static int __init ctnetlink_init(void)
{
int ret;
- printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+ pr_info("ctnetlink v%s: registering with nfnetlink.\n", version);
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
- printk("ctnetlink_init: cannot register with nfnetlink.\n");
+ pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
goto err_out;
}
ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
if (ret < 0) {
- printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+ pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n");
goto err_unreg_subsys;
}
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- ret = nf_conntrack_register_notifier(&ctnl_notifier);
+ ret = register_pernet_subsys(&ctnetlink_net_ops);
if (ret < 0) {
- printk("ctnetlink_init: cannot register notifier.\n");
+ pr_err("ctnetlink_init: cannot register pernet operations\n");
goto err_unreg_exp_subsys;
}
-
- ret = nf_conntrack_expect_register_notifier(&ctnl_notifier_exp);
- if (ret < 0) {
- printk("ctnetlink_init: cannot expect register notifier.\n");
- goto err_unreg_notifier;
- }
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+ /* setup interaction between nf_queue and nf_conntrack_netlink. */
+ RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
#endif
-
return 0;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
- nf_conntrack_unregister_notifier(&ctnl_notifier);
err_unreg_exp_subsys:
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
-#endif
err_unreg_subsys:
nfnetlink_subsys_unregister(&ctnl_subsys);
err_out:
@@ -1638,16 +3236,14 @@ err_out:
static void __exit ctnetlink_exit(void)
{
- printk("ctnetlink: unregistering from nfnetlink.\n");
-
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- nf_conntrack_unregister_notifier(&ctnl_notifier_exp);
- nf_conntrack_unregister_notifier(&ctnl_notifier);
-#endif
+ pr_info("ctnetlink: unregistering from nfnetlink.\n");
+ unregister_pernet_subsys(&ctnetlink_net_ops);
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
nfnetlink_subsys_unregister(&ctnl_subsys);
- return;
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+ RCU_INIT_POINTER(nfq_ct_hook, NULL);
+#endif
}
module_init(ctnetlink_init);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
new file mode 100644
index 00000000000..825c3e3f830
--- /dev/null
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -0,0 +1,619 @@
+/*
+ * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * Limitations:
+ * - We blindly assume that control connections are always
+ * established in PNS->PAC direction. This is a violation
+ * of RFC 2637
+ * - We can only support one single call within each session
+ * TODO:
+ * - testing of incoming PPTP calls
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_CT_PPTP_VERSION "3.1"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
+MODULE_ALIAS("ip_conntrack_pptp");
+MODULE_ALIAS_NFCT_HELPER("pptp");
+
+static DEFINE_SPINLOCK(nf_pptp_lock);
+
+int
+(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff, struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
+
+int
+(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff, struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
+
+void
+(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
+ struct nf_conntrack_expect *expect_reply)
+ __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
+
+void
+(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+
+#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+/* PptpControlMessageType names */
+const char *const pptp_msg_name[] = {
+ "UNKNOWN_MESSAGE",
+ "START_SESSION_REQUEST",
+ "START_SESSION_REPLY",
+ "STOP_SESSION_REQUEST",
+ "STOP_SESSION_REPLY",
+ "ECHO_REQUEST",
+ "ECHO_REPLY",
+ "OUT_CALL_REQUEST",
+ "OUT_CALL_REPLY",
+ "IN_CALL_REQUEST",
+ "IN_CALL_REPLY",
+ "IN_CALL_CONNECT",
+ "CALL_CLEAR_REQUEST",
+ "CALL_DISCONNECT_NOTIFY",
+ "WAN_ERROR_NOTIFY",
+ "SET_LINK_INFO"
+};
+EXPORT_SYMBOL(pptp_msg_name);
+#endif
+
+#define SECS *HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+
+#define PPTP_GRE_TIMEOUT (10 MINS)
+#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS)
+
+/*
+ * Called when an expected GRE data connection shows up.
+ *
+ * Raises the GRE timeouts of the new conntrack, then either hands over to
+ * the NAT hook (when one is registered and the master connection is
+ * NATed) or removes the expectation for the opposite direction.
+ */
+static void pptp_expectfn(struct nf_conn *ct,
+			  struct nf_conntrack_expect *exp)
+{
+	struct net *net = nf_ct_net(ct);
+	typeof(nf_nat_pptp_hook_expectfn) nat_expectfn;
+	struct nf_conntrack_expect *other;
+	struct nf_conntrack_tuple reversed;
+
+	pr_debug("increasing timeouts\n");
+
+	/* the GRE data channel gets longer timeouts than the defaults */
+	ct->proto.gre.timeout = PPTP_GRE_TIMEOUT;
+	ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
+
+	rcu_read_lock();
+	nat_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
+	if (nat_expectfn && ct->master->status & IPS_NAT_MASK) {
+		nat_expectfn(ct, exp);
+		goto unlock;
+	}
+
+	/* plain tuple inversion is only valid while no NAT is involved */
+	nf_ct_invert_tuplepr(&reversed, &exp->tuple);
+	pr_debug("trying to unexpect other dir: ");
+	nf_ct_dump_tuple(&reversed);
+
+	other = nf_ct_expect_find_get(net, nf_ct_zone(ct), &reversed);
+	if (!other) {
+		pr_debug("not found\n");
+		goto unlock;
+	}
+
+	/* the opposite-direction expectation is no longer needed */
+	pr_debug("found\n");
+	nf_ct_unexpect_related(other);
+	nf_ct_expect_put(other);
+
+unlock:
+	rcu_read_unlock();
+}
+
+/*
+ * Tear down whatever is tracked for tuple @t in the zone of @ct: an
+ * established conntrack is expired immediately, otherwise a pending
+ * expectation is removed.
+ *
+ * Returns 1 if a conntrack or expectation was found, 0 otherwise.
+ */
+static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
+				  const struct nf_conntrack_tuple *t)
+{
+	u16 zone = nf_ct_zone(ct);
+	const struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_expect *pending;
+	struct nf_conn *sibling;
+
+	pr_debug("trying to timeout ct or exp for tuple ");
+	nf_ct_dump_tuple(t);
+
+	hash = nf_conntrack_find_get(net, zone, t);
+	if (hash) {
+		sibling = nf_ct_tuplehash_to_ctrack(hash);
+		pr_debug("setting timeout of conntrack %p to 0\n", sibling);
+		sibling->proto.gre.timeout = 0;
+		sibling->proto.gre.stream_timeout = 0;
+		/* if the timer was still pending, fire its handler now */
+		if (del_timer(&sibling->timeout))
+			sibling->timeout.function((unsigned long)sibling);
+		nf_ct_put(sibling);
+		return 1;
+	}
+
+	pending = nf_ct_expect_find_get(net, zone, t);
+	if (!pending)
+		return 0;
+
+	pr_debug("unexpect_related of expect %p\n", pending);
+	nf_ct_unexpect_related(pending);
+	nf_ct_expect_put(pending);
+	return 1;
+}
+
+/*
+ * Time out the GRE data connections (or pending expectations) belonging
+ * to control connection @ct, in both directions, and drop its GRE keymap.
+ */
+static void pptp_destroy_siblings(struct nf_conn *ct)
+{
+	const struct nf_ct_pptp_master *master = nfct_help_data(ct);
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_tuple gre;
+
+	nf_ct_gre_keymap_destroy(ct);
+
+	/* original direction (pns->pac): control tuple with GRE keys */
+	memcpy(&gre, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(gre));
+	gre.dst.protonum = IPPROTO_GRE;
+	gre.src.u.gre.key = master->pns_call_id;
+	gre.dst.u.gre.key = master->pac_call_id;
+	if (destroy_sibling_or_exp(net, ct, &gre) == 0)
+		pr_debug("failed to timeout original pns->pac ct/exp\n");
+
+	/* reply direction (pac->pns): same again with the call ids swapped */
+	memcpy(&gre, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(gre));
+	gre.dst.protonum = IPPROTO_GRE;
+	gre.src.u.gre.key = master->pac_call_id;
+	gre.dst.u.gre.key = master->pns_call_id;
+	if (destroy_sibling_or_exp(net, ct, &gre) == 0)
+		pr_debug("failed to timeout reply pac->pns ct/exp\n");
+}
+
+/*
+ * Expect GRE connections (PNS->PAC and PAC->PNS direction).
+ *
+ * Allocates one expectation per direction of @ct, registers both and adds
+ * the matching GRE keymap entries.
+ *
+ * Returns 0 on success and 1 on any failure. The local references to the
+ * expectations that were allocated are always dropped before returning.
+ */
+static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
+{
+ struct nf_conntrack_expect *exp_orig, *exp_reply;
+ enum ip_conntrack_dir dir;
+ int ret = 1;
+ typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
+
+ exp_orig = nf_ct_expect_alloc(ct);
+ if (exp_orig == NULL)
+ goto out;
+
+ exp_reply = nf_ct_expect_alloc(ct);
+ if (exp_reply == NULL)
+ goto out_put_orig;
+
+ /* original direction, PNS->PAC: source key peer_callid, destination key callid */
+ dir = IP_CT_DIR_ORIGINAL;
+ nf_ct_expect_init(exp_orig, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[dir].tuple.dst.u3,
+ IPPROTO_GRE, &peer_callid, &callid);
+ exp_orig->expectfn = pptp_expectfn;
+
+ /* reply direction, PAC->PNS: same keys, swapped */
+ dir = IP_CT_DIR_REPLY;
+ nf_ct_expect_init(exp_reply, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[dir].tuple.dst.u3,
+ IPPROTO_GRE, &callid, &peer_callid);
+ exp_reply->expectfn = pptp_expectfn;
+
+ /* let the NAT hook adjust both expectations before they are registered */
+ nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
+ if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
+ nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+ if (nf_ct_expect_related(exp_orig) != 0)
+ goto out_put_both;
+ if (nf_ct_expect_related(exp_reply) != 0)
+ goto out_unexpect_orig;
+
+ /* Add GRE keymap entries */
+ if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0)
+ goto out_unexpect_both;
+ if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) {
+ nf_ct_gre_keymap_destroy(ct);
+ goto out_unexpect_both;
+ }
+ ret = 0;
+
+ /* success falls through: the local references are dropped here as well */
+out_put_both:
+ nf_ct_expect_put(exp_reply);
+out_put_orig:
+ nf_ct_expect_put(exp_orig);
+out:
+ return ret;
+
+ /* failure after registration: unregister, then drop references via out_put_both */
+out_unexpect_both:
+ nf_ct_unexpect_related(exp_reply);
+out_unexpect_orig:
+ nf_ct_unexpect_related(exp_orig);
+ goto out_put_both;
+}
+
+/*
+ * Process a PPTP control message seen in the reply direction of the
+ * control connection (treated as PAC -> PNS by the caller).
+ *
+ * Updates the session/call state kept in the helper data of @ct, sets up
+ * or tears down the GRE expectations as the call progresses, and finally
+ * hands the message to the NAT hook when one is registered and @ct is
+ * NATed.
+ *
+ * Returns the verdict of the NAT hook if it ran, NF_ACCEPT otherwise
+ * (including for messages rejected as invalid).
+ */
+static inline int
+pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
+		 struct PptpControlHeader *ctlh,
+		 union pptp_ctrl_union *pptpReq,
+		 unsigned int reqlen,
+		 struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo)
+{
+	struct nf_ct_pptp_master *info = nfct_help_data(ct);
+	u_int16_t msg;
+	__be16 cid = 0, pcid = 0;
+	typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
+
+	msg = ntohs(ctlh->messageType);
+	/*
+	 * msg comes straight from the packet and has not been range checked
+	 * yet, so bound the name lookup exactly like the invalid path below
+	 * instead of indexing pptp_msg_name[] out of bounds.
+	 */
+	pr_debug("inbound control message %s\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0]);
+
+	switch (msg) {
+	case PPTP_START_SESSION_REPLY:
+		/* server confirms new control session */
+		if (info->sstate < PPTP_SESSION_REQUESTED)
+			goto invalid;
+		if (pptpReq->srep.resultCode == PPTP_START_OK)
+			info->sstate = PPTP_SESSION_CONFIRMED;
+		else
+			info->sstate = PPTP_SESSION_ERROR;
+		break;
+
+	case PPTP_STOP_SESSION_REPLY:
+		/* server confirms end of control session */
+		if (info->sstate > PPTP_SESSION_STOPREQ)
+			goto invalid;
+		if (pptpReq->strep.resultCode == PPTP_STOP_OK)
+			info->sstate = PPTP_SESSION_NONE;
+		else
+			info->sstate = PPTP_SESSION_ERROR;
+		break;
+
+	case PPTP_OUT_CALL_REPLY:
+		/* server accepted call, we now expect GRE frames */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		if (info->cstate != PPTP_CALL_OUT_REQ &&
+		    info->cstate != PPTP_CALL_OUT_CONF)
+			goto invalid;
+
+		cid = pptpReq->ocack.callID;
+		pcid = pptpReq->ocack.peersCallID;
+		if (info->pns_call_id != pcid)
+			goto invalid;
+		pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+			 ntohs(cid), ntohs(pcid));
+
+		if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
+			info->cstate = PPTP_CALL_OUT_CONF;
+			info->pac_call_id = cid;
+			exp_gre(ct, cid, pcid);
+		} else
+			info->cstate = PPTP_CALL_NONE;
+		break;
+
+	case PPTP_IN_CALL_REQUEST:
+		/* server tells us about incoming call request */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+
+		cid = pptpReq->icreq.callID;
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		info->cstate = PPTP_CALL_IN_REQ;
+		info->pac_call_id = cid;
+		break;
+
+	case PPTP_IN_CALL_CONNECT:
+		/* server tells us about incoming call established */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		if (info->cstate != PPTP_CALL_IN_REP &&
+		    info->cstate != PPTP_CALL_IN_CONF)
+			goto invalid;
+
+		pcid = pptpReq->iccon.peersCallID;
+		cid = info->pac_call_id;
+
+		if (info->pns_call_id != pcid)
+			goto invalid;
+
+		pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
+		info->cstate = PPTP_CALL_IN_CONF;
+
+		/* we expect a GRE connection from PAC to PNS */
+		exp_gre(ct, cid, pcid);
+		break;
+
+	case PPTP_CALL_DISCONNECT_NOTIFY:
+		/* server confirms disconnect */
+		cid = pptpReq->disc.callID;
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		info->cstate = PPTP_CALL_NONE;
+
+		/* untrack this call id, unexpect GRE packets */
+		pptp_destroy_siblings(ct);
+		break;
+
+	case PPTP_WAN_ERROR_NOTIFY:
+	case PPTP_SET_LINK_INFO:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* nothing to track for these messages */
+		break;
+
+	default:
+		goto invalid;
+	}
+
+	nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
+	if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
+		return nf_nat_pptp_inbound(skb, ct, ctinfo,
+					   protoff, ctlh, pptpReq);
+	return NF_ACCEPT;
+
+invalid:
+	pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+		 msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
+		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+	return NF_ACCEPT;
+}
+
+/*
+ * Process a PPTP control message seen in the original direction of the
+ * control connection (treated as PNS -> PAC by the caller).
+ *
+ * Updates the session/call state and the tracked call ids kept in the
+ * helper data of @ct, then hands the message to the NAT hook when one is
+ * registered and @ct is NATed.
+ *
+ * Returns the verdict of the NAT hook if it ran, NF_ACCEPT otherwise
+ * (including for messages rejected as invalid).
+ */
+static inline int
+pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
+		  struct PptpControlHeader *ctlh,
+		  union pptp_ctrl_union *pptpReq,
+		  unsigned int reqlen,
+		  struct nf_conn *ct,
+		  enum ip_conntrack_info ctinfo)
+{
+	struct nf_ct_pptp_master *info = nfct_help_data(ct);
+	u_int16_t msg;
+	__be16 cid = 0, pcid = 0;
+	typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
+
+	msg = ntohs(ctlh->messageType);
+	/*
+	 * msg comes straight from the packet and has not been range checked
+	 * yet, so bound the name lookup exactly like the invalid path below
+	 * instead of indexing pptp_msg_name[] out of bounds.
+	 */
+	pr_debug("outbound control message %s\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0]);
+
+	switch (msg) {
+	case PPTP_START_SESSION_REQUEST:
+		/* client requests for new control session */
+		if (info->sstate != PPTP_SESSION_NONE)
+			goto invalid;
+		info->sstate = PPTP_SESSION_REQUESTED;
+		break;
+
+	case PPTP_STOP_SESSION_REQUEST:
+		/* client requests end of control session */
+		info->sstate = PPTP_SESSION_STOPREQ;
+		break;
+
+	case PPTP_OUT_CALL_REQUEST:
+		/* client initiating connection to server */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		info->cstate = PPTP_CALL_OUT_REQ;
+		/* track PNS call id */
+		cid = pptpReq->ocreq.callID;
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		info->pns_call_id = cid;
+		break;
+
+	case PPTP_IN_CALL_REPLY:
+		/* client answers incoming call */
+		if (info->cstate != PPTP_CALL_IN_REQ &&
+		    info->cstate != PPTP_CALL_IN_REP)
+			goto invalid;
+
+		cid = pptpReq->icack.callID;
+		pcid = pptpReq->icack.peersCallID;
+		if (info->pac_call_id != pcid)
+			goto invalid;
+		pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
+			 ntohs(cid), ntohs(pcid));
+
+		if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
+			/* part two of the three-way handshake */
+			info->cstate = PPTP_CALL_IN_REP;
+			info->pns_call_id = cid;
+		} else
+			info->cstate = PPTP_CALL_NONE;
+		break;
+
+	case PPTP_CALL_CLEAR_REQUEST:
+		/* client requests hangup of call */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		/* FUTURE: iterate over all calls and check if
+		 * call ID is valid. We don't do this without newnat,
+		 * because we only know about last call */
+		info->cstate = PPTP_CALL_CLEAR_REQ;
+		break;
+
+	case PPTP_SET_LINK_INFO:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* nothing to track for these messages */
+		break;
+
+	default:
+		goto invalid;
+	}
+
+	nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
+	if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
+		return nf_nat_pptp_outbound(skb, ct, ctinfo,
+					    protoff, ctlh, pptpReq);
+	return NF_ACCEPT;
+
+invalid:
+	pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+		 msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
+		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+	return NF_ACCEPT;
+}
+
+/*
+ * Minimum payload size per control message type, indexed by message type.
+ * conntrack_pptp_help() ignores messages shorter than their entry; types
+ * without an initializer have size 0.
+ */
+static const unsigned int pptp_msg_size[] = {
+ [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest),
+ [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply),
+ [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest),
+ [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply),
+ [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest),
+ [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply),
+ [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest),
+ [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply),
+ [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected),
+ [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest),
+ [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify),
+ [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify),
+ [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo),
+};
+
+/*
+ * Helper entry point: track caller id inside control connection, call
+ * expect_related.
+ *
+ * Walks the TCP header, the PPTP packet header and the control header of
+ * @skb, then dispatches the control message to pptp_outbound_pkt() or
+ * pptp_inbound_pkt() depending on the packet direction, under
+ * nf_pptp_lock.
+ *
+ * Returns NF_ACCEPT for anything that is not a complete PPTP control
+ * message on an established connection, otherwise the verdict of the
+ * direction-specific handler.
+ */
+static int
+conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+
+{
+ int dir = CTINFO2DIR(ctinfo);
+ const struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct tcphdr *tcph;
+ struct tcphdr _tcph;
+ const struct pptp_pkt_hdr *pptph;
+ struct pptp_pkt_hdr _pptph;
+ struct PptpControlHeader _ctlh, *ctlh;
+ union pptp_ctrl_union _pptpReq, *pptpReq;
+ unsigned int tcplen = skb->len - protoff;
+ unsigned int datalen, reqlen, nexthdr_off;
+ int oldsstate, oldcstate;
+ int ret;
+ u_int16_t msg;
+
+ /* don't do any tracking before tcp handshake complete */
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* skip the TCP header; datalen tracks the payload bytes left after each header */
+ nexthdr_off = protoff;
+ tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
+ BUG_ON(!tcph);
+ nexthdr_off += tcph->doff * 4;
+ datalen = tcplen - tcph->doff * 4;
+
+ pptph = skb_header_pointer(skb, nexthdr_off, sizeof(_pptph), &_pptph);
+ if (!pptph) {
+ pr_debug("no full PPTP header, can't track\n");
+ return NF_ACCEPT;
+ }
+ nexthdr_off += sizeof(_pptph);
+ datalen -= sizeof(_pptph);
+
+ /* if it's not a control message we can't do anything with it */
+ if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
+ ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
+ pr_debug("not a control packet\n");
+ return NF_ACCEPT;
+ }
+
+ ctlh = skb_header_pointer(skb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+ if (!ctlh)
+ return NF_ACCEPT;
+ nexthdr_off += sizeof(_ctlh);
+ datalen -= sizeof(_ctlh);
+
+ /* ignore known message types that are too short; clamp the copy to the union size */
+ reqlen = datalen;
+ msg = ntohs(ctlh->messageType);
+ if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg])
+ return NF_ACCEPT;
+ if (reqlen > sizeof(*pptpReq))
+ reqlen = sizeof(*pptpReq);
+
+ pptpReq = skb_header_pointer(skb, nexthdr_off, reqlen, &_pptpReq);
+ if (!pptpReq)
+ return NF_ACCEPT;
+
+ /* NOTE(review): sampled before nf_pptp_lock is taken; only used by the debug message below */
+ oldsstate = info->sstate;
+ oldcstate = info->cstate;
+
+ spin_lock_bh(&nf_pptp_lock);
+
+ /* FIXME: We just blindly assume that the control connection is always
+ * established from PNS->PAC. However, RFC makes no guarantee */
+ if (dir == IP_CT_DIR_ORIGINAL)
+ /* client -> server (PNS -> PAC) */
+ ret = pptp_outbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,
+ ctinfo);
+ else
+ /* server -> client (PAC -> PNS) */
+ ret = pptp_inbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,
+ ctinfo);
+ pr_debug("sstate: %d->%d, cstate: %d->%d\n",
+ oldsstate, info->sstate, oldcstate, info->cstate);
+ spin_unlock_bh(&nf_pptp_lock);
+
+ return ret;
+}
+
+/* Expectation policy referenced by the pptp helper; presumably .timeout is in seconds (5 * 60) - confirm. */
+static const struct nf_conntrack_expect_policy pptp_exp_policy = {
+ .max_expected = 2,
+ .timeout = 5 * 60,
+};
+
+/*
+ * Control protocol helper: matches IPv4 TCP connections on
+ * PPTP_CONTROL_PORT, keeps a struct nf_ct_pptp_master per connection and
+ * tears down the GRE siblings through the destroy callback.
+ */
+static struct nf_conntrack_helper pptp __read_mostly = {
+ .name = "pptp",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_pptp_master),
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT),
+ .tuple.dst.protonum = IPPROTO_TCP,
+ .help = conntrack_pptp_help,
+ .destroy = pptp_destroy_siblings,
+ .expect_policy = &pptp_exp_policy,
+};
+
+/* Module init: register the pptp helper and return the registration result. */
+static int __init nf_conntrack_pptp_init(void)
+{
+ return nf_conntrack_helper_register(&pptp);
+}
+
+/* Module exit: unregister the pptp helper. */
+static void __exit nf_conntrack_pptp_fini(void)
+{
+ nf_conntrack_helper_unregister(&pptp);
+}
+
+module_init(nf_conntrack_pptp_init);
+module_exit(nf_conntrack_pptp_fini);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
new file mode 100644
index 00000000000..b65d5864b6d
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -0,0 +1,523 @@
+/* L3/L4 protocol support for nf_conntrack. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/*
+ * Per-l3proto tables of l4proto trackers, accessed as
+ * nf_ct_protos[l3proto][l4proto].  A table is allocated by
+ * nf_ct_l4proto_register() the first time an l4proto is registered
+ * for that l3proto.
+ */
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+/* Registered l3proto trackers, indexed by l3proto number. */
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_l3protos);
+
+/* Held by the register/unregister paths while they update the tables. */
+static DEFINE_MUTEX(nf_ct_proto_mutex);
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register @table under @path in @net and store the handle in *@header.
+ * Does nothing if *@header is already set.
+ *
+ * Returns 0 on success (or when already registered), -ENOMEM if the
+ * registration fails.
+ */
+static int
+nf_ct_register_sysctl(struct net *net,
+		      struct ctl_table_header **header,
+		      const char *path,
+		      struct ctl_table *table)
+{
+	struct ctl_table_header *hdr;
+
+	if (*header != NULL)
+		return 0;
+
+	hdr = register_net_sysctl(net, path, table);
+	*header = hdr;
+
+	return hdr != NULL ? 0 : -ENOMEM;
+}
+
+/*
+ * Tear down a sysctl registration once nobody uses it any more: when
+ * @users is zero, unregister *@header, free *@table and clear both
+ * pointers.  With a non-zero @users this is a no-op.
+ */
+static void
+nf_ct_unregister_sysctl(struct ctl_table_header **header,
+			struct ctl_table **table,
+			unsigned int users)
+{
+	if (users == 0) {
+		unregister_net_sysctl_table(*header);
+		kfree(*table);
+		*table = NULL;
+		*header = NULL;
+	}
+}
+#endif
+
+/*
+ * Look up the l4proto tracker for (@l3proto, @l4proto).  Falls back to the
+ * generic l4proto when @l3proto is out of range or has no table yet.
+ * The table slot is read with rcu_dereference().
+ */
+struct nf_conntrack_l4proto *
+__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
+{
+	if (likely(l3proto < AF_MAX && nf_ct_protos[l3proto] != NULL))
+		return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
+
+	return &nf_conntrack_l4proto_generic;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
+
+/*
+ * Find the l3proto tracker for @l3proto and take a reference on its module.
+ * Always returns a usable tracker: if the module reference cannot be taken,
+ * the generic l3proto is returned instead.
+ */
+struct nf_conntrack_l3proto *
+nf_ct_l3proto_find_get(u_int16_t l3proto)
+{
+	struct nf_conntrack_l3proto *found;
+
+	rcu_read_lock();
+	found = __nf_ct_l3proto_find(l3proto);
+	found = try_module_get(found->me) ? found
+					  : &nf_conntrack_l3proto_generic;
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
+
+/*
+ * Take a reference on the tracker module for @l3proto, asking for
+ * "nf_conntrack-<l3proto>" to be loaded when only the generic tracker is
+ * found.  The lookup is repeated after every successful request_module().
+ *
+ * Returns 0 once a non-generic tracker was found, -EPROTOTYPE if the
+ * module request fails.
+ */
+int
+nf_ct_l3proto_try_module_get(unsigned short l3proto)
+{
+	struct nf_conntrack_l3proto *p;
+
+	for (;;) {
+		p = nf_ct_l3proto_find_get(l3proto);
+		if (p != &nf_conntrack_l3proto_generic)
+			return 0;
+
+		if (request_module("nf_conntrack-%d", l3proto))
+			return -EPROTOTYPE;
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
+
+/* Drop the module reference of the tracker currently found for @l3proto. */
+void nf_ct_l3proto_module_put(unsigned short l3proto)
+{
+	/*
+	 * The caller already holds a reference, so the RCU read side is not
+	 * strictly needed; it is taken anyway to keep lockdep quiet in
+	 * __nf_ct_l3proto_find().
+	 */
+	rcu_read_lock();
+	module_put(__nf_ct_l3proto_find(l3proto)->me);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+
+/*
+ * Find the l4proto tracker for (@l3num, @l4num) and take a reference on its
+ * module.  If the reference cannot be taken, the generic l4proto is
+ * returned instead.
+ */
+struct nf_conntrack_l4proto *
+nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
+{
+	struct nf_conntrack_l4proto *found;
+
+	rcu_read_lock();
+	found = __nf_ct_l4proto_find(l3num, l4num);
+	found = try_module_get(found->me) ? found
+					  : &nf_conntrack_l4proto_generic;
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get);
+
+/* Release the module reference held on @p. */
+void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p)
+{
+	struct module *owner = p->me;
+
+	module_put(owner);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
+
+/*
+ * nf_ct_iterate_cleanup() callback: non-zero when conntrack @i uses the
+ * l3proto passed in @data.
+ */
+static int kill_l3proto(struct nf_conn *i, void *data)
+{
+	const struct nf_conntrack_l3proto *l3proto = data;
+
+	return nf_ct_l3num(i) == l3proto->l3proto;
+}
+
+/*
+ * nf_ct_iterate_cleanup() callback: non-zero when conntrack @i matches both
+ * the l4 protocol number and the l3 protocol of the l4proto in @data.
+ */
+static int kill_l4proto(struct nf_conn *i, void *data)
+{
+	const struct nf_conntrack_l4proto *l4proto = data;
+
+	if (nf_ct_protonum(i) != l4proto->l4proto)
+		return 0;
+
+	return nf_ct_l3num(i) == l4proto->l3proto;
+}
+
+/*
+ * Return the per-net nf_ip_net for @l3proto; only PF_INET has one, every
+ * other family yields NULL.
+ */
+static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
+					   struct nf_conntrack_l3proto *l3proto)
+{
+	return l3proto->l3proto == PF_INET ? &net->ct.nf_ct_proto : NULL;
+}
+
+/*
+ * Register the compat sysctl table of @l3proto in @net, if it has one.
+ * On registration failure the table is freed and the error returned.
+ * Returns 0 when there is nothing to register.
+ */
+static int nf_ct_l3proto_register_sysctl(struct net *net,
+					 struct nf_conntrack_l3proto *l3proto)
+{
+	struct nf_ip_net *ipnet = nf_ct_l3proto_net(net, l3proto);
+	int err = 0;
+
+	/* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
+	if (ipnet == NULL)
+		return 0;
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	if (ipnet->ctl_table == NULL)
+		return 0;
+
+	err = nf_ct_register_sysctl(net,
+				    &ipnet->ctl_table_header,
+				    l3proto->ctl_table_path,
+				    ipnet->ctl_table);
+	if (err < 0) {
+		kfree(ipnet->ctl_table);
+		ipnet->ctl_table = NULL;
+	}
+#endif
+	return err;
+}
+
+/*
+ * Undo nf_ct_l3proto_register_sysctl(): unregister and free the compat
+ * sysctl table of @l3proto in @net if one is registered.
+ */
+static void nf_ct_l3proto_unregister_sysctl(struct net *net,
+					    struct nf_conntrack_l3proto *l3proto)
+{
+	struct nf_ip_net *ipnet = nf_ct_l3proto_net(net, l3proto);
+
+	if (!ipnet)
+		return;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	if (ipnet->ctl_table_header) {
+		/* users == 0: always unregister and free */
+		nf_ct_unregister_sysctl(&ipnet->ctl_table_header,
+					&ipnet->ctl_table,
+					0);
+	}
+#endif
+}
+
+/*
+ * Install @proto as the tracker for its l3proto number.
+ *
+ * Returns 0 on success, -EBUSY if the number is out of range or the slot
+ * is already taken by a non-generic tracker, -EINVAL if tuple_to_nlattr is
+ * set without nlattr_tuple_size.
+ */
+int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
+{
+	struct nf_conntrack_l3proto *cur;
+	int ret = 0;
+
+	if (proto->l3proto >= AF_MAX)
+		return -EBUSY;
+
+	if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size)
+		return -EINVAL;
+
+	mutex_lock(&nf_ct_proto_mutex);
+	cur = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+					lockdep_is_held(&nf_ct_proto_mutex));
+	if (cur == &nf_conntrack_l3proto_generic) {
+		if (proto->nlattr_tuple_size)
+			proto->nla_size = 3 * proto->nlattr_tuple_size();
+
+		rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
+	} else {
+		/* slot already owned by another tracker */
+		ret = -EBUSY;
+	}
+	mutex_unlock(&nf_ct_proto_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
+
+/*
+ * Per-netns part of l3proto registration: run the optional init_net hook,
+ * then register the sysctl table.  Returns the first negative error, or the
+ * result of the sysctl registration.
+ */
+int nf_ct_l3proto_pernet_register(struct net *net,
+				  struct nf_conntrack_l3proto *proto)
+{
+	if (proto->init_net) {
+		int ret = proto->init_net(net);
+
+		if (ret < 0)
+			return ret;
+	}
+
+	return nf_ct_l3proto_register_sysctl(net, proto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
+
+/*
+ * Remove @proto from the l3proto table, putting the generic tracker back in
+ * its slot, then wait for an RCU grace period.  It is a BUG if @proto is
+ * not the tracker currently installed for its number.
+ */
+void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+{
+	struct nf_conntrack_l3proto *cur;
+
+	BUG_ON(proto->l3proto >= AF_MAX);
+
+	mutex_lock(&nf_ct_proto_mutex);
+	cur = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+					lockdep_is_held(&nf_ct_proto_mutex));
+	BUG_ON(cur != proto);
+	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
+			   &nf_conntrack_l3proto_generic);
+	mutex_unlock(&nf_ct_proto_mutex);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
+
+/*
+ * Per-netns part of l3proto removal: drop the sysctl registration, then
+ * walk the conntrack entries of @net with kill_l3proto as the selector.
+ */
+void nf_ct_l3proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l3proto *proto)
+{
+ nf_ct_l3proto_unregister_sysctl(net, proto);
+
+ /* Remove all conntrack entries for this protocol */
+ nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister);
+
+/*
+ * Return the per-net state of @l4proto in @net, or NULL when the protocol
+ * provides neither a get_net_proto hook nor a net_id.
+ */
+static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
+					      struct nf_conntrack_l4proto *l4proto)
+{
+	/* statically built-in protocols use static per-net */
+	if (l4proto->get_net_proto)
+		return l4proto->get_net_proto(net);
+
+	/* ... and loadable protocols use dynamic per-net */
+	if (l4proto->net_id)
+		return net_generic(net, *l4proto->net_id);
+
+	return NULL;
+}
+
+/*
+ * Register the sysctl tables of @l4proto for @net: the main table under
+ * "net/netfilter" and, with CONFIG_NF_CONNTRACK_PROC_COMPAT and for
+ * anything but AF_INET6, the compat table under "net/ipv4/netfilter".
+ *
+ * Returns 0 on success or the error from nf_ct_register_sysctl().
+ * Without CONFIG_SYSCTL this is a no-op returning 0.
+ */
+static
+int nf_ct_l4proto_register_sysctl(struct net *net,
+ struct nf_proto_net *pn,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ int err = 0;
+
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table != NULL) {
+ err = nf_ct_register_sysctl(net,
+ &pn->ctl_table_header,
+ "net/netfilter",
+ pn->ctl_table);
+ if (err < 0) {
+ /* only free the table when no other user still relies on it */
+ if (!pn->users) {
+ kfree(pn->ctl_table);
+ pn->ctl_table = NULL;
+ }
+ }
+ }
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
+ /* main table failed above: drop the compat table and report that error */
+ if (err < 0) {
+ nf_ct_kfree_compat_sysctl_table(pn);
+ goto out;
+ }
+ err = nf_ct_register_sysctl(net,
+ &pn->ctl_compat_header,
+ "net/ipv4/netfilter",
+ pn->ctl_compat_table);
+ if (err == 0)
+ goto out;
+
+ /* compat registration failed: free it and roll back the main table */
+ nf_ct_kfree_compat_sysctl_table(pn);
+ nf_ct_unregister_sysctl(&pn->ctl_table_header,
+ &pn->ctl_table,
+ pn->users);
+ }
+out:
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+ return err;
+}
+
+/*
+ * Counterpart of nf_ct_l4proto_register_sysctl().  The main table is only
+ * torn down once pn->users has dropped to zero; the compat table (non-IPv6
+ * only) is always torn down when registered.
+ */
+static
+void nf_ct_l4proto_unregister_sysctl(struct net *net,
+ struct nf_proto_net *pn,
+ struct nf_conntrack_l4proto *l4proto)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table_header != NULL)
+ nf_ct_unregister_sysctl(&pn->ctl_table_header,
+ &pn->ctl_table,
+ pn->users);
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
+ nf_ct_unregister_sysctl(&pn->ctl_compat_header,
+ &pn->ctl_compat_table,
+ 0);
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+ them. --RR */
+/*
+ * Install @l4proto in the table of its l3proto, allocating that table
+ * (filled with the generic tracker) on first use.
+ *
+ * Returns 0 on success, -EBUSY if l3proto is out of range or the slot is
+ * already taken by a non-generic tracker, -EINVAL if a *_to_nlattr hook is
+ * set without its size hook, -ENOMEM if the table cannot be allocated.
+ */
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+{
+ int ret = 0;
+
+ if (l4proto->l3proto >= PF_MAX)
+ return -EBUSY;
+
+ if ((l4proto->to_nlattr && !l4proto->nlattr_size)
+ || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
+ return -EINVAL;
+
+ mutex_lock(&nf_ct_proto_mutex);
+ if (!nf_ct_protos[l4proto->l3proto]) {
+ /* l3proto may be loaded later. */
+ struct nf_conntrack_l4proto __rcu **proto_array;
+ int i;
+
+ proto_array = kmalloc(MAX_NF_CT_PROTO *
+ sizeof(struct nf_conntrack_l4proto *),
+ GFP_KERNEL);
+ if (proto_array == NULL) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < MAX_NF_CT_PROTO; i++)
+ RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
+
+ /* Before making proto_array visible to lockless readers,
+ * we must make sure its content is committed to memory.
+ */
+ smp_wmb();
+
+ nf_ct_protos[l4proto->l3proto] = proto_array;
+ } else if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != &nf_conntrack_l4proto_generic) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /* nla_size = nlattr_size() + 3 * nlattr_tuple_size(), each term optional */
+ l4proto->nla_size = 0;
+ if (l4proto->nlattr_size)
+ l4proto->nla_size += l4proto->nlattr_size();
+ if (l4proto->nlattr_tuple_size)
+ l4proto->nla_size += 3 * l4proto->nlattr_tuple_size();
+
+ rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ l4proto);
+out_unlock:
+ mutex_unlock(&nf_ct_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
+
+/*
+ * Per-netns part of l4proto registration: run the optional init_net hook,
+ * register the sysctl tables and count one more user of the per-net state.
+ * Protocols without per-net state skip the sysctl and user accounting.
+ *
+ * Returns a negative error from init_net or the sysctl registration;
+ * otherwise the last value those calls returned.
+ */
+int nf_ct_l4proto_pernet_register(struct net *net,
+				  struct nf_conntrack_l4proto *l4proto)
+{
+	struct nf_proto_net *pn;
+	int ret = 0;
+
+	if (l4proto->init_net) {
+		ret = l4proto->init_net(net, l4proto->l3proto);
+		if (ret < 0)
+			return ret;
+	}
+
+	pn = nf_ct_l4proto_net(net, l4proto);
+	if (!pn)
+		return ret;
+
+	ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto);
+	if (ret < 0)
+		return ret;
+
+	pn->users++;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+
+/*
+ * Remove @l4proto from its table slot, putting the generic tracker back,
+ * then wait for an RCU grace period.  It is a BUG if @l4proto is not the
+ * tracker currently installed in that slot.
+ */
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+{
+	struct nf_conntrack_l4proto *cur;
+
+	BUG_ON(l4proto->l3proto >= PF_MAX);
+
+	mutex_lock(&nf_ct_proto_mutex);
+	cur = rcu_dereference_protected(
+			nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			lockdep_is_held(&nf_ct_proto_mutex));
+	BUG_ON(cur != l4proto);
+	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			   &nf_conntrack_l4proto_generic);
+	mutex_unlock(&nf_ct_proto_mutex);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+
+/*
+ * Per-netns part of l4proto removal: drop one user of the per-net state,
+ * unregister its sysctl tables and walk the conntrack entries of @net with
+ * kill_l4proto as the selector.  Does nothing for protocols without per-net
+ * state.
+ */
+void nf_ct_l4proto_pernet_unregister(struct net *net,
+				     struct nf_conntrack_l4proto *l4proto)
+{
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
+
+	if (!pn)
+		return;
+
+	pn->users--;
+	nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
+
+	/* Remove all conntrack entries for this protocol */
+	nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
+
+/*
+ * Set up the generic l4proto for @net: run its init_net hook, register its
+ * sysctl tables and count one user.  Returns 0 or the first negative error.
+ */
+int nf_conntrack_proto_pernet_init(struct net *net)
+{
+	struct nf_conntrack_l4proto *generic = &nf_conntrack_l4proto_generic;
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net, generic);
+	int err;
+
+	err = generic->init_net(net, generic->l3proto);
+	if (err < 0)
+		return err;
+
+	err = nf_ct_l4proto_register_sysctl(net, pn, generic);
+	if (err < 0)
+		return err;
+
+	pn->users++;
+	return 0;
+}
+
+/*
+ * Undo nf_conntrack_proto_pernet_init(): drop one user of the generic
+ * l4proto's per-net state and unregister its sysctl tables.
+ */
+void nf_conntrack_proto_pernet_fini(struct net *net)
+{
+	struct nf_conntrack_l4proto *generic = &nf_conntrack_l4proto_generic;
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net, generic);
+
+	pn->users--;
+	nf_ct_l4proto_unregister_sysctl(net, pn, generic);
+}
+
+/* Point every l3proto slot at the generic tracker.  Always returns 0. */
+int nf_conntrack_proto_init(void)
+{
+	unsigned int family;
+
+	for (family = 0; family < AF_MAX; family++) {
+		rcu_assign_pointer(nf_ct_l3protos[family],
+				   &nf_conntrack_l3proto_generic);
+	}
+
+	return 0;
+}
+
+/* Free every per-l3proto table of l4proto trackers held in nf_ct_protos. */
+void nf_conntrack_proto_fini(void)
+{
+	unsigned int family;
+
+	/* free l3proto protocol tables */
+	for (family = 0; family < PF_MAX; family++) {
+		kfree(nf_ct_protos[family]);
+	}
+}
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
new file mode 100644
index 00000000000..cb372f96f10
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -0,0 +1,1006 @@
+/*
+ * DCCP connection tracking protocol helper
+ *
+ * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/dccp.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+
+/* Timeouts are based on values from RFC4340:
+ *
+ * - REQUEST:
+ *
+ * 8.1.2. Client Request
+ *
+ * A client MAY give up on its DCCP-Requests after some time
+ * (3 minutes, for example).
+ *
+ * - RESPOND:
+ *
+ * 8.1.3. Server Response
+ *
+ * It MAY also leave the RESPOND state for CLOSED after a timeout of
+ * not less than 4MSL (8 minutes);
+ *
+ * - PARTOPEN:
+ *
+ * 8.1.5. Handshake Completion
+ *
+ * If the client remains in PARTOPEN for more than 4MSL (8 minutes),
+ * it SHOULD reset the connection with Reset Code 2, "Aborted".
+ *
+ * - OPEN:
+ *
+ * The DCCP timestamp overflows after 11.9 hours. If the connection
+ * stays idle this long the sequence number won't be recognized
+ * as valid anymore.
+ *
+ * - CLOSEREQ/CLOSING:
+ *
+ * 8.3. Termination
+ *
+ * The retransmission timer should initially be set to go off in two
+ * round-trip times and should back off to not less than once every
+ * 64 seconds ...
+ *
+ * - TIMEWAIT:
+ *
+ * 4.3. States
+ *
+ * A server or client socket remains in this state for 2MSL (4 minutes)
+ * after the connection has been torn down, ...
+ */
+
+/* One MSL expressed as two minutes' worth of HZ ticks. */
+#define DCCP_MSL (2 * 60 * HZ)
+
+/* Printable name of each CT_DCCP_* state, indexed by state value. */
+static const char * const dccp_state_names[] = {
+ [CT_DCCP_NONE] = "NONE",
+ [CT_DCCP_REQUEST] = "REQUEST",
+ [CT_DCCP_RESPOND] = "RESPOND",
+ [CT_DCCP_PARTOPEN] = "PARTOPEN",
+ [CT_DCCP_OPEN] = "OPEN",
+ [CT_DCCP_CLOSEREQ] = "CLOSEREQ",
+ [CT_DCCP_CLOSING] = "CLOSING",
+ [CT_DCCP_TIMEWAIT] = "TIMEWAIT",
+ [CT_DCCP_IGNORE] = "IGNORE",
+ [CT_DCCP_INVALID] = "INVALID",
+};
+
+/* Two-letter shorthands for the CT_DCCP_* states, used in the state table. */
+#define sNO CT_DCCP_NONE
+#define sRQ CT_DCCP_REQUEST
+#define sRS CT_DCCP_RESPOND
+#define sPO CT_DCCP_PARTOPEN
+#define sOP CT_DCCP_OPEN
+#define sCR CT_DCCP_CLOSEREQ
+#define sCG CT_DCCP_CLOSING
+#define sTW CT_DCCP_TIMEWAIT
+#define sIG CT_DCCP_IGNORE
+#define sIV CT_DCCP_INVALID
+
+/*
+ * DCCP state transition table
+ *
+ * The assumption is the same as for TCP tracking:
+ *
+ * We are the man in the middle. All the packets go through us but might
+ * get lost in transit to the destination. It is assumed that the destination
+ * can't receive segments we haven't seen.
+ *
+ * The following states exist:
+ *
+ * NONE: Initial state, expecting Request
+ * REQUEST: Request seen, waiting for Response from server
+ * RESPOND: Response from server seen, waiting for Ack from client
+ * PARTOPEN: Ack after Response seen, waiting for packet other than Response,
+ * Reset or Sync from server
+ * OPEN: Packet other than Response, Reset or Sync seen
+ * CLOSEREQ: CloseReq from server seen, expecting Close from client
+ * CLOSING: Close seen, expecting Reset
+ * TIMEWAIT: Reset seen
+ * IGNORE: Not determinable whether packet is valid
+ *
+ * Some states exist only on one side of the connection: REQUEST, RESPOND,
+ * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
+ * the one it was in before.
+ *
+ * Packets are marked as ignored (sIG) if we don't know if they're valid
+ * (for example a reincarnation of a connection we didn't notice is dead
+ * already) and the server may send back a connection closing Reset or a
+ * Response. They're also used for Sync/SyncAck packets, which we don't
+ * care about.
+ */
+static const u_int8_t
+dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
+ [CT_DCCP_ROLE_CLIENT] = {
+ [DCCP_PKT_REQUEST] = {
+ /*
+ * sNO -> sRQ Regular Request
+ * sRQ -> sRQ Retransmitted Request or reincarnation
+ * sRS -> sRS Retransmitted Request (apparently Response
+ * got lost after we saw it) or reincarnation
+ * sPO -> sIG Ignore, conntrack might be out of sync
+ * sOP -> sIG Ignore, conntrack might be out of sync
+ * sCR -> sIG Ignore, conntrack might be out of sync
+ * sCG -> sIG Ignore, conntrack might be out of sync
+ * sTW -> sRQ Reincarnation
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, */
+ sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
+ },
+ [DCCP_PKT_RESPONSE] = {
+ /*
+ * sNO -> sIV Invalid
+ * sRQ -> sIG Ignore, might be response to ignored Request
+ * sRS -> sIG Ignore, might be response to ignored Request
+ * sPO -> sIG Ignore, might be response to ignored Request
+ * sOP -> sIG Ignore, might be response to ignored Request
+ * sCR -> sIG Ignore, might be response to ignored Request
+ * sCG -> sIG Ignore, might be response to ignored Request
+ * sTW -> sIV Invalid, reincarnation in reverse direction
+ * goes through sRQ
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
+ },
+ [DCCP_PKT_ACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
+ * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN
+ * sOP -> sOP Regular ACK, remain in OPEN
+ * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.)
+ * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
+ },
+ [DCCP_PKT_DATA] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.)
+ * sOP -> sOP Regular Data packet
+ * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.)
+ * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
+ },
+ [DCCP_PKT_DATAACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
+ * sPO -> sPO Remain in PARTOPEN state
+ * sOP -> sOP Regular DataAck packet in OPEN state
+ * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.)
+ * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
+ },
+ [DCCP_PKT_CLOSEREQ] = {
+ /*
+ * CLOSEREQ may only be sent by the server.
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
+ },
+ [DCCP_PKT_CLOSE] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sCG Client-initiated close
+ * sOP -> sCG Client-initiated close
+ * sCR -> sCG Close in response to CloseReq (8.3.)
+ * sCG -> sCG Retransmit
+ * sTW -> sIV Late retransmit, already in TIME_WAIT
+ *
+ * The sCG column must be sCG, as documented above and as in the
+ * server row: a Close retransmitted while in CLOSING is not an
+ * invalid transition.
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sCG, sCG, sCG, sCG, sIV
+ },
+ [DCCP_PKT_RESET] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.)
+ * sRS -> sTW Response received without Request
+ * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.)
+ * sOP -> sTW Connection reset
+ * sCR -> sTW Connection reset
+ * sCG -> sTW Connection reset
+ * sTW -> sIG Ignore (don't refresh timer)
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
+ },
+ [DCCP_PKT_SYNC] = {
+ /*
+ * We currently ignore Sync packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ [DCCP_PKT_SYNCACK] = {
+ /*
+ * We currently ignore SyncAck packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ },
+ [CT_DCCP_ROLE_SERVER] = {
+ [DCCP_PKT_REQUEST] = {
+ /*
+ * sNO -> sIV Invalid
+ * sRQ -> sIG Ignore, conntrack might be out of sync
+ * sRS -> sIG Ignore, conntrack might be out of sync
+ * sPO -> sIG Ignore, conntrack might be out of sync
+ * sOP -> sIG Ignore, conntrack might be out of sync
+ * sCR -> sIG Ignore, conntrack might be out of sync
+ * sCG -> sIG Ignore, conntrack might be out of sync
+ * sTW -> sRQ Reincarnation, must reverse roles
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
+ },
+ [DCCP_PKT_RESPONSE] = {
+ /*
+ * sNO -> sIV Response without Request
+ * sRQ -> sRS Response to clients Request
+ * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT)
+ * sPO -> sIG Response to an ignored Request or late retransmit
+ * sOP -> sIG Ignore, might be response to ignored Request
+ * sCR -> sIG Ignore, might be response to ignored Request
+ * sCG -> sIG Ignore, might be response to ignored Request
+ * sTW -> sIV Invalid, Request from client in sTW moves to sRQ
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
+ },
+ [DCCP_PKT_ACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP Enter OPEN state (8.1.5.)
+ * sOP -> sOP Regular Ack in OPEN state
+ * sCR -> sIV Waiting for Close from client
+ * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+ },
+ [DCCP_PKT_DATA] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP Enter OPEN state (8.1.5.)
+ * sOP -> sOP Regular Data packet in OPEN state
+ * sCR -> sIV Waiting for Close from client
+ * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+ },
+ [DCCP_PKT_DATAACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP Enter OPEN state (8.1.5.)
+ * sOP -> sOP Regular DataAck in OPEN state
+ * sCR -> sIV Waiting for Close from client
+ * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+ },
+ [DCCP_PKT_CLOSEREQ] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.)
+ * sOP -> sCR CloseReq in OPEN state
+ * sCR -> sCR Retransmit
+ * sCG -> sCR Simultaneous close, client sends another Close
+ * sTW -> sIV Already closed
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
+ },
+ [DCCP_PKT_CLOSE] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP -> sCG Move directly to CLOSING
+ * sOP -> sCG Move to CLOSING
+ * sCR -> sIV Close after CloseReq is invalid
+ * sCG -> sCG Retransmit
+ * sTW -> sIV Already closed
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
+ },
+ [DCCP_PKT_RESET] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sTW Reset in response to Request
+ * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.)
+ * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.)
+ * sOP -> sTW
+ * sCR -> sTW
+ * sCG -> sTW
+ * sTW -> sIG Ignore (don't refresh timer)
+ *
+ * Exactly one entry per state: a stray extra sTW used to shift sIG
+ * out of the sTW column, so a Reset in TIMEWAIT refreshed the timer.
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
+ },
+ [DCCP_PKT_SYNC] = {
+ /*
+ * We currently ignore Sync packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ [DCCP_PKT_SYNCACK] = {
+ /*
+ * We currently ignore SyncAck packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ },
+};
+
+/* this module per-net specifics */
+/* Id passed to net_generic() by dccp_pernet() to fetch struct dccp_net. */
+static int dccp_net_id __read_mostly;
+struct dccp_net {
+ struct nf_proto_net pn;
+ /*
+ * When 0, dccp_new() rejects a first packet whose looked-up state is
+ * neither REQUEST nor INVALID ("not picking up existing connection").
+ */
+ int dccp_loose;
+ /* Timeout table returned by dccp_get_timeouts(). */
+ unsigned int dccp_timeout[CT_DCCP_MAX + 1];
+};
+
+/* Fetch this module's per-netns data for @net via net_generic(). */
+static inline struct dccp_net *dccp_pernet(struct net *net)
+{
+	struct dccp_net *dn = net_generic(net, dccp_net_id);
+
+	return dn;
+}
+
+/*
+ * Fill the DCCP source/destination ports of @tuple from the header found
+ * at @dataoff in @skb.  Returns false if the header cannot be read.
+ */
+static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			      struct nf_conntrack_tuple *tuple)
+{
+	struct dccp_hdr hdr_buf;
+	const struct dccp_hdr *dh;
+
+	dh = skb_header_pointer(skb, dataoff, sizeof(hdr_buf), &hdr_buf);
+	if (!dh)
+		return false;
+
+	tuple->src.u.dccp.port = dh->dccph_sport;
+	tuple->dst.u.dccp.port = dh->dccph_dport;
+
+	return true;
+}
+
+/* Build the reply-direction ports in @inv by swapping those of @tuple. */
+static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	inv->dst.u.dccp.port = tuple->src.u.dccp.port;
+	inv->src.u.dccp.port = tuple->dst.u.dccp.port;
+
+	return true;
+}
+
+/*
+ * Decide whether the first packet seen may create conntrack @ct and, if so,
+ * initialise its DCCP state.
+ *
+ * The packet type is looked up in the client row of the state table starting
+ * from NONE: INVALID is always rejected, REQUEST is always accepted, and any
+ * other result is accepted only when the per-net dccp_loose setting is
+ * non-zero.  Rejections are logged when LOG_INVALID() says so.
+ *
+ * Returns true if the conntrack was initialised, false otherwise.
+ */
+static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
+		     unsigned int dataoff, unsigned int *timeouts)
+{
+	struct net *net = nf_ct_net(ct);
+	struct dccp_hdr _dh, *dh;
+	const char *msg;
+	u_int8_t state;
+
+	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+	BUG_ON(dh == NULL);
+
+	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
+
+	if (state == CT_DCCP_INVALID) {
+		msg = "nf_ct_dccp: invalid state transition ";
+		goto out_invalid;
+	}
+
+	if (state != CT_DCCP_REQUEST && dccp_pernet(net)->dccp_loose == 0) {
+		msg = "nf_ct_dccp: not picking up existing connection ";
+		goto out_invalid;
+	}
+
+	ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
+	ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
+	ct->proto.dccp.state = CT_DCCP_NONE;
+	ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+	ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+	ct->proto.dccp.handshake_seq = 0;
+	return true;
+
+out_invalid:
+	if (LOG_INVALID(net, IPPROTO_DCCP))
+		nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+			      NULL, "%s", msg);
+	return false;
+}
+
+/*
+ * Return the 48-bit acknowledgement number stored in the ack sub-header
+ * that follows the basic header of @dh.
+ *
+ * NOTE(review): this reads past the basic header through @dh; callers
+ * obtain @dh with skb_header_pointer() sized for struct dccp_hdr only, so
+ * confirm the bytes behind it are always valid (e.g. for non-linear skbs).
+ */
+static u64 dccp_ack_seq(const struct dccp_hdr *dh)
+{
+	const struct dccp_hdr_ack_bits *dhack;
+	u64 high, low;
+
+	dhack = (void *)dh + __dccp_basic_hdr_len(dh);
+	high = ntohs(dhack->dccph_ack_nr_high);
+	low = ntohl(dhack->dccph_ack_nr_low);
+
+	return (high << 32) + low;
+}
+
+/* Return the DCCP timeout table of @net. */
+static unsigned int *dccp_get_timeouts(struct net *net)
+{
+	struct dccp_net *dn = dccp_pernet(net);
+
+	return dn->dccp_timeout;
+}
+
+/*
+ * Run one packet of an existing conntrack through the DCCP state table.
+ *
+ * The new state is looked up from the role of the packet's direction, the
+ * packet type and the current state.  Depending on the result the
+ * conntrack's roles, handshake sequence number and ASSURED bit are updated.
+ *
+ * Returns NF_ACCEPT for accepted or ignored packets and -NF_ACCEPT for an
+ * invalid state transition.  Ignored and invalid packets neither change
+ * the state nor refresh the timeout.
+ */
+static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, enum ip_conntrack_info ctinfo,
+ u_int8_t pf, unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ struct net *net = nf_ct_net(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct dccp_hdr _dh, *dh;
+ u_int8_t type, old_state, new_state;
+ enum ct_dccp_roles role;
+
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+ BUG_ON(dh == NULL);
+ type = dh->dccph_type;
+
+ if (type == DCCP_PKT_RESET &&
+ !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ /* Tear down connection immediately if only reply is a RESET */
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_ACCEPT;
+ }
+
+ /* ct->lock is held while ct->proto.dccp is read and updated below */
+ spin_lock_bh(&ct->lock);
+
+ role = ct->proto.dccp.role[dir];
+ old_state = ct->proto.dccp.state;
+ new_state = dccp_state_table[role][type][old_state];
+
+ switch (new_state) {
+ case CT_DCCP_REQUEST:
+ if (old_state == CT_DCCP_TIMEWAIT &&
+ role == CT_DCCP_ROLE_SERVER) {
+ /* Reincarnation in the reverse direction: reopen and
+ * reverse client/server roles. */
+ ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
+ ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
+ }
+ break;
+ case CT_DCCP_RESPOND:
+ /* remember the packet's sequence number for the Ack check below */
+ if (old_state == CT_DCCP_REQUEST)
+ ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+ break;
+ case CT_DCCP_PARTOPEN:
+ /* an Ack carrying the stored handshake sequence marks the ct assured */
+ if (old_state == CT_DCCP_RESPOND &&
+ type == DCCP_PKT_ACK &&
+ dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ break;
+ case CT_DCCP_IGNORE:
+ /*
+ * Connection tracking might be out of sync, so we ignore
+ * packets that might establish a new connection and resync
+ * if the server responds with a valid Response.
+ */
+ if (ct->proto.dccp.last_dir == !dir &&
+ ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
+ type == DCCP_PKT_RESPONSE) {
+ ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
+ ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
+ ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+ new_state = CT_DCCP_RESPOND;
+ break;
+ }
+ /* no resync: record what was seen, keep the state and timer as they are */
+ ct->proto.dccp.last_dir = dir;
+ ct->proto.dccp.last_pkt = type;
+
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_DCCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_dccp: invalid packet ignored ");
+ return NF_ACCEPT;
+ case CT_DCCP_INVALID:
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_DCCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_dccp: invalid state transition ");
+ return -NF_ACCEPT;
+ }
+
+ /* valid transition: commit the new state under the lock */
+ ct->proto.dccp.last_dir = dir;
+ ct->proto.dccp.last_pkt = type;
+ ct->proto.dccp.state = new_state;
+ spin_unlock_bh(&ct->lock);
+
+ if (new_state != old_state)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+ /* the timeout is picked by the state just entered */
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
+
+ return NF_ACCEPT;
+}
+
+/*
+ * Sanity-check the DCCP header at @dataoff in @skb.
+ *
+ * Checks, in order: the basic header is readable, the data offset lies
+ * between the basic header size and the packet length, the checksum
+ * coverage fits in the packet, the checksum is correct (only when the
+ * per-net checksum sysctl is set and the hook is PRE_ROUTING) and the
+ * packet type is below DCCP_PKT_INVALID.
+ *
+ * Returns NF_ACCEPT when all checks pass, -NF_ACCEPT otherwise; failures
+ * are logged when LOG_INVALID() says so.
+ */
+static int dccp_error(struct net *net, struct nf_conn *tmpl,
+		      struct sk_buff *skb, unsigned int dataoff,
+		      enum ip_conntrack_info *ctinfo,
+		      u_int8_t pf, unsigned int hooknum)
+{
+	struct dccp_hdr _dh, *dh;
+	unsigned int dccp_len = skb->len - dataoff;
+	unsigned int hdr_len;
+	unsigned int cscov;
+	const char *msg;
+
+	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+	if (!dh) {
+		msg = "nf_ct_dccp: short packet ";
+		goto out_invalid;
+	}
+
+	/* dccph_doff counts 32-bit words */
+	hdr_len = dh->dccph_doff * 4;
+	if (hdr_len < sizeof(struct dccp_hdr) || hdr_len > dccp_len) {
+		msg = "nf_ct_dccp: truncated/malformed packet ";
+		goto out_invalid;
+	}
+
+	if (dh->dccph_cscov == 0) {
+		/* zero coverage field: checksum spans the whole packet */
+		cscov = dccp_len;
+	} else {
+		cscov = (dh->dccph_cscov - 1) * 4;
+		if (cscov > dccp_len) {
+			msg = "nf_ct_dccp: bad checksum coverage ";
+			goto out_invalid;
+		}
+	}
+
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
+				pf)) {
+		msg = "nf_ct_dccp: bad checksum ";
+		goto out_invalid;
+	}
+
+	if (dh->dccph_type >= DCCP_PKT_INVALID) {
+		msg = "nf_ct_dccp: reserved packet type ";
+		goto out_invalid;
+	}
+
+	return NF_ACCEPT;
+
+out_invalid:
+	if (LOG_INVALID(net, IPPROTO_DCCP))
+		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg);
+	return -NF_ACCEPT;
+}
+
+/* Print the source and destination ports of @tuple to @s in host order. */
+static int dccp_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{
+	u16 sport = ntohs(tuple->src.u.dccp.port);
+	u16 dport = ntohs(tuple->dst.u.dccp.port);
+
+	return seq_printf(s, "sport=%hu dport=%hu ", sport, dport);
+}
+
+/* Print the name of the current DCCP state of @ct to @s. */
+static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	const char *state_name = dccp_state_names[ct->proto.dccp.state];
+
+	return seq_printf(s, "%s ", state_name);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+/*
+ * Dump the DCCP-specific conntrack state of @ct into @skb as a nested
+ * CTA_PROTOINFO_DCCP attribute holding the state, the role of the original
+ * direction and the handshake sequence number.
+ *
+ * ct->lock is held while the fields are read.  Returns 0 on success and -1
+ * when one of the attributes could not be added.
+ */
+static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+			  struct nf_conn *ct)
+{
+	struct nlattr *nest;
+	int ret = -1;
+
+	spin_lock_bh(&ct->lock);
+	nest = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);
+	if (nest &&
+	    !nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) &&
+	    !nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
+			ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) &&
+	    !nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
+			  cpu_to_be64(ct->proto.dccp.handshake_seq))) {
+		nla_nest_end(skb, nest);
+		ret = 0;
+	}
+	spin_unlock_bh(&ct->lock);
+
+	return ret;
+}
+
+static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { /* types of the attributes nested inside CTA_PROTOINFO_DCCP */
+ [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, /* conntrack state (ct->proto.dccp.state) */
+ [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, /* role of the original direction */
+ [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, /* handshake sequence number, carried as a be64 */
+};
+
+/*
+ * Load the DCCP-specific conntrack state of @ct from the nested
+ * CTA_PROTOINFO_DCCP attribute in @cda.
+ *
+ * A missing CTA_PROTOINFO_DCCP attribute is not an error (returns 0 and
+ * leaves @ct untouched).  State and role are mandatory and range-checked;
+ * the handshake sequence number is optional.  Returns 0 on success, a
+ * negative parse error, or -EINVAL for missing/out-of-range values.
+ */
+static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
+{
+	struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
+	struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
+	unsigned char state, role;
+	int orig_is_client;
+	int err;
+
+	if (!attr)
+		return 0;
+
+	err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr,
+			       dccp_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_PROTOINFO_DCCP_STATE] || !tb[CTA_PROTOINFO_DCCP_ROLE])
+		return -EINVAL;
+
+	role = nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]);
+	state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
+	if (role > CT_DCCP_ROLE_MAX || state >= CT_DCCP_IGNORE)
+		return -EINVAL;
+
+	orig_is_client = (role == CT_DCCP_ROLE_CLIENT);
+
+	spin_lock_bh(&ct->lock);
+	ct->proto.dccp.state = state;
+	/* the reply direction always gets the opposite role */
+	ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] =
+		orig_is_client ? CT_DCCP_ROLE_CLIENT : CT_DCCP_ROLE_SERVER;
+	ct->proto.dccp.role[IP_CT_DIR_REPLY] =
+		orig_is_client ? CT_DCCP_ROLE_SERVER : CT_DCCP_ROLE_CLIENT;
+	if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])
+		ct->proto.dccp.handshake_seq =
+		be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
+	spin_unlock_bh(&ct->lock);
+
+	return 0;
+}
+
+static int dccp_nlattr_size(void) /* size of the nest header plus every attribute listed in dccp_nla_policy */
+{
+ return nla_total_size(0) /* CTA_PROTOINFO_DCCP */
+ + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1);
+}
+
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+/*
+ * Build a DCCP timeout policy object in @data from the netlink attributes
+ * in @tb.  Every state starts with the per-netns default; each attribute
+ * that is present overrides one entry (value given in seconds, stored in
+ * jiffies).  Always returns 0.
+ */
+static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
+				      struct net *net, void *data)
+{
+	struct dccp_net *dn = dccp_pernet(net);
+	unsigned int *timeouts = data;
+	int state, attr;
+
+	/* set default DCCP timeouts. */
+	for (state = 0; state < CT_DCCP_MAX; state++)
+		timeouts[state] = dn->dccp_timeout[state];
+
+	/*
+	 * there's a 1:1 mapping between attributes and protocol states,
+	 * so the attribute number indexes timeouts[] directly.
+	 */
+	for (attr = CTA_TIMEOUT_DCCP_UNSPEC + 1;
+	     attr <= CTA_TIMEOUT_DCCP_MAX; attr++) {
+		if (!tb[attr])
+			continue;
+		timeouts[attr] = ntohl(nla_get_be32(tb[attr])) * HZ;
+	}
+
+	return 0;
+}
+
+/*
+ * Dump the DCCP timeout policy object @data into @skb, one be32 attribute
+ * per state, converting jiffies back to seconds.  Returns 0 on success or
+ * -ENOSPC when an attribute could not be added.
+ */
+static int
+dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+	const unsigned int *timeouts = data;
+	int attr;
+
+	for (attr = CTA_TIMEOUT_DCCP_UNSPEC + 1;
+	     attr <= CTA_TIMEOUT_DCCP_MAX; attr++) {
+		if (nla_put_be32(skb, attr, htonl(timeouts[attr] / HZ)))
+			return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy
+dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { /* one 32-bit timeout attribute per DCCP state */
+ [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Template table: dccp_kmemdup_sysctl_table() duplicates it and fills in the
+ * .data pointers by position, so the entry order here must not change.
+ */
+static struct ctl_table dccp_sysctl_table[] = {
+ { /* [0] */
+ .procname = "nf_conntrack_dccp_timeout_request",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [1] */
+ .procname = "nf_conntrack_dccp_timeout_respond",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [2] */
+ .procname = "nf_conntrack_dccp_timeout_partopen",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [3] */
+ .procname = "nf_conntrack_dccp_timeout_open",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [4] */
+ .procname = "nf_conntrack_dccp_timeout_closereq",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [5] */
+ .procname = "nf_conntrack_dccp_timeout_closing",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [6] */
+ .procname = "nf_conntrack_dccp_timeout_timewait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { /* [7] plain integer, not a timeout */
+ .procname = "nf_conntrack_dccp_loose",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { } /* terminator */
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Give @pn its own copy of the DCCP sysctl template and point each entry at
+ * the matching field of @dn.  Does nothing when a table is already attached
+ * or when sysctl support is compiled out.
+ *
+ * Returns 0 on success or -ENOMEM when the copy cannot be allocated.
+ */
+static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
+				     struct dccp_net *dn)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *table;
+
+	if (pn->ctl_table)
+		return 0;
+
+	table = kmemdup(dccp_sysctl_table, sizeof(dccp_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	/* indices follow the entry order of dccp_sysctl_table */
+	table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
+	table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
+	table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
+	table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
+	table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
+	table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
+	table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
+	table[7].data = &dn->dccp_loose;
+
+	/* Don't export sysctls to unprivileged users */
+	if (net->user_ns != &init_user_ns)
+		table[0].procname = NULL;
+
+	pn->ctl_table = table;
+#endif
+	return 0;
+}
+
+/*
+ * Per-netns initialisation for the DCCP tracker.  The default timeouts and
+ * the "loose" flag are only written while the per-netns user count is still
+ * zero; the sysctl table is then set up.
+ *
+ * Returns the result of dccp_kmemdup_sysctl_table().
+ */
+static int dccp_init_net(struct net *net, u_int16_t proto)
+{
+	struct dccp_net *dn = dccp_pernet(net);
+	struct nf_proto_net *pn = &dn->pn;
+
+	if (pn->users == 0) {
+		/* default values */
+		dn->dccp_timeout[CT_DCCP_REQUEST]	= 2 * DCCP_MSL;
+		dn->dccp_timeout[CT_DCCP_RESPOND]	= 4 * DCCP_MSL;
+		dn->dccp_timeout[CT_DCCP_PARTOPEN]	= 4 * DCCP_MSL;
+		dn->dccp_timeout[CT_DCCP_OPEN]		= 12 * 3600 * HZ;
+		dn->dccp_timeout[CT_DCCP_CLOSEREQ]	= 64 * HZ;
+		dn->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ;
+		dn->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL;
+		dn->dccp_loose = 1;
+	}
+
+	return dccp_kmemdup_sysctl_table(net, pn, dn);
+}
+
+static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { /* DCCP tracker callbacks for AF_INET */
+ .l3proto = AF_INET,
+ .l4proto = IPPROTO_DCCP,
+ .name = "dccp",
+ .pkt_to_tuple = dccp_pkt_to_tuple,
+ .invert_tuple = dccp_invert_tuple,
+ .new = dccp_new,
+ .packet = dccp_packet,
+ .get_timeouts = dccp_get_timeouts,
+ .error = dccp_error,
+ .print_tuple = dccp_print_tuple,
+ .print_conntrack = dccp_print_conntrack,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = dccp_to_nlattr,
+ .nlattr_size = dccp_nlattr_size,
+ .from_nlattr = nlattr_to_dccp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, /* tuple <-> netlink conversion uses the shared port helpers */
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
+ .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, /* one timeout slot per state index below CT_DCCP_MAX */
+ .nla_policy = dccp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &dccp_net_id,
+ .init_net = dccp_init_net,
+};
+
+static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { /* DCCP tracker callbacks for AF_INET6; same handlers as dccp_proto4 */
+ .l3proto = AF_INET6,
+ .l4proto = IPPROTO_DCCP,
+ .name = "dccp",
+ .pkt_to_tuple = dccp_pkt_to_tuple,
+ .invert_tuple = dccp_invert_tuple,
+ .new = dccp_new,
+ .packet = dccp_packet,
+ .get_timeouts = dccp_get_timeouts,
+ .error = dccp_error,
+ .print_tuple = dccp_print_tuple,
+ .print_conntrack = dccp_print_conntrack,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = dccp_to_nlattr,
+ .nlattr_size = dccp_nlattr_size,
+ .from_nlattr = nlattr_to_dccp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, /* tuple <-> netlink conversion uses the shared port helpers */
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
+ .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, /* one timeout slot per state index below CT_DCCP_MAX */
+ .nla_policy = dccp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &dccp_net_id,
+ .init_net = dccp_init_net,
+};
+
+/*
+ * Register the IPv4 and IPv6 DCCP trackers for @net.  If the IPv6
+ * registration fails the IPv4 one is rolled back, so the function either
+ * registers both or neither.  Returns 0 or the negative registration error.
+ */
+static __net_init int dccp_net_init(struct net *net)
+{
+	int ret;
+
+	ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_dccp4: pernet registration failed.\n");
+		return ret;
+	}
+
+	ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6);
+	if (ret < 0) {
+		pr_err("nf_conntrack_dccp6: pernet registration failed.\n");
+		nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
+		return ret;
+	}
+
+	return 0;
+}
+
+static __net_exit void dccp_net_exit(struct net *net) /* undo dccp_net_init() for @net */
+{
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto6); /* reverse of the registration order */
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
+}
+
+static struct pernet_operations dccp_net_ops = { /* per-netns hooks and storage for the DCCP tracker */
+ .init = dccp_net_init,
+ .exit = dccp_net_exit,
+ .id = &dccp_net_id,
+ .size = sizeof(struct dccp_net), /* per-netns area, one struct dccp_net */
+};
+
+/*
+ * Module init: register the pernet operations first, then the IPv4 and IPv6
+ * DCCP trackers.  Any failure unwinds the steps already completed, in
+ * reverse order, and returns the negative error.
+ */
+static int __init nf_conntrack_proto_dccp_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&dccp_net_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_ct_l4proto_register(&dccp_proto4);
+	if (ret < 0) {
+		unregister_pernet_subsys(&dccp_net_ops);
+		return ret;
+	}
+
+	ret = nf_ct_l4proto_register(&dccp_proto6);
+	if (ret < 0) {
+		nf_ct_l4proto_unregister(&dccp_proto4);
+		unregister_pernet_subsys(&dccp_net_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit nf_conntrack_proto_dccp_fini(void) /* module exit: tear down in reverse order of nf_conntrack_proto_dccp_init() */
+{
+ nf_ct_l4proto_unregister(&dccp_proto6);
+ nf_ct_l4proto_unregister(&dccp_proto4);
+ unregister_pernet_subsys(&dccp_net_ops);
+}
+
+module_init(nf_conntrack_proto_dccp_init);
+module_exit(nf_conntrack_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 46bc27e2756..d25f2937764 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -4,38 +4,38 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - enable working with L3 protocol independent connection tracking.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c
*/
#include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+
+static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ;
-unsigned int nf_ct_generic_timeout = 600*HZ;
+static inline struct nf_generic_net *generic_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.generic;
+}
-static int generic_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+static bool generic_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
{
tuple->src.u.all = 0;
tuple->dst.u.all = 0;
- return 1;
+ return true;
}
-static int generic_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
tuple->src.u.all = 0;
tuple->dst.u.all = 0;
- return 1;
+ return true;
}
/* Print out the per-protocol part of the tuple. */
@@ -45,41 +45,172 @@ static int generic_print_tuple(struct seq_file *s,
return 0;
}
-/* Print out the private part of the conntrack. */
-static int generic_print_conntrack(struct seq_file *s,
- const struct nf_conn *state)
+static unsigned int *generic_get_timeouts(struct net *net)
{
- return 0;
+ return &(generic_pernet(net)->timeout);
}
/* Returns verdict for packet, or -1 for invalid. */
-static int packet(struct nf_conn *conntrack,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- int pf,
- unsigned int hooknum)
+static int generic_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeout)
{
- nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout);
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
-static int new(struct nf_conn *conntrack, const struct sk_buff *skb,
- unsigned int dataoff)
+static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ return true;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+/*
+ * Fill the generic timeout policy object @data from @tb: use the supplied
+ * CTA_TIMEOUT_GENERIC_TIMEOUT (seconds, stored as jiffies) when present,
+ * otherwise the per-netns default.  Always returns 0.
+ */
+static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
+					 struct net *net, void *data)
+{
+	struct nf_generic_net *gn = generic_pernet(net);
+	struct nlattr *attr = tb[CTA_TIMEOUT_GENERIC_TIMEOUT];
+	unsigned int *timeout = data;
+
+	if (!attr) {
+		/* Set default generic timeout. */
+		*timeout = gn->timeout;
+		return 0;
+	}
+
+	*timeout = ntohl(nla_get_be32(attr)) * HZ;
+	return 0;
+}
+
+/*
+ * Dump the generic timeout policy object @data into @skb as a single be32
+ * attribute expressed in seconds.  Returns 0 or -ENOSPC.
+ */
+static int
+generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+	const unsigned int *timeout = data;
+
+	if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT,
+			 htonl(*timeout / HZ)))
+		return -ENOSPC;
+
+	return 0;
+}
+
+static const struct nla_policy
+generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { /* the generic tracker has a single 32-bit timeout attribute */
+ [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table generic_sysctl_table[] = { /* template; generic_kmemdup_sysctl_table() copies it and sets entry 0's .data */
+ {
+ .procname = "nf_conntrack_generic_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { } /* terminator */
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table generic_compat_sysctl_table[] = { /* template for the ip_conntrack_* compat name; its copy points at the same timeout field */
+ {
+ .procname = "ip_conntrack_generic_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { } /* terminator */
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Attach a private copy of generic_sysctl_table to @pn and make its only
+ * entry operate on @gn->timeout.  No-op without CONFIG_SYSCTL.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
+					struct nf_generic_net *gn)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *table;
+
+	table = kmemdup(generic_sysctl_table, sizeof(generic_sysctl_table),
+			GFP_KERNEL);
+	pn->ctl_table = table;
+	if (!table)
+		return -ENOMEM;
+
+	table[0].data = &gn->timeout;
+#endif
+	return 0;
+}
+
+/*
+ * Attach a private copy of generic_compat_sysctl_table to @pn and make its
+ * only entry operate on @gn->timeout.  No-op unless both CONFIG_SYSCTL and
+ * CONFIG_NF_CONNTRACK_PROC_COMPAT are set.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+					       struct nf_generic_net *gn)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	struct ctl_table *table;
+
+	table = kmemdup(generic_compat_sysctl_table,
+			sizeof(generic_compat_sysctl_table),
+			GFP_KERNEL);
+	pn->ctl_compat_table = table;
+	if (!table)
+		return -ENOMEM;
+
+	table[0].data = &gn->timeout;
+#endif
+	return 0;
+}
+
+/*
+ * Per-netns initialisation of the generic tracker: seed the timeout from
+ * the module-wide default, then create the compat and regular sysctl
+ * tables.  If the regular table cannot be created the compat table is
+ * freed again.  Returns 0 or a negative error.
+ */
+static int generic_init_net(struct net *net, u_int16_t proto)
+{
+	struct nf_generic_net *gn = generic_pernet(net);
+	struct nf_proto_net *pn = &gn->pn;
+	int ret;
+
+	gn->timeout = nf_ct_generic_timeout;
+
+	ret = generic_kmemdup_compat_sysctl_table(pn, gn);
+	if (ret >= 0) {
+		ret = generic_kmemdup_sysctl_table(pn, gn);
+		if (ret < 0)
+			nf_ct_kfree_compat_sysctl_table(pn);
+	}
+
+	return ret;
+}
+
+static struct nf_proto_net *generic_get_net_proto(struct net *net)
{
- return 1;
+ return &net->ct.nf_ct_proto.generic.pn;
}
-struct nf_conntrack_protocol nf_conntrack_generic_protocol =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
{
.l3proto = PF_UNSPEC,
- .proto = 0,
+ .l4proto = 255,
.name = "unknown",
.pkt_to_tuple = generic_pkt_to_tuple,
.invert_tuple = generic_invert_tuple,
.print_tuple = generic_print_tuple,
- .print_conntrack = generic_print_conntrack,
- .packet = packet,
- .new = new,
+ .packet = generic_packet,
+ .get_timeouts = generic_get_timeouts,
+ .new = generic_new,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = generic_timeout_nlattr_to_obj,
+ .obj_to_nlattr = generic_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GENERIC_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = generic_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = generic_init_net,
+ .get_net_proto = generic_get_net_proto,
};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
new file mode 100644
index 00000000000..d5665739e3b
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -0,0 +1,447 @@
+/*
+ * ip_conntrack_proto_gre.c - Version 3.0
+ *
+ * Connection tracking protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * well suited for NAT, as it has no protocol-specific part such as port numbers.
+ *
+ * It has an optional key field, which may help us distinguish two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+enum grep_conntrack { /* indices into the GRE timeout arrays */
+ GRE_CT_UNREPLIED, /* slot for the timeout used until a reply has been seen */
+ GRE_CT_REPLIED, /* slot for the timeout used once traffic was seen both ways */
+ GRE_CT_MAX /* number of slots, not a state */
+};
+
+static unsigned int gre_timeouts[GRE_CT_MAX] = { /* defaults in jiffies, copied into each netns by gre_init_net() */
+ [GRE_CT_UNREPLIED] = 30*HZ,
+ [GRE_CT_REPLIED] = 180*HZ,
+};
+
+static int proto_gre_net_id __read_mostly;
+struct netns_proto_gre { /* per-netns state of the GRE tracker */
+ struct nf_proto_net nf;
+ rwlock_t keymap_lock; /* protects keymap_list */
+ struct list_head keymap_list; /* list of struct nf_ct_gre_keymap entries */
+ unsigned int gre_timeouts[GRE_CT_MAX]; /* per-netns timeouts, indexed by GRE_CT_* */
+};
+
+static inline struct netns_proto_gre *gre_pernet(struct net *net) /* return the GRE tracker state of @net */
+{
+ return net_generic(net, proto_gre_net_id);
+}
+
+/* Unlink and free every keymap entry of @net, holding the write lock. */
+static void nf_ct_gre_keymap_flush(struct net *net)
+{
+	struct netns_proto_gre *net_gre = gre_pernet(net);
+	struct nf_ct_gre_keymap *entry, *next;
+
+	write_lock_bh(&net_gre->keymap_lock);
+	/* the _safe iterator is required because entries are freed in the loop */
+	list_for_each_entry_safe(entry, next, &net_gre->keymap_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	write_unlock_bh(&net_gre->keymap_lock);
+}
+
+/*
+ * Return 1 when keymap entry @km matches tuple @t on L3 family, both
+ * addresses, protocol number and destination key, 0 otherwise.  The source
+ * key is deliberately not compared: it is what a lookup wants to find.
+ */
+static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
+				const struct nf_conntrack_tuple *t)
+{
+	const struct nf_conntrack_tuple *k = &km->tuple;
+
+	if (k->src.l3num != t->src.l3num)
+		return 0;
+	if (memcmp(&k->src.u3, &t->src.u3, sizeof(t->src.u3)))
+		return 0;
+	if (memcmp(&k->dst.u3, &t->dst.u3, sizeof(t->dst.u3)))
+		return 0;
+	if (k->dst.protonum != t->dst.protonum)
+		return 0;
+
+	return k->dst.u.all == t->dst.u.all;
+}
+
+/*
+ * Look up the source key for a given tuple.
+ *
+ * Walks the keymap list of @net under the read lock and returns the source
+ * GRE key (big-endian, as stored in the tuple) of the first entry matching
+ * @t, or 0 when no entry matches.
+ */
+static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
+{
+	struct netns_proto_gre *net_gre = gre_pernet(net);
+	struct nf_ct_gre_keymap *km;
+	__be16 key = 0;
+
+	read_lock_bh(&net_gre->keymap_lock);
+	list_for_each_entry(km, &net_gre->keymap_list, list) {
+		if (gre_key_cmpfn(km, t)) {
+			key = km->tuple.src.u.gre.key;
+			break;
+		}
+	}
+	read_unlock_bh(&net_gre->keymap_lock);
+
+	/*
+	 * key is big-endian: convert it for printing so the debug output
+	 * shows the same value as gre_print_tuple() on every architecture.
+	 */
+	pr_debug("lookup src key 0x%x for ", ntohs(key));
+	nf_ct_dump_tuple(t);
+
+	return key;
+}
+
+/*
+ * Add a single keymap entry for direction @dir and associate it with the
+ * master conntrack @ct.
+ *
+ * If @ct already owns an entry for @dir, the call succeeds only when that
+ * entry is on the list and matches @t (a retransmission); otherwise it
+ * fails with -EEXIST.  Returns 0 on success and -ENOMEM when the new entry
+ * cannot be allocated.
+ */
+int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
+			 struct nf_conntrack_tuple *t)
+{
+	struct net *net = nf_ct_net(ct);
+	struct netns_proto_gre *net_gre = gre_pernet(net);
+	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
+	struct nf_ct_gre_keymap **slot = &ct_pptp_info->keymap[dir];
+	struct nf_ct_gre_keymap *km;
+
+	if (*slot) {
+		int retransmission = 0;
+
+		/* check whether it's a retransmission */
+		read_lock_bh(&net_gre->keymap_lock);
+		list_for_each_entry(km, &net_gre->keymap_list, list) {
+			if (km == *slot && gre_key_cmpfn(km, t)) {
+				retransmission = 1;
+				break;
+			}
+		}
+		read_unlock_bh(&net_gre->keymap_lock);
+
+		if (retransmission)
+			return 0;
+
+		pr_debug("trying to override keymap_%s for ct %p\n",
+			 dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
+		return -EEXIST;
+	}
+
+	km = kmalloc(sizeof(*km), GFP_ATOMIC);
+	if (!km)
+		return -ENOMEM;
+	memcpy(&km->tuple, t, sizeof(*t));
+	*slot = km;
+
+	pr_debug("adding new entry %p: ", km);
+	nf_ct_dump_tuple(&km->tuple);
+
+	write_lock_bh(&net_gre->keymap_lock);
+	list_add_tail(&km->list, &net_gre->keymap_list);
+	write_unlock_bh(&net_gre->keymap_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
+
+/*
+ * Destroy the keymap entries associated with the master conntrack @ct:
+ * for both directions, unlink the entry from the per-netns list, free it
+ * and clear the pointer kept in the PPTP helper data.
+ */
+void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	struct netns_proto_gre *net_gre = gre_pernet(net);
+	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
+	enum ip_conntrack_dir dir;
+
+	pr_debug("entering for ct %p\n", ct);
+
+	write_lock_bh(&net_gre->keymap_lock);
+	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
+		struct nf_ct_gre_keymap *km = ct_pptp_info->keymap[dir];
+
+		if (!km)
+			continue;
+
+		pr_debug("removing %p from list\n", km);
+		list_del(&km->list);
+		kfree(km);
+		ct_pptp_info->keymap[dir] = NULL;
+	}
+	write_unlock_bh(&net_gre->keymap_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
+
+/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
+
+/*
+ * Invert the GRE part of a tuple: the reply direction uses the original
+ * destination key as its source key and vice versa.  Always succeeds.
+ */
+static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
+			     const struct nf_conntrack_tuple *orig)
+{
+	tuple->src.u.gre.key = orig->dst.u.gre.key;
+	tuple->dst.u.gre.key = orig->src.u.gre.key;
+
+	return true;
+}
+
+/*
+ * Fill the GRE part of @tuple from the GRE header found at @dataoff.
+ *
+ * Packets whose header cannot be read or that are not PPTP-version GRE get
+ * zeroed keys, like the generic tracker.  For PPTP GRE the destination key
+ * is the call ID from the header and the source key comes from the keymap.
+ * Returns false only for a PPTP-version header carrying another protocol.
+ */
+static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			     struct nf_conntrack_tuple *tuple)
+{
+	struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev);
+	const struct gre_hdr_pptp *pptp;
+	struct gre_hdr_pptp pptp_buf;
+	const struct gre_hdr *gre;
+	struct gre_hdr gre_buf;
+
+	/* first only delinearize old RFC1701 GRE header */
+	gre = skb_header_pointer(skb, dataoff, sizeof(gre_buf), &gre_buf);
+	if (gre == NULL || gre->version != GRE_VERSION_PPTP) {
+		/* try to behave like "nf_conntrack_proto_generic" */
+		tuple->src.u.all = 0;
+		tuple->dst.u.all = 0;
+		return true;
+	}
+
+	/* PPTP header is variable length, only need up to the call_id field */
+	pptp = skb_header_pointer(skb, dataoff, 8, &pptp_buf);
+	if (pptp == NULL)
+		return true;
+
+	if (ntohs(gre->protocol) != GRE_PROTOCOL_PPTP) {
+		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+		return false;
+	}
+
+	/* the destination key must be in place before the keymap lookup */
+	tuple->dst.u.gre.key = pptp->call_id;
+	tuple->src.u.gre.key = gre_keymap_lookup(net, tuple);
+
+	return true;
+}
+
+/* Print the GRE part of @tuple (source and destination key) to @s. */
+static int gre_print_tuple(struct seq_file *s,
+			   const struct nf_conntrack_tuple *tuple)
+{
+	/* keys are converted from network byte order for printing */
+	unsigned short srckey = ntohs(tuple->src.u.gre.key);
+	unsigned short dstkey = ntohs(tuple->dst.u.gre.key);
+
+	return seq_printf(s, "srckey=0x%x dstkey=0x%x ", srckey, dstkey);
+}
+
+/* print private data for conntrack: both GRE timeouts of @ct, in seconds */
+static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+ return seq_printf(s, "timeout=%u, stream_timeout=%u ",
+ (ct->proto.gre.timeout / HZ), /* jiffies -> seconds */
+ (ct->proto.gre.stream_timeout / HZ));
+}
+
+static unsigned int *gre_get_timeouts(struct net *net) /* return @net's GRE timeout array (GRE_CT_MAX entries) */
+{
+ return gre_pernet(net)->gre_timeouts;
+}
+
+/*
+ * Returns verdict for packet, and may modify conntrack.
+ *
+ * Refreshes the conntrack timer with the per-connection GRE timeouts kept
+ * in @ct (the @timeouts argument is not used here) and always accepts.
+ */
+static int gre_packet(struct nf_conn *ct,
+		      const struct sk_buff *skb,
+		      unsigned int dataoff,
+		      enum ip_conntrack_info ctinfo,
+		      u_int8_t pf,
+		      unsigned int hooknum,
+		      unsigned int *timeouts)
+{
+	if (!(ct->status & IPS_SEEN_REPLY)) {
+		/* one-way traffic so far: keep the short timeout */
+		nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout);
+		return NF_ACCEPT;
+	}
+
+	/* If we've seen traffic both ways, this is a GRE connection.
+	 * Extend timeout. */
+	nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout);
+
+	/* Also, more likely to be important, and not a probe. */
+	if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+		nf_conntrack_event_cache(IPCT_ASSURED, ct);
+
+	return NF_ACCEPT;
+}
+
+/*
+ * Called when a new connection for this protocol is found.  Seeds the
+ * per-connection timeouts from @timeouts and always accepts the connection.
+ */
+static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
+		    unsigned int dataoff, unsigned int *timeouts)
+{
+	pr_debug(": ");
+	nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+
+	/*
+	 * Initialize to sane values.  Ideally a conntrack helper
+	 * (e.g. in the case of pptp) increases them later.
+	 */
+	ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
+	ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
+
+	return true;
+}
+
+/*
+ * Called when a conntrack entry has already been removed from the hashes
+ * and is about to be deleted from memory.  Drops the keymap entries held
+ * by the master conntrack, if there is one.
+ */
+static void gre_destroy(struct nf_conn *ct)
+{
+	struct nf_conn *master = ct->master;
+
+	pr_debug(" entering\n");
+
+	if (!master) {
+		pr_debug("no master !?!\n");
+		return;
+	}
+
+	nf_ct_gre_keymap_destroy(master);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+/*
+ * Build a GRE timeout policy object in @data from @tb.  Each of the two
+ * timeouts takes the supplied attribute (seconds, stored as jiffies) when
+ * present and the per-netns default otherwise.  Always returns 0.
+ */
+static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
+				     struct net *net, void *data)
+{
+	struct netns_proto_gre *net_gre = gre_pernet(net);
+	struct nlattr *unreplied = tb[CTA_TIMEOUT_GRE_UNREPLIED];
+	struct nlattr *replied = tb[CTA_TIMEOUT_GRE_REPLIED];
+	unsigned int *timeouts = data;
+
+	if (unreplied)
+		timeouts[GRE_CT_UNREPLIED] =
+			ntohl(nla_get_be32(unreplied)) * HZ;
+	else
+		timeouts[GRE_CT_UNREPLIED] =
+			net_gre->gre_timeouts[GRE_CT_UNREPLIED];
+
+	if (replied)
+		timeouts[GRE_CT_REPLIED] =
+			ntohl(nla_get_be32(replied)) * HZ;
+	else
+		timeouts[GRE_CT_REPLIED] =
+			net_gre->gre_timeouts[GRE_CT_REPLIED];
+
+	return 0;
+}
+
+/*
+ * Dump the GRE timeout policy object @data into @skb as two be32
+ * attributes expressed in seconds.  Returns 0 or -ENOSPC.
+ */
+static int
+gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+	const unsigned int *timeouts = data;
+
+	if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED,
+			 htonl(timeouts[GRE_CT_UNREPLIED] / HZ)))
+		return -ENOSPC;
+
+	if (nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED,
+			 htonl(timeouts[GRE_CT_REPLIED] / HZ)))
+		return -ENOSPC;
+
+	return 0;
+}
+
+static const struct nla_policy
+gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { /* both GRE timeout attributes are 32-bit values */
+ [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+/*
+ * Per-netns initialisation of the GRE tracker: copy the default timeouts
+ * and set up the (initially empty) keymap list and its lock.  Returns 0.
+ */
+static int gre_init_net(struct net *net, u_int16_t proto)
+{
+	struct netns_proto_gre *net_gre = gre_pernet(net);
+	int state;
+
+	for (state = 0; state < GRE_CT_MAX; state++)
+		net_gre->gre_timeouts[state] = gre_timeouts[state];
+
+	INIT_LIST_HEAD(&net_gre->keymap_list);
+	rwlock_init(&net_gre->keymap_lock);
+
+	return 0;
+}
+
+/* protocol helper struct: GRE tracker callbacks for AF_INET */
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
+ .l3proto = AF_INET,
+ .l4proto = IPPROTO_GRE,
+ .name = "gre",
+ .pkt_to_tuple = gre_pkt_to_tuple,
+ .invert_tuple = gre_invert_tuple,
+ .print_tuple = gre_print_tuple,
+ .print_conntrack = gre_print_conntrack,
+ .get_timeouts = gre_get_timeouts,
+ .packet = gre_packet,
+ .new = gre_new,
+ .destroy = gre_destroy, /* drops the master's keymap entries */
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, /* tuple <-> netlink conversion uses the shared port helpers */
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = gre_timeout_nlattr_to_obj,
+ .obj_to_nlattr = gre_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GRE_MAX,
+ .obj_size = sizeof(unsigned int) * GRE_CT_MAX, /* one timeout per GRE_CT_* slot */
+ .nla_policy = gre_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &proto_gre_net_id,
+ .init_net = gre_init_net,
+};
+
+/*
+ * Register the GRE tracker for @net.  Logs an error on failure and returns
+ * the registration result unchanged.
+ */
+static int proto_gre_net_init(struct net *net)
+{
+	int ret;
+
+	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4);
+	if (ret < 0)
+		pr_err("nf_conntrack_gre4: pernet registration failed.\n");
+
+	return ret;
+}
+
+static void proto_gre_net_exit(struct net *net) /* per-netns teardown of the GRE tracker */
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
+ nf_ct_gre_keymap_flush(net); /* free whatever keymap entries are still on the list */
+}
+
+static struct pernet_operations proto_gre_net_ops = { /* per-netns hooks and storage for the GRE tracker */
+ .init = proto_gre_net_init,
+ .exit = proto_gre_net_exit,
+ .id = &proto_gre_net_id,
+ .size = sizeof(struct netns_proto_gre), /* per-netns area read back through gre_pernet() */
+};
+
+/*
+ * Module init: register the pernet operations, then the GRE tracker.  If
+ * the tracker cannot be registered the pernet operations are removed again.
+ * Returns 0 or the negative error.
+ */
+static int __init nf_ct_proto_gre_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&proto_gre_net_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
+	if (ret < 0) {
+		unregister_pernet_subsys(&proto_gre_net_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit nf_ct_proto_gre_fini(void) /* module exit: reverse of nf_ct_proto_gre_init() */
+{
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+ unregister_pernet_subsys(&proto_gre_net_ops);
+}
+
+module_init(nf_ct_proto_gre_init);
+module_exit(nf_ct_proto_gre_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index cf798e61e37..1314d33f6bc 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -1,25 +1,18 @@
/*
* Connection tracking protocol helper module for SCTP.
- *
- * SCTP is defined in RFC 2960. References to various sections in this code
+ *
+ * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
+ * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * SCTP is defined in RFC 2960. References to various sections in this code
* are to this RFC.
- *
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - enable working with L3 protocol independent connection tracking.
- *
- * Derived from net/ipv4/ip_conntrack_sctp.c
- */
-
-/*
- * Added support for proc manipulation of timeouts.
*/
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <linux/module.h>
@@ -28,25 +21,19 @@
#include <linux/sctp.h>
#include <linux/string.h>
#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
-
-#if 0
-#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Protects conntrack->proto.sctp */
-static DEFINE_RWLOCK(sctp_lock);
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
- closely. They're more complex. --RR
+ closely. They're more complex. --RR
And so for me for SCTP :D -Kiran */
-static const char *sctp_conntrack_names[] = {
+static const char *const sctp_conntrack_names[] = {
"NONE",
"CLOSED",
"COOKIE_WAIT",
@@ -62,24 +49,15 @@ static const char *sctp_conntrack_names[] = {
#define HOURS * 60 MINS
#define DAYS * 24 HOURS
-static unsigned int nf_ct_sctp_timeout_closed = 10 SECS;
-static unsigned int nf_ct_sctp_timeout_cookie_wait = 3 SECS;
-static unsigned int nf_ct_sctp_timeout_cookie_echoed = 3 SECS;
-static unsigned int nf_ct_sctp_timeout_established = 5 DAYS;
-static unsigned int nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
-static unsigned int nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
-static unsigned int nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
-
-static unsigned int * sctp_timeouts[]
-= { NULL, /* SCTP_CONNTRACK_NONE */
- &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
- &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
- &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
- &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
- &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
- &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
- &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
- };
+static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
+ [SCTP_CONNTRACK_CLOSED] = 10 SECS,
+ [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
+ [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS,
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
+};
#define sNO SCTP_CONNTRACK_NONE
#define sCL SCTP_CONNTRACK_CLOSED
@@ -91,39 +69,39 @@ static unsigned int * sctp_timeouts[]
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
#define sIV SCTP_CONNTRACK_MAX
-/*
+/*
These are the descriptions of the states:
-NOTE: These state names are tantalizingly similar to the states of an
+NOTE: These state names are tantalizingly similar to the states of an
SCTP endpoint. But the interpretation of the states is a little different,
-considering that these are the states of the connection and not of an end
+considering that these are the states of the connection and not of an end
point. Please note the subtleties. -Kiran
NONE - Nothing so far.
-COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
- an INIT_ACK chunk in the reply direction.
+COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
+ an INIT_ACK chunk in the reply direction.
COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
- to that of the SHUTDOWN chunk.
-CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
- the SHUTDOWN chunk. Connection is closed.
+ to that of the SHUTDOWN chunk.
+CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
+ the SHUTDOWN chunk. Connection is closed.
*/
/* TODO
- - I have assumed that the first INIT is in the original direction.
+ - I have assumed that the first INIT is in the original direction.
This messes things when an INIT comes in the reply direction in CLOSED
state.
- - Check the error type in the reply dir before transitioning from
+ - Check the error type in the reply dir before transitioning from
cookie echoed to closed.
- Sec 5.2.4 of RFC 2960
- Multi Homing support.
*/
/* SCTP conntrack state transitions */
-static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
@@ -132,9 +110,9 @@ static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
},
{
@@ -146,327 +124,327 @@ static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
}
};
-static int sctp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+static int sctp_net_id __read_mostly;
+struct sctp_net {
+ struct nf_proto_net pn;
+ unsigned int timeouts[SCTP_CONNTRACK_MAX];
+};
+
+static inline struct sctp_net *sctp_pernet(struct net *net)
{
- sctp_sctphdr_t _hdr, *hp;
+ return net_generic(net, sctp_net_id);
+}
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
+static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct sctphdr *hp;
+ struct sctphdr _hdr;
/* Actually only need first 8 bytes. */
hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
if (hp == NULL)
- return 0;
+ return false;
tuple->src.u.sctp.port = hp->source;
tuple->dst.u.sctp.port = hp->dest;
- return 1;
+ return true;
}
-static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
tuple->src.u.sctp.port = orig->dst.u.sctp.port;
tuple->dst.u.sctp.port = orig->src.u.sctp.port;
- return 1;
+ return true;
}
/* Print out the per-protocol part of the tuple. */
static int sctp_print_tuple(struct seq_file *s,
const struct nf_conntrack_tuple *tuple)
{
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
return seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.sctp.port),
ntohs(tuple->dst.u.sctp.port));
}
/* Print out the private part of the conntrack. */
-static int sctp_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
+static int sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
enum sctp_conntrack state;
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- read_lock_bh(&sctp_lock);
- state = conntrack->proto.sctp.state;
- read_unlock_bh(&sctp_lock);
+ spin_lock_bh(&ct->lock);
+ state = ct->proto.sctp.state;
+ spin_unlock_bh(&ct->lock);
return seq_printf(s, "%s ", sctp_conntrack_names[state]);
}
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
-for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \
- offset < skb->len && \
- (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
- offset += (htons(sch->length) + 3) & ~3, count++)
+for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0; \
+ (offset) < (skb)->len && \
+ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \
+ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
/* Some validity checks to make sure the chunks are fine */
-static int do_basic_checks(struct nf_conn *conntrack,
+static int do_basic_checks(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- char *map)
+ unsigned long *map)
{
u_int32_t offset, count;
sctp_chunkhdr_t _sch, *sch;
int flag;
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
flag = 0;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
- DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
+ pr_debug("Chunk Num: %d Type: %d\n", count, sch->type);
- if (sch->type == SCTP_CID_INIT
- || sch->type == SCTP_CID_INIT_ACK
- || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+ if (sch->type == SCTP_CID_INIT ||
+ sch->type == SCTP_CID_INIT_ACK ||
+ sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
flag = 1;
- }
- /* Cookie Ack/Echo chunks not the first OR
- Init / Init Ack / Shutdown compl chunks not the only chunks */
- if ((sch->type == SCTP_CID_COOKIE_ACK
- || sch->type == SCTP_CID_COOKIE_ECHO
- || flag)
- && count !=0 ) {
- DEBUGP("Basic checks failed\n");
+ /*
+ * Cookie Ack/Echo chunks not the first OR
+ * Init / Init Ack / Shutdown compl chunks not the only chunks
+ * OR zero-length.
+ */
+ if (((sch->type == SCTP_CID_COOKIE_ACK ||
+ sch->type == SCTP_CID_COOKIE_ECHO ||
+ flag) &&
+ count != 0) || !sch->length) {
+ pr_debug("Basic checks failed\n");
return 1;
}
- if (map) {
- set_bit(sch->type, (void *)map);
- }
+ if (map)
+ set_bit(sch->type, map);
}
- DEBUGP("Basic checks passed\n");
- return 0;
+ pr_debug("Basic checks passed\n");
+ return count == 0;
}
-static int new_state(enum ip_conntrack_dir dir,
- enum sctp_conntrack cur_state,
- int chunk_type)
+static int sctp_new_state(enum ip_conntrack_dir dir,
+ enum sctp_conntrack cur_state,
+ int chunk_type)
{
int i;
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
-
- DEBUGP("Chunk type: %d\n", chunk_type);
+ pr_debug("Chunk type: %d\n", chunk_type);
switch (chunk_type) {
- case SCTP_CID_INIT:
- DEBUGP("SCTP_CID_INIT\n");
- i = 0; break;
- case SCTP_CID_INIT_ACK:
- DEBUGP("SCTP_CID_INIT_ACK\n");
- i = 1; break;
- case SCTP_CID_ABORT:
- DEBUGP("SCTP_CID_ABORT\n");
- i = 2; break;
- case SCTP_CID_SHUTDOWN:
- DEBUGP("SCTP_CID_SHUTDOWN\n");
- i = 3; break;
- case SCTP_CID_SHUTDOWN_ACK:
- DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
- i = 4; break;
- case SCTP_CID_ERROR:
- DEBUGP("SCTP_CID_ERROR\n");
- i = 5; break;
- case SCTP_CID_COOKIE_ECHO:
- DEBUGP("SCTP_CID_COOKIE_ECHO\n");
- i = 6; break;
- case SCTP_CID_COOKIE_ACK:
- DEBUGP("SCTP_CID_COOKIE_ACK\n");
- i = 7; break;
- case SCTP_CID_SHUTDOWN_COMPLETE:
- DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
- i = 8; break;
- default:
- /* Other chunks like DATA, SACK, HEARTBEAT and
- its ACK do not cause a change in state */
- DEBUGP("Unknown chunk type, Will stay in %s\n",
- sctp_conntrack_names[cur_state]);
- return cur_state;
+ case SCTP_CID_INIT:
+ pr_debug("SCTP_CID_INIT\n");
+ i = 0;
+ break;
+ case SCTP_CID_INIT_ACK:
+ pr_debug("SCTP_CID_INIT_ACK\n");
+ i = 1;
+ break;
+ case SCTP_CID_ABORT:
+ pr_debug("SCTP_CID_ABORT\n");
+ i = 2;
+ break;
+ case SCTP_CID_SHUTDOWN:
+ pr_debug("SCTP_CID_SHUTDOWN\n");
+ i = 3;
+ break;
+ case SCTP_CID_SHUTDOWN_ACK:
+ pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
+ i = 4;
+ break;
+ case SCTP_CID_ERROR:
+ pr_debug("SCTP_CID_ERROR\n");
+ i = 5;
+ break;
+ case SCTP_CID_COOKIE_ECHO:
+ pr_debug("SCTP_CID_COOKIE_ECHO\n");
+ i = 6;
+ break;
+ case SCTP_CID_COOKIE_ACK:
+ pr_debug("SCTP_CID_COOKIE_ACK\n");
+ i = 7;
+ break;
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
+ i = 8;
+ break;
+ default:
+ /* Other chunks like DATA, SACK, HEARTBEAT and
+ its ACK do not cause a change in state */
+ pr_debug("Unknown chunk type, Will stay in %s\n",
+ sctp_conntrack_names[cur_state]);
+ return cur_state;
}
- DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
- dir, sctp_conntrack_names[cur_state], chunk_type,
- sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+ pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
+ dir, sctp_conntrack_names[cur_state], chunk_type,
+ sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
return sctp_conntracks[dir][i][cur_state];
}
-/* Returns verdict for packet, or -1 for invalid. */
-static int sctp_packet(struct nf_conn *conntrack,
+static unsigned int *sctp_get_timeouts(struct net *net)
+{
+ return sctp_pernet(net)->timeouts;
+}
+
+/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
+static int sctp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- int pf,
- unsigned int hooknum)
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
- enum sctp_conntrack newconntrack, oldsctpstate;
- sctp_sctphdr_t _sctph, *sh;
- sctp_chunkhdr_t _sch, *sch;
+ enum sctp_conntrack new_state, old_state;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ const struct sctphdr *sh;
+ struct sctphdr _sctph;
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
u_int32_t offset, count;
- char map[256 / sizeof (char)] = {0};
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
+ unsigned long map[256 / sizeof(unsigned long)] = { 0 };
sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
if (sh == NULL)
- return -1;
+ goto out;
- if (do_basic_checks(conntrack, skb, dataoff, map) != 0)
- return -1;
+ if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ goto out;
/* Check the verification tag (Sec 8.5) */
- if (!test_bit(SCTP_CID_INIT, (void *)map)
- && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
- && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
- && !test_bit(SCTP_CID_ABORT, (void *)map)
- && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
- && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
- DEBUGP("Verification tag check failed\n");
- return -1;
+ if (!test_bit(SCTP_CID_INIT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+ !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+ !test_bit(SCTP_CID_ABORT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out;
}
- oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
+ old_state = new_state = SCTP_CONNTRACK_NONE;
+ spin_lock_bh(&ct->lock);
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
- write_lock_bh(&sctp_lock);
-
/* Special cases of Verification tag check (Sec 8.5.1) */
if (sch->type == SCTP_CID_INIT) {
/* Sec 8.5.1 (A) */
- if (sh->vtag != 0) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
+ if (sh->vtag != 0)
+ goto out_unlock;
} else if (sch->type == SCTP_CID_ABORT) {
/* Sec 8.5.1 (B) */
- if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
- && !(sh->vtag == conntrack->proto.sctp.vtag
- [1 - CTINFO2DIR(ctinfo)])) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
+ if (sh->vtag != ct->proto.sctp.vtag[dir] &&
+ sh->vtag != ct->proto.sctp.vtag[!dir])
+ goto out_unlock;
} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
/* Sec 8.5.1 (C) */
- if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
- && !(sh->vtag == conntrack->proto.sctp.vtag
- [1 - CTINFO2DIR(ctinfo)]
- && (sch->flags & 1))) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
+ if (sh->vtag != ct->proto.sctp.vtag[dir] &&
+ (sh->vtag != ct->proto.sctp.vtag[!dir] ||
+ !(sch->flags & SCTP_CHUNK_FLAG_T))) /* peer's tag is valid only with the T bit set */
+ goto out_unlock; /* tag is neither our own nor (peer's tag + T bit): invalid */
} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
/* Sec 8.5.1 (D) */
- if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
+ if (sh->vtag != ct->proto.sctp.vtag[dir])
+ goto out_unlock;
}
- oldsctpstate = conntrack->proto.sctp.state;
- newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
+ old_state = ct->proto.sctp.state;
+ new_state = sctp_new_state(dir, old_state, sch->type);
/* Invalid */
- if (newconntrack == SCTP_CONNTRACK_MAX) {
- DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
- CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
- write_unlock_bh(&sctp_lock);
- return -1;
+ if (new_state == SCTP_CONNTRACK_MAX) {
+ pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
+ "conntrack=%u\n",
+ dir, sch->type, old_state);
+ goto out_unlock;
}
/* If it is an INIT or an INIT ACK note down the vtag */
- if (sch->type == SCTP_CID_INIT
- || sch->type == SCTP_CID_INIT_ACK) {
+ if (sch->type == SCTP_CID_INIT ||
+ sch->type == SCTP_CID_INIT_ACK) {
sctp_inithdr_t _inithdr, *ih;
ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
- sizeof(_inithdr), &_inithdr);
- if (ih == NULL) {
- write_unlock_bh(&sctp_lock);
- return -1;
- }
- DEBUGP("Setting vtag %x for dir %d\n",
- ih->init_tag, !CTINFO2DIR(ctinfo));
- conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL)
+ goto out_unlock;
+ pr_debug("Setting vtag %x for dir %d\n",
+ ih->init_tag, !dir);
+ ct->proto.sctp.vtag[!dir] = ih->init_tag;
}
- conntrack->proto.sctp.state = newconntrack;
- if (oldsctpstate != newconntrack)
- nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
- write_unlock_bh(&sctp_lock);
+ ct->proto.sctp.state = new_state;
+ if (old_state != new_state)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
}
+ spin_unlock_bh(&ct->lock);
- nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
- if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
- && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
- && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
- DEBUGP("Setting assured bit\n");
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
- nf_conntrack_event_cache(IPCT_STATUS, skb);
+ if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
+ dir == IP_CT_DIR_REPLY &&
+ new_state == SCTP_CONNTRACK_ESTABLISHED) {
+ pr_debug("Setting assured bit\n");
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
}
return NF_ACCEPT;
+
+out_unlock:
+ spin_unlock_bh(&ct->lock);
+out:
+ return -NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
-static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
- unsigned int dataoff)
+static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
{
- enum sctp_conntrack newconntrack;
- sctp_sctphdr_t _sctph, *sh;
- sctp_chunkhdr_t _sch, *sch;
+ enum sctp_conntrack new_state;
+ const struct sctphdr *sh;
+ struct sctphdr _sctph;
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
u_int32_t offset, count;
- char map[256 / sizeof (char)] = {0};
-
- DEBUGP(__FUNCTION__);
- DEBUGP("\n");
+ unsigned long map[256 / sizeof(unsigned long)] = { 0 };
sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
if (sh == NULL)
- return 0;
+ return false;
- if (do_basic_checks(conntrack, skb, dataoff, map) != 0)
- return 0;
+ if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ return false;
/* If an OOTB packet has any of these chunks discard (Sec 8.4) */
- if ((test_bit (SCTP_CID_ABORT, (void *)map))
- || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
- || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
- return 0;
- }
+ if (test_bit(SCTP_CID_ABORT, map) ||
+ test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
+ test_bit(SCTP_CID_COOKIE_ACK, map))
+ return false;
- newconntrack = SCTP_CONNTRACK_MAX;
+ memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
+ new_state = SCTP_CONNTRACK_MAX;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Don't need lock here: this conntrack not in circulation yet */
- newconntrack = new_state(IP_CT_DIR_ORIGINAL,
- SCTP_CONNTRACK_NONE, sch->type);
+ new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
+ SCTP_CONNTRACK_NONE, sch->type);
/* Invalid: delete conntrack */
- if (newconntrack == SCTP_CONNTRACK_MAX) {
- DEBUGP("nf_conntrack_sctp: invalid new deleting.\n");
- return 0;
+ if (new_state == SCTP_CONNTRACK_NONE ||
+ new_state == SCTP_CONNTRACK_MAX) {
+ pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
+ return false;
}
/* Copy the vtag into the state info */
@@ -475,196 +453,476 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
sctp_inithdr_t _inithdr, *ih;
ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
- sizeof(_inithdr), &_inithdr);
+ sizeof(_inithdr), &_inithdr);
if (ih == NULL)
- return 0;
+ return false;
- DEBUGP("Setting vtag %x for new conn\n",
- ih->init_tag);
+ pr_debug("Setting vtag %x for new conn\n",
+ ih->init_tag);
- conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
ih->init_tag;
} else {
/* Sec 8.5.1 (A) */
- return 0;
+ return false;
}
}
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
else {
- DEBUGP("Setting vtag %x for new conn OOTB\n",
- sh->vtag);
- conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+ pr_debug("Setting vtag %x for new conn OOTB\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
}
- conntrack->proto.sctp.state = newconntrack;
+ ct->proto.sctp.state = new_state;
}
- return 1;
+ return true;
}
-struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = {
- .l3proto = PF_INET,
- .proto = IPPROTO_SCTP,
- .name = "sctp",
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
- .print_tuple = sctp_print_tuple,
- .print_conntrack = sctp_print_conntrack,
- .packet = sctp_packet,
- .new = sctp_new,
- .destroy = NULL,
- .me = THIS_MODULE
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+ struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ spin_lock_bh(&ct->lock);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
+ nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
+ nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ct->lock);
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ spin_unlock_bh(&ct->lock);
+ return -1;
+}
+
+static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = {
+ [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 },
+ [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 },
};
-struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = {
- .l3proto = PF_INET6,
- .proto = IPPROTO_SCTP,
- .name = "sctp",
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
- .print_tuple = sctp_print_tuple,
- .print_conntrack = sctp_print_conntrack,
- .packet = sctp_packet,
- .new = sctp_new,
- .destroy = NULL,
- .me = THIS_MODULE
+static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
+{
+ struct nlattr *attr = cda[CTA_PROTOINFO_SCTP];
+ struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1];
+ int err;
+
+ /* updates may not contain the internal protocol info, skip parsing */
+ if (!attr)
+ return 0;
+
+ err = nla_parse_nested(tb,
+ CTA_PROTOINFO_SCTP_MAX,
+ attr,
+ sctp_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_PROTOINFO_SCTP_STATE] ||
+ !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] ||
+ !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY])
+ return -EINVAL;
+
+ spin_lock_bh(&ct->lock);
+ ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]);
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] =
+ nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]);
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+ nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]);
+ spin_unlock_bh(&ct->lock);
+
+ return 0;
+}
+
+static int sctp_nlattr_size(void)
+{
+ return nla_total_size(0) /* CTA_PROTOINFO_SCTP */
+ + nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct sctp_net *sn = sctp_pernet(net);
+ int i;
+
+ /* set default SCTP timeouts. */
+ for (i=0; i<SCTP_CONNTRACK_MAX; i++)
+ timeouts[i] = sn->timeouts[i];
+
+ /* there's a 1:1 mapping between attributes and protocol states. */
+ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
+ if (tb[i]) {
+ timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
+ }
+ }
+ return 0;
+}
+
+static int
+sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+ int i;
+
+ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
+ if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
+ goto nla_put_failure;
+ }
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
+ [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
-static ctl_table nf_ct_sysctl_table[] = {
+static struct ctl_table sctp_sysctl_table[] = {
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
.procname = "nf_conntrack_sctp_timeout_closed",
- .data = &nf_ct_sctp_timeout_closed,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
.procname = "nf_conntrack_sctp_timeout_cookie_wait",
- .data = &nf_ct_sctp_timeout_cookie_wait,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
.procname = "nf_conntrack_sctp_timeout_cookie_echoed",
- .data = &nf_ct_sctp_timeout_cookie_echoed,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
.procname = "nf_conntrack_sctp_timeout_established",
- .data = &nf_ct_sctp_timeout_established,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
.procname = "nf_conntrack_sctp_timeout_shutdown_sent",
- .data = &nf_ct_sctp_timeout_shutdown_sent,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
.procname = "nf_conntrack_sctp_timeout_shutdown_recd",
- .data = &nf_ct_sctp_timeout_shutdown_recd,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
.procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
- .data = &nf_ct_sctp_timeout_shutdown_ack_sent,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
- { .ctl_name = 0 }
+ { }
};
-static ctl_table nf_ct_netfilter_table[] = {
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table sctp_compat_sysctl_table[] = {
{
- .ctl_name = NET_NETFILTER,
- .procname = "netfilter",
- .mode = 0555,
- .child = nf_ct_sysctl_table,
+ .procname = "ip_conntrack_sctp_timeout_closed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_cookie_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
},
- { .ctl_name = 0 }
-};
-
-static ctl_table nf_ct_net_table[] = {
{
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = nf_ct_netfilter_table,
+ .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
},
- { .ctl_name = 0 }
+ {
+ .procname = "ip_conntrack_sctp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif
-static struct ctl_table_header *nf_ct_sysctl_header;
+static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct sctp_net *sn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(sctp_sysctl_table,
+ sizeof(sctp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
+ pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
+ pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
+ pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
+ pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
+ pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
+ pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
#endif
+ return 0;
+}
-int __init init(void)
+static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct sctp_net *sn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
+ sizeof(sctp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
+ pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
+ pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
+ pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
+ pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
+ pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
+ pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+#endif
+#endif
+ return 0;
+}
+
+static int sctp_init_net(struct net *net, u_int16_t proto)
{
int ret;
+ struct sctp_net *sn = sctp_pernet(net);
+ struct nf_proto_net *pn = &sn->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
+ sn->timeouts[i] = sctp_timeouts[i];
+ }
+
+ if (proto == AF_INET) {
+ ret = sctp_kmemdup_compat_sysctl_table(pn, sn);
+ if (ret < 0)
+ return ret;
- ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4);
- if (ret) {
- printk("nf_conntrack_proto_sctp4: protocol register failed\n");
+ ret = sctp_kmemdup_sysctl_table(pn, sn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = sctp_kmemdup_sysctl_table(pn, sn);
+
+ return ret;
+}
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .get_timeouts = sctp_get_timeouts,
+ .new = sctp_new,
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = sctp_to_nlattr,
+ .nlattr_size = sctp_nlattr_size,
+ .from_nlattr = nlattr_to_sctp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
+ .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
+ .nla_policy = sctp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &sctp_net_id,
+ .init_net = sctp_init_net,
+};
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .get_timeouts = sctp_get_timeouts,
+ .new = sctp_new,
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = sctp_to_nlattr,
+ .nlattr_size = sctp_nlattr_size,
+ .from_nlattr = nlattr_to_sctp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
+ .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
+ .nla_policy = sctp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif
+ .net_id = &sctp_net_id,
+ .init_net = sctp_init_net,
+};
+
+static int sctp_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_sctp4: pernet registration failed.\n");
goto out;
}
- ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6);
- if (ret) {
- printk("nf_conntrack_proto_sctp6: protocol register failed\n");
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_sctp6: pernet registration failed.\n");
goto cleanup_sctp4;
}
+ return 0;
-#ifdef CONFIG_SYSCTL
- nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
- if (nf_ct_sysctl_header == NULL) {
- printk("nf_conntrack_proto_sctp: can't register to sysctl.\n");
- goto cleanup;
- }
-#endif
-
+cleanup_sctp4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
+out:
return ret;
+}
-#ifdef CONFIG_SYSCTL
- cleanup:
- nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6);
-#endif
- cleanup_sctp4:
- nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4);
- out:
- DEBUGP("SCTP conntrack module loading %s\n",
- ret ? "failed": "succeeded");
+static void sctp_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
+}
+
+static struct pernet_operations sctp_net_ops = {
+ .init = sctp_net_init,
+ .exit = sctp_net_exit,
+ .id = &sctp_net_id,
+ .size = sizeof(struct sctp_net),
+};
+
+static int __init nf_conntrack_proto_sctp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&sctp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
+ if (ret < 0)
+ goto out_sctp4;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6);
+ if (ret < 0)
+ goto out_sctp6;
+
+ return 0;
+out_sctp6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+out_sctp4:
+ unregister_pernet_subsys(&sctp_net_ops);
+out_pernet:
return ret;
}
-void __exit fini(void)
+static void __exit nf_conntrack_proto_sctp_fini(void)
{
- nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6);
- nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4);
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(nf_ct_sysctl_header);
-#endif
- DEBUGP("SCTP conntrack module unloaded\n");
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+ unregister_pernet_subsys(&sctp_net_ops);
}
-module_init(init);
-module_exit(fini);
+module_init(nf_conntrack_proto_sctp_init);
+module_exit(nf_conntrack_proto_sctp_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kiran Kumar Immidi");
MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
+MODULE_ALIAS("ip_conntrack_proto_sctp");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 6492ed66fb3..44d1ea32570 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,34 +1,15 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
- * - Real stateful connection tracking
- * - Modified state transitions table
- * - Window scaling support added
- * - SACK support added
- *
- * Willy Tarreau:
- * - State table bugfixes
- * - More robust state changes
- * - Tuning timer parameters
- *
- * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - genelized Layer 3 protocol part.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c
- *
- * version 2.2
*/
-#include <linux/config.h>
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/tcp.h>
@@ -36,6 +17,7 @@
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
+#include <asm/unaligned.h>
#include <net/tcp.h>
@@ -43,39 +25,32 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
-
-#if 0
-#define DEBUGP printk
-#define DEBUGP_VARS
-#else
-#define DEBUGP(format, args...)
-#endif
-
-/* Protects conntrack->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
-/* "Be conservative in what you do,
- be liberal in what you accept from others."
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+
+/* "Be conservative in what you do,
+ be liberal in what you accept from others."
If it's non-zero, we mark only out of window RST segments as INVALID. */
-int nf_ct_tcp_be_liberal = 0;
+static int nf_ct_tcp_be_liberal __read_mostly = 0;
-/* When connection is picked up from the middle, how many packets are required
- to pass in each direction when we assume we are in sync - if any side uses
- window scaling, we lost the game.
- If it is set to zero, we disable picking up already established
+/* If it is set to zero, we disable picking up already established
connections. */
-int nf_ct_tcp_loose = 3;
+static int nf_ct_tcp_loose __read_mostly = 1;
-/* Max number of the retransmitted packets without receiving an (acceptable)
- ACK from the destination. If this number is reached, a shorter timer
+/* Max number of the retransmitted packets without receiving an (acceptable)
+ ACK from the destination. If this number is reached, a shorter timer
will be started. */
-int nf_ct_tcp_max_retrans = 3;
+static int nf_ct_tcp_max_retrans __read_mostly = 3;
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR */
-static const char *tcp_conntrack_names[] = {
+static const char *const tcp_conntrack_names[] = {
"NONE",
"SYN_SENT",
"SYN_RECV",
@@ -85,41 +60,31 @@ static const char *tcp_conntrack_names[] = {
"LAST_ACK",
"TIME_WAIT",
"CLOSE",
- "LISTEN"
+ "SYN_SENT2",
};
-
+
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS
-unsigned int nf_ct_tcp_timeout_syn_sent = 2 MINS;
-unsigned int nf_ct_tcp_timeout_syn_recv = 60 SECS;
-unsigned int nf_ct_tcp_timeout_established = 5 DAYS;
-unsigned int nf_ct_tcp_timeout_fin_wait = 2 MINS;
-unsigned int nf_ct_tcp_timeout_close_wait = 60 SECS;
-unsigned int nf_ct_tcp_timeout_last_ack = 30 SECS;
-unsigned int nf_ct_tcp_timeout_time_wait = 2 MINS;
-unsigned int nf_ct_tcp_timeout_close = 10 SECS;
-
+static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {
+ [TCP_CONNTRACK_SYN_SENT] = 2 MINS,
+ [TCP_CONNTRACK_SYN_RECV] = 60 SECS,
+ [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS,
+ [TCP_CONNTRACK_FIN_WAIT] = 2 MINS,
+ [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS,
+ [TCP_CONNTRACK_LAST_ACK] = 30 SECS,
+ [TCP_CONNTRACK_TIME_WAIT] = 2 MINS,
+ [TCP_CONNTRACK_CLOSE] = 10 SECS,
+ [TCP_CONNTRACK_SYN_SENT2] = 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
- Linux uses 15 packets as limit, which corresponds
+ Linux uses 15 packets as limit, which corresponds
to ~13-30min depending on RTO. */
-unsigned int nf_ct_tcp_timeout_max_retrans = 5 MINS;
-
-static unsigned int * tcp_timeouts[]
-= { NULL, /* TCP_CONNTRACK_NONE */
- &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
- &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
- &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
- &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
- &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
- &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
- &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
- &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
- NULL, /* TCP_CONNTRACK_LISTEN */
- };
-
+ [TCP_CONNTRACK_RETRANS] = 5 MINS,
+ [TCP_CONNTRACK_UNACK] = 5 MINS,
+};
+
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
@@ -129,7 +94,7 @@ static unsigned int * tcp_timeouts[]
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
-#define sLI TCP_CONNTRACK_LISTEN
+#define sS2 TCP_CONNTRACK_SYN_SENT2
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE
@@ -142,13 +107,13 @@ enum tcp_bit_set {
TCP_RST_SET,
TCP_NONE_SET,
};
-
+
/*
* The TCP state transition table needs a few words...
*
* We are the man in the middle. All the packets go through us
* but might get lost in transit to the destination.
- * It is assumed that the destinations can't receive segments
+ * It is assumed that the destinations can't receive segments
* we haven't seen.
*
* The checked segment is in window, but our windows are *not*
@@ -158,37 +123,36 @@ enum tcp_bit_set {
* The meaning of the states are:
*
* NONE: initial state
- * SYN_SENT: SYN-only packet seen
+ * SYN_SENT: SYN-only packet seen
+ * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open
* SYN_RECV: SYN-ACK packet seen
* ESTABLISHED: ACK packet seen
* FIN_WAIT: FIN packet seen
- * CLOSE_WAIT: ACK seen (after FIN)
+ * CLOSE_WAIT: ACK seen (after FIN)
* LAST_ACK: FIN seen (after FIN)
* TIME_WAIT: last ACK seen
- * CLOSE: closed connection
- *
- * LISTEN state is not used.
+ * CLOSE: closed connection (RST)
*
* Packets marked as IGNORED (sIG):
- * if they may be either invalid or valid
- * and the receiver may send back a connection
+ * if they may be either invalid or valid
+ * and the receiver may send back a connection
* closing RST or a SYN/ACK.
*
* Packets marked as INVALID (sIV):
- * if they are invalid
- * or we do not support the request (simultaneous open)
+ * if we regard them as truly invalid packets
*/
-static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
+static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
/*
* sNO -> sSS Initialize a new connection
* sSS -> sSS Retransmitted SYN
- * sSR -> sIG Late retransmitted SYN?
+ * sS2 -> sS2 Late retransmitted SYN
+ * sSR -> sIG
* sES -> sIG Error: SYNs in window outside the SYN_SENT state
- * are errors. Receiver will reply with RST
+ * are errors. Receiver will reply with RST
* and close the connection.
* Or we are not in sync and hold a dead connection.
* sFW -> sIG
@@ -197,37 +161,43 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sTW -> sSS Reopened connection (RFC 1122).
* sCL -> sSS
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
- * A SYN/ACK from the client is always invalid:
- * - either it tries to set up a simultaneous open, which is
- * not supported;
- * - or the firewall has just been inserted between the two hosts
- * during the session set-up. The SYN will be retransmitted
- * by the true client (or it'll time out).
+ * sNO -> sIV Too late and no reason to do anything
+ * sSS -> sIV Client can't send SYN and then SYN/ACK
+ * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open
+ * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open
+ * sES -> sIV Invalid SYN/ACK packets sent by the client
+ * sFW -> sIV
+ * sCW -> sIV
+ * sLA -> sIV
+ * sTW -> sIV
+ * sCL -> sIV
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
* sNO -> sIV Too late and no reason to do anything...
* sSS -> sIV Client migth not send FIN in this state:
* we enforce waiting for a SYN/ACK reply first.
+ * sS2 -> sIV
* sSR -> sFW Close started.
* sES -> sFW
* sFW -> sLA FIN seen in both directions, waiting for
- * the last ACK.
+ * the last ACK.
* Migth be a retransmitted FIN as well...
* sCW -> sLA
* sLA -> sLA Retransmitted FIN. Remain in the same state.
* sTW -> sTW
* sCL -> sCL
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
* sNO -> sES Assumed.
* sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
+ * sS2 -> sIV
* sSR -> sES Established state is reached.
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
@@ -236,30 +206,32 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sTW -> sTW Retransmitted last ACK. Remain in the same state.
* sCL -> sCL
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
},
{
/* REPLY */
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 },
/*
* sNO -> sIV Never reached.
- * sSS -> sIV Simultaneous open, not supported
- * sSR -> sIV Simultaneous open, not supported.
- * sES -> sIV Server may not initiate a connection.
+ * sSS -> sS2 Simultaneous open
+ * sS2 -> sS2 Retransmitted simultaneous SYN
+ * sSR -> sIV Invalid SYN packets sent by the server
+ * sES -> sIV
* sFW -> sIV
* sCW -> sIV
* sLA -> sIV
* sTW -> sIV Reopened connection, but server may not do it.
* sCL -> sIV
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
* sSS -> sSR Standard open.
- * sSR -> sSR Retransmitted SYN/ACK.
+ * sS2 -> sSR Simultaneous open
+ * sSR -> sIG Retransmitted SYN/ACK, ignore it.
* sES -> sIG Late retransmitted SYN/ACK?
* sFW -> sIG Might be SYN/ACK answering ignored SYN
* sCW -> sIG
@@ -267,10 +239,11 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sTW -> sIG
* sCL -> sIG
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
* sSS -> sIV Server might not send FIN in this state.
+ * sS2 -> sIV
* sSR -> sFW Close started.
* sES -> sFW
* sFW -> sLA FIN seen in both directions.
@@ -279,10 +252,11 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sTW -> sTW
* sCL -> sCL
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
/*
* sSS -> sIG Might be a half-open connection.
+ * sS2 -> sIG
* sSR -> sSR Might answer late resent SYN.
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
@@ -291,35 +265,40 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sTW -> sTW Retransmitted last ACK.
* sCL -> sCL
*/
-/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
-/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
- }
+ }
};
-static int tcp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+static inline struct nf_tcp_net *tcp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.tcp;
+}
+
+static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
{
- struct tcphdr _hdr, *hp;
+ const struct tcphdr *hp;
+ struct tcphdr _hdr;
/* Actually only need first 8 bytes. */
hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
if (hp == NULL)
- return 0;
+ return false;
tuple->src.u.tcp.port = hp->source;
tuple->dst.u.tcp.port = hp->dest;
- return 1;
+ return true;
}
-static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
tuple->src.u.tcp.port = orig->dst.u.tcp.port;
tuple->dst.u.tcp.port = orig->src.u.tcp.port;
- return 1;
+ return true;
}
/* Print out the per-protocol part of the tuple. */
@@ -332,14 +311,13 @@ static int tcp_print_tuple(struct seq_file *s,
}
/* Print out the private part of the conntrack. */
-static int tcp_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
+static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
enum tcp_conntrack state;
- read_lock_bh(&tcp_lock);
- state = conntrack->proto.tcp.state;
- read_unlock_bh(&tcp_lock);
+ spin_lock_bh(&ct->lock);
+ state = ct->proto.tcp.state;
+ spin_unlock_bh(&ct->lock);
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
@@ -355,59 +333,60 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph)
/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
in IP Filter' by Guido van Rooij.
-
- http://www.nluug.nl/events/sane2000/papers.html
- http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
-
+
+ http://www.sane.nl/events/sane2000/papers.html
+ http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
+
The boundaries and the conditions are changed according to RFC793:
the packet must intersect the window (i.e. segments may be
after the right or before the left edge) and thus receivers may ACK
segments after the right edge of the window.
- td_maxend = max(sack + max(win,1)) seen in reply packets
+ td_maxend = max(sack + max(win,1)) seen in reply packets
td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
td_maxwin += seq + len - sender.td_maxend
if seq + len > sender.td_maxend
td_end = max(seq + len) seen in sent packets
-
+
I. Upper bound for valid data: seq <= sender.td_maxend
II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
- III. Upper bound for valid ack: sack <= receiver.td_end
- IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
+ III. Upper bound for valid (s)ack: sack <= receiver.td_end
+ IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW
- where sack is the highest right edge of sack block found in the packet.
+ where sack is the highest right edge of sack block found in the packet
+ or ack in the case of packet without SACK option.
- The upper bound limit for a valid ack is not ignored -
- we doesn't have to deal with fragments.
+ The upper bound limit for a valid (s)ack is not ignored -
+ we don't have to deal with fragments.
*/
static inline __u32 segment_seq_plus_len(__u32 seq,
size_t len,
unsigned int dataoff,
- struct tcphdr *tcph)
+ const struct tcphdr *tcph)
{
/* XXX Should I use payload length field in IP/IPv6 header ?
* - YK */
return (seq + len - dataoff - tcph->doff*4
+ (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
}
-
+
/* Fixme: what about big packets? */
#define MAXACKWINCONST 66000
#define MAXACKWINDOW(sender) \
((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
: MAXACKWINCONST)
-
+
/*
* Simplified tcp_parse_options routine from tcp_input.c
*/
static void tcp_options(const struct sk_buff *skb,
unsigned int dataoff,
- struct tcphdr *tcph,
+ const struct tcphdr *tcph,
struct ip_ct_tcp_state *state)
{
unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
- unsigned char *ptr;
+ const unsigned char *ptr;
int length = (tcph->doff*4) - sizeof(struct tcphdr);
if (!length)
@@ -417,7 +396,7 @@ static void tcp_options(const struct sk_buff *skb,
length, buff);
BUG_ON(ptr == NULL);
- state->td_scale =
+ state->td_scale =
state->flags = 0;
while (length > 0) {
@@ -435,9 +414,9 @@ static void tcp_options(const struct sk_buff *skb,
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
- break; /* don't parse partial options */
+ return; /* don't parse partial options */
- if (opcode == TCPOPT_SACK_PERM
+ if (opcode == TCPOPT_SACK_PERM
&& opsize == TCPOLEN_SACK_PERM)
state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
else if (opcode == TCPOPT_WINDOW
@@ -458,10 +437,10 @@ static void tcp_options(const struct sk_buff *skb,
}
static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
- struct tcphdr *tcph, __u32 *sack)
+ const struct tcphdr *tcph, __u32 *sack)
{
- unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
- unsigned char *ptr;
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ const unsigned char *ptr;
int length = (tcph->doff*4) - sizeof(struct tcphdr);
__u32 tmp;
@@ -473,12 +452,11 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
BUG_ON(ptr == NULL);
/* Fast path for timestamp-only option */
- if (length == TCPOLEN_TSTAMP_ALIGNED*4
- && *(__u32 *)ptr ==
- __constant_ntohl((TCPOPT_NOP << 24)
- | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8)
- | TCPOLEN_TIMESTAMP))
+ if (length == TCPOLEN_TSTAMP_ALIGNED
+ && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
+ | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
return;
while (length > 0) {
@@ -496,19 +474,17 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
- break; /* don't parse partial options */
-
- if (opcode == TCPOPT_SACK
- && opsize >= (TCPOLEN_SACK_BASE
- + TCPOLEN_SACK_PERBLOCK)
- && !((opsize - TCPOLEN_SACK_BASE)
- % TCPOLEN_SACK_PERBLOCK)) {
- for (i = 0;
- i < (opsize - TCPOLEN_SACK_BASE);
- i += TCPOLEN_SACK_PERBLOCK) {
- memcpy(&tmp, (__u32 *)(ptr + i) + 1,
- sizeof(__u32));
- tmp = ntohl(tmp);
+ return; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK
+ && opsize >= (TCPOLEN_SACK_BASE
+ + TCPOLEN_SACK_PERBLOCK)
+ && !((opsize - TCPOLEN_SACK_BASE)
+ % TCPOLEN_SACK_PERBLOCK)) {
+ for (i = 0;
+ i < (opsize - TCPOLEN_SACK_BASE);
+ i += TCPOLEN_SACK_PERBLOCK) {
+ tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
if (after(tmp, *sack))
*sack = tmp;
@@ -521,18 +497,23 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
}
}
-static int tcp_in_window(struct ip_ct_tcp *state,
- enum ip_conntrack_dir dir,
- unsigned int index,
- const struct sk_buff *skb,
- unsigned int dataoff,
- struct tcphdr *tcph,
- int pf)
+static bool tcp_in_window(const struct nf_conn *ct,
+ struct ip_ct_tcp *state,
+ enum ip_conntrack_dir dir,
+ unsigned int index,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ u_int8_t pf)
{
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+ const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
__u32 seq, ack, sack, end, win, swin;
- int res;
+ s32 receiver_offset;
+ bool res, in_recv_win;
/*
* Get the required data from the packet.
@@ -545,50 +526,66 @@ static int tcp_in_window(struct ip_ct_tcp *state,
if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
tcp_sack(skb, dataoff, tcph, &sack);
- DEBUGP("tcp_in_window: START\n");
- DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
- "seq=%u ack=%u sack=%u win=%u end=%u\n",
- NIPQUAD(iph->saddr), ntohs(tcph->source),
- NIPQUAD(iph->daddr), ntohs(tcph->dest),
- seq, ack, sack, win, end);
- DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
- if (sender->td_end == 0) {
+ /* Take into account NAT sequence number mangling */
+ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
+ ack -= receiver_offset;
+ sack -= receiver_offset;
+
+ pr_debug("tcp_in_window: START\n");
+ pr_debug("tcp_in_window: ");
+ nf_ct_dump_tuple(tuple);
+ pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+ seq, ack, receiver_offset, sack, receiver_offset, win, end);
+ pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ if (sender->td_maxwin == 0) {
/*
* Initialize sender data.
*/
- if (tcph->syn && tcph->ack) {
+ if (tcph->syn) {
/*
- * Outgoing SYN-ACK in reply to a SYN.
+ * SYN-ACK in reply to a SYN
+ * or SYN from reply direction in simultaneous open.
*/
- sender->td_end =
+ sender->td_end =
sender->td_maxend = end;
sender->td_maxwin = (win == 0 ? 1 : win);
tcp_options(skb, dataoff, tcph, sender);
- /*
+ /*
* RFC 1323:
* Both sides must send the Window Scale option
* to enable window scaling in either direction.
*/
if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
&& receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
- sender->td_scale =
+ sender->td_scale =
receiver->td_scale = 0;
+ if (!tcph->ack)
+ /* Simultaneous open */
+ return true;
} else {
/*
* We are in the middle of a connection,
* its history is lost for us.
* Let's try to use the data from the packet.
- */
+ */
sender->td_end = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
+ swin = win << sender->td_scale;
+ sender->td_maxwin = (swin == 0 ? 1 : swin);
sender->td_maxend = end + sender->td_maxwin;
+ /*
+ * We haven't seen traffic in the other direction yet
+ * but we have to tweak window tracking to pass III
+ * and IV until that happens.
+ */
+ if (receiver->td_maxwin == 0)
+ receiver->td_end = receiver->td_maxend = sack;
}
} else if (((state->state == TCP_CONNTRACK_SYN_SENT
&& dir == IP_CT_DIR_ORIGINAL)
@@ -597,7 +594,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
&& after(end, sender->td_end)) {
/*
* RFC 793: "if a TCP is reinitialized ... then it need
- * not wait at all; it must only be sure to use sequence
+ * not wait at all; it must only be sure to use sequence
* numbers larger than those recently used."
*/
sender->td_end =
@@ -612,8 +609,8 @@ static int tcp_in_window(struct ip_ct_tcp *state,
* If there is no ACK, just pretend it was set and OK.
*/
ack = sack = receiver->td_end;
- } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
- (TCP_FLAG_ACK|TCP_FLAG_RST))
+ } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
+ (TCP_FLAG_ACK|TCP_FLAG_RST))
&& (ack == 0)) {
/*
* Broken TCP stacks, that set ACK in RST packets as well
@@ -622,42 +619,38 @@ static int tcp_in_window(struct ip_ct_tcp *state,
ack = sack = receiver->td_end;
}
- if (seq == end
- && (!tcph->rst
- || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+ if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
/*
- * Packets contains no data: we assume it is valid
- * and check the ack value only.
- * However RST segments are always validated by their
- * SEQ number, except when seq == 0 (reset sent answering
- * SYN.
+ * RST sent answering SYN.
*/
seq = end = sender->td_end;
- DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
- "seq=%u ack=%u sack =%u win=%u end=%u\n",
- NIPQUAD(iph->saddr), ntohs(tcph->source),
- NIPQUAD(iph->daddr), ntohs(tcph->dest),
- seq, ack, sack, win, end);
- DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
- DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
- before(seq, sender->td_maxend + 1),
- after(end, sender->td_end - receiver->td_maxwin - 1),
- before(sack, receiver->td_end + 1),
- after(ack, receiver->td_end - MAXACKWINDOW(sender)));
-
- if (sender->loose || receiver->loose ||
- (before(seq, sender->td_maxend + 1) &&
- after(end, sender->td_end - receiver->td_maxwin - 1) &&
- before(sack, receiver->td_end + 1) &&
- after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
- /*
+ pr_debug("tcp_in_window: ");
+ nf_ct_dump_tuple(tuple);
+ pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+ seq, ack, receiver_offset, sack, receiver_offset, win, end);
+ pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ /* Is the ending sequence in the receive window (if available)? */
+ in_recv_win = !receiver->td_maxwin ||
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
+ pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+ before(seq, sender->td_maxend + 1),
+ (in_recv_win ? 1 : 0),
+ before(sack, receiver->td_end + 1),
+ after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+
+ if (before(seq, sender->td_maxend + 1) &&
+ in_recv_win &&
+ before(sack, receiver->td_end + 1) &&
+ after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
+ /*
* Take into account window scaling (RFC 1323).
*/
if (!tcph->syn)
@@ -669,181 +662,140 @@ static int tcp_in_window(struct ip_ct_tcp *state,
swin = win + (sack - ack);
if (sender->td_maxwin < swin)
sender->td_maxwin = swin;
- if (after(end, sender->td_end))
+ if (after(end, sender->td_end)) {
sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ }
+ if (tcph->ack) {
+ if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+ sender->td_maxack = ack;
+ sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+ } else if (after(ack, sender->td_maxack))
+ sender->td_maxack = ack;
+ }
+
/*
* Update receiver data.
*/
- if (after(end, sender->td_maxend))
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
receiver->td_maxwin += end - sender->td_maxend;
if (after(sack + win, receiver->td_maxend - 1)) {
receiver->td_maxend = sack + win;
if (win == 0)
receiver->td_maxend++;
}
+ if (ack == receiver->td_end)
+ receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
- /*
+ /*
* Check retransmissions.
*/
if (index == TCP_ACK_SET) {
if (state->last_dir == dir
&& state->last_seq == seq
&& state->last_ack == ack
- && state->last_end == end)
+ && state->last_end == end
+ && state->last_win == win)
state->retrans++;
else {
state->last_dir = dir;
state->last_seq = seq;
state->last_ack = ack;
state->last_end = end;
+ state->last_win = win;
state->retrans = 0;
}
}
- /*
- * Close the window of disabled window tracking :-)
- */
- if (sender->loose)
- sender->loose--;
-
- res = 1;
+ res = true;
} else {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ res = false;
+ if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
+ tn->tcp_be_liberal)
+ res = true;
+ if (!res && LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
- after(end, sender->td_end - receiver->td_maxwin - 1) ?
+ in_recv_win ?
before(sack, receiver->td_end + 1) ?
- after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+ after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
: "ACK is under the lower bound (possible overly delayed ACK)"
: "ACK is over the upper bound (ACKed data not seen yet)"
: "SEQ is under the lower bound (already ACKed data retransmitted)"
: "SEQ is over the upper bound (over the window of the receiver)");
+ }
- res = nf_ct_tcp_be_liberal;
- }
-
- DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
- "receiver end=%u maxend=%u maxwin=%u\n",
- res, sender->td_end, sender->td_maxend, sender->td_maxwin,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+ pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
+ "receiver end=%u maxend=%u maxwin=%u\n",
+ res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
return res;
}
-#ifdef CONFIG_IP_NF_NAT_NEEDED
-/* Update sender->td_end after NAT successfully mangled the packet */
-/* Caller must linearize skb at tcp header. */
-void nf_conntrack_tcp_update(struct sk_buff *skb,
- unsigned int dataoff,
- struct nf_conn *conntrack,
- int dir)
-{
- struct tcphdr *tcph = (void *)skb->data + dataoff;
- __u32 end;
-#ifdef DEBUGP_VARS
- struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
- struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
-#endif
-
- end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
-
- write_lock_bh(&tcp_lock);
- /*
- * We have to worry for the ack in the reply packet only...
- */
- if (after(end, conntrack->proto.tcp.seen[dir].td_end))
- conntrack->proto.tcp.seen[dir].td_end = end;
- conntrack->proto.tcp.last_end = end;
- write_unlock_bh(&tcp_lock);
- DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-}
-
-#endif
-
-#define TH_FIN 0x01
-#define TH_SYN 0x02
-#define TH_RST 0x04
-#define TH_PUSH 0x08
-#define TH_ACK 0x10
-#define TH_URG 0x20
-#define TH_ECE 0x40
-#define TH_CWR 0x80
-
-/* table of valid flag combinations - ECE and CWR are always valid */
-static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
+/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
+static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
+ TCPHDR_URG) + 1] =
{
- [TH_SYN] = 1,
- [TH_SYN|TH_ACK] = 1,
- [TH_SYN|TH_PUSH] = 1,
- [TH_SYN|TH_ACK|TH_PUSH] = 1,
- [TH_RST] = 1,
- [TH_RST|TH_ACK] = 1,
- [TH_RST|TH_ACK|TH_PUSH] = 1,
- [TH_FIN|TH_ACK] = 1,
- [TH_ACK] = 1,
- [TH_ACK|TH_PUSH] = 1,
- [TH_ACK|TH_URG] = 1,
- [TH_ACK|TH_URG|TH_PUSH] = 1,
- [TH_FIN|TH_ACK|TH_PUSH] = 1,
- [TH_FIN|TH_ACK|TH_URG] = 1,
- [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
+ [TCPHDR_SYN] = 1,
+ [TCPHDR_SYN|TCPHDR_URG] = 1,
+ [TCPHDR_SYN|TCPHDR_ACK] = 1,
+ [TCPHDR_RST] = 1,
+ [TCPHDR_RST|TCPHDR_ACK] = 1,
+ [TCPHDR_FIN|TCPHDR_ACK] = 1,
+ [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1,
+ [TCPHDR_ACK] = 1,
+ [TCPHDR_ACK|TCPHDR_URG] = 1,
};
/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
-static int tcp_error(struct sk_buff *skb,
+static int tcp_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info *ctinfo,
- int pf,
- unsigned int hooknum,
- int(*csum)(const struct sk_buff *,unsigned int))
+ u_int8_t pf,
+ unsigned int hooknum)
{
- struct tcphdr _tcph, *th;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
unsigned int tcplen = skb->len - dataoff;
u_int8_t tcpflags;
/* Smaller that minimal TCP header? */
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
if (th == NULL) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: short packet ");
return -NF_ACCEPT;
- }
-
+ }
+
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
-
+
/* Checksum invalid? Ignore.
* We skip checking packets on the outgoing path
- * because the semantic of CHECKSUM_HW is different there
- * and moreover root might send raw packets.
+ * because the checksum is assumed to be correct.
*/
/* FIXME: Source route IP option packets --RR */
- if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
- (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))
- && skb->ip_summed != CHECKSUM_UNNECESSARY
- && csum(skb, dataoff)) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
/* Check TCP flags. */
- tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
+ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
if (!tcp_valid_flags[tcpflags]) {
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -851,145 +803,190 @@ static int tcp_error(struct sk_buff *skb,
return NF_ACCEPT;
}
-static int csum4(const struct sk_buff *skb, unsigned int dataoff)
+static unsigned int *tcp_get_timeouts(struct net *net)
{
- return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->len - dataoff, IPPROTO_TCP,
- skb->ip_summed == CHECKSUM_HW ? skb->csum
- : skb_checksum(skb, dataoff,
- skb->len - dataoff, 0));
-}
-
-static int csum6(const struct sk_buff *skb, unsigned int dataoff)
-{
- return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
- skb->len - dataoff, IPPROTO_TCP,
- skb->ip_summed == CHECKSUM_HW
- ? csum_sub(skb->csum,
- skb_checksum(skb, 0, dataoff, 0))
- : skb_checksum(skb, dataoff, skb->len - dataoff,
- 0));
-}
-
-static int tcp_error4(struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
- int pf,
- unsigned int hooknum)
-{
- return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4);
-}
-
-static int tcp_error6(struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
- int pf,
- unsigned int hooknum)
-{
- return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6);
+ return tcp_pernet(net)->timeouts;
}
/* Returns verdict for packet, or -1 for invalid. */
-static int tcp_packet(struct nf_conn *conntrack,
+static int tcp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- int pf,
- unsigned int hooknum)
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
enum ip_conntrack_dir dir;
- struct tcphdr *th, _tcph;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
unsigned long timeout;
unsigned int index;
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
- write_lock_bh(&tcp_lock);
- old_state = conntrack->proto.tcp.state;
+ spin_lock_bh(&ct->lock);
+ old_state = ct->proto.tcp.state;
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
new_state = tcp_conntracks[dir][index][old_state];
+ tuple = &ct->tuplehash[dir].tuple;
switch (new_state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ break;
+ /* RFC 1122: "When a connection is closed actively,
+ * it MUST linger in TIME-WAIT state for a time 2xMSL
+ * (Maximum Segment Lifetime). However, it MAY accept
+ * a new SYN from the remote TCP to reopen the connection
+ * directly from TIME-WAIT state, if..."
+ * We ignore the conditions because we are in the
+ * TIME-WAIT state anyway.
+ *
+ * Handle aborted connections: we and the server
+ * think there is an existing connection but the client
+ * aborts it and starts a new one.
+ */
+ if (((ct->proto.tcp.seen[dir].flags
+ | ct->proto.tcp.seen[!dir].flags)
+ & IP_CT_TCP_FLAG_CLOSE_INIT)
+ || (ct->proto.tcp.last_dir == dir
+ && ct->proto.tcp.last_index == TCP_RST_SET)) {
+ /* Attempt to reopen a closed/aborted connection.
+ * Delete this connection and look up again. */
+ spin_unlock_bh(&ct->lock);
+
+ /* Only repeat if we can actually remove the timer.
+ * Destruction may already be in progress in process
+ * context and we must give it a chance to terminate.
+ */
+ if (nf_ct_kill(ct))
+ return -NF_REPEAT;
+ return NF_DROP;
+ }
+ /* Fall through */
case TCP_CONNTRACK_IGNORE:
/* Ignored packets:
*
+ * Our connection entry may be out of sync, so ignore
+ * packets which may signal the real connection between
+ * the client and the server.
+ *
* a) SYN in ORIGINAL
* b) SYN/ACK in REPLY
- * c) ACK in reply direction after initial SYN in original.
+ * c) ACK in reply direction after initial SYN in original.
+ *
+ * If the ignored packet is invalid, the receiver will send
+ * a RST we'll catch below.
*/
if (index == TCP_SYNACK_SET
- && conntrack->proto.tcp.last_index == TCP_SYN_SET
- && conntrack->proto.tcp.last_dir != dir
- && ntohl(th->ack_seq) ==
- conntrack->proto.tcp.last_end) {
- /* This SYN/ACK acknowledges a SYN that we earlier
+ && ct->proto.tcp.last_index == TCP_SYN_SET
+ && ct->proto.tcp.last_dir != dir
+ && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
+ /* b) This SYN/ACK acknowledges a SYN that we earlier
* ignored as invalid. This means that the client and
* the server are both in sync, while the firewall is
- * not. We kill this session and block the SYN/ACK so
- * that the client cannot but retransmit its SYN and
- * thus initiate a clean new session.
+ * not. We get in sync from the previously annotated
+ * values.
*/
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_tcp: killing out of sync session ");
- if (del_timer(&conntrack->timeout))
- conntrack->timeout.function((unsigned long)
- conntrack);
- return -NF_DROP;
+ old_state = TCP_CONNTRACK_SYN_SENT;
+ new_state = TCP_CONNTRACK_SYN_RECV;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+ ct->proto.tcp.last_end;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+ ct->proto.tcp.last_end;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+ ct->proto.tcp.last_win == 0 ?
+ 1 : ct->proto.tcp.last_win;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+ ct->proto.tcp.last_wscale;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+ ct->proto.tcp.last_flags;
+ memset(&ct->proto.tcp.seen[dir], 0,
+ sizeof(struct ip_ct_tcp_state));
+ break;
}
- conntrack->proto.tcp.last_index = index;
- conntrack->proto.tcp.last_dir = dir;
- conntrack->proto.tcp.last_seq = ntohl(th->seq);
- conntrack->proto.tcp.last_end =
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+ ct->proto.tcp.last_seq = ntohl(th->seq);
+ ct->proto.tcp.last_end =
segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
-
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_tcp: invalid packed ignored ");
+ ct->proto.tcp.last_win = ntohs(th->window);
+
+ /* a) This is a SYN in ORIGINAL. The client and the server
+ * may be in sync but we are not. In that case, we annotate
+ * the TCP options and let the packet go through. If it is a
+ * valid SYN packet, the server will reply with a SYN/ACK, and
+ * then we'll get in sync. Otherwise, the server ignores it. */
+ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+ struct ip_ct_tcp_state seen = {};
+
+ ct->proto.tcp.last_flags =
+ ct->proto.tcp.last_wscale = 0;
+ tcp_options(skb, dataoff, th, &seen);
+ if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+ ct->proto.tcp.last_flags |=
+ IP_CT_TCP_FLAG_WINDOW_SCALE;
+ ct->proto.tcp.last_wscale = seen.td_scale;
+ }
+ if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+ ct->proto.tcp.last_flags |=
+ IP_CT_TCP_FLAG_SACK_PERM;
+ }
+ }
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid packet ignored in "
+ "state %s ", tcp_conntrack_names[old_state]);
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
+ /* Special case for SYN proxy: when the SYN to the server or
+ * the SYN/ACK from the server is lost, the client may transmit
+ * a keep-alive packet while in SYN_SENT state. This needs to
+ * be associated with the original conntrack entry in order to
+ * generate a new SYN with the correct sequence number.
+ */
+ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
+ index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
+ ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
+ ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
+ pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ }
+
/* Invalid packet */
- DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
- dir, get_conntrack_index(th),
- old_state);
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+ dir, get_conntrack_index(th), old_state);
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
return -NF_ACCEPT;
- case TCP_CONNTRACK_SYN_SENT:
- if (old_state < TCP_CONNTRACK_TIME_WAIT)
- break;
- if ((conntrack->proto.tcp.seen[dir].flags &
- IP_CT_TCP_FLAG_CLOSE_INIT)
- || after(ntohl(th->seq),
- conntrack->proto.tcp.seen[dir].td_end)) {
- /* Attempt to reopen a closed connection.
- * Delete this connection and look up again. */
- write_unlock_bh(&tcp_lock);
- if (del_timer(&conntrack->timeout))
- conntrack->timeout.function((unsigned long)
- conntrack);
- return -NF_REPEAT;
- } else {
- write_unlock_bh(&tcp_lock);
- if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL,
- NULL, "nf_ct_tcp: invalid SYN");
+ case TCP_CONNTRACK_CLOSE:
+ if (index == TCP_RST_SET
+ && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+ && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+ /* Invalid RST */
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid RST ");
return -NF_ACCEPT;
}
- case TCP_CONNTRACK_CLOSE:
if (index == TCP_RST_SET
- && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
- && conntrack->proto.tcp.last_index == TCP_SYN_SET)
- || (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
- && conntrack->proto.tcp.last_index == TCP_ACK_SET))
- && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
+ && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
+ && ct->proto.tcp.last_index == TCP_SYN_SET)
+ || (!test_bit(IPS_ASSURED_BIT, &ct->status)
+ && ct->proto.tcp.last_index == TCP_ACK_SET))
+ && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
/* RST sent to invalid SYN or ACK we had let through
* at a) and c) above:
*
@@ -1007,247 +1004,707 @@ static int tcp_packet(struct nf_conn *conntrack,
break;
}
- if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
+ if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
skb, dataoff, th, pf)) {
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->lock);
return -NF_ACCEPT;
}
in_window:
/* From now on we have got in-window packets */
- conntrack->proto.tcp.last_index = index;
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
- DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
- "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
- NIPQUAD(iph->saddr), ntohs(th->source),
- NIPQUAD(iph->daddr), ntohs(th->dest),
- (th->syn ? 1 : 0), (th->ack ? 1 : 0),
- (th->fin ? 1 : 0), (th->rst ? 1 : 0),
- old_state, new_state);
+ pr_debug("tcp_conntracks: ");
+ nf_ct_dump_tuple(tuple);
+ pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+ (th->syn ? 1 : 0), (th->ack ? 1 : 0),
+ (th->fin ? 1 : 0), (th->rst ? 1 : 0),
+ old_state, new_state);
- conntrack->proto.tcp.state = new_state;
+ ct->proto.tcp.state = new_state;
if (old_state != new_state
- && (new_state == TCP_CONNTRACK_FIN_WAIT
- || new_state == TCP_CONNTRACK_CLOSE))
- conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
- timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans
- && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans
- ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
- write_unlock_bh(&tcp_lock);
-
- nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ && new_state == TCP_CONNTRACK_FIN_WAIT)
+ ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
+ timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
+ timeout = timeouts[TCP_CONNTRACK_RETRANS];
+ else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
+ IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
+ timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
+ timeout = timeouts[TCP_CONNTRACK_UNACK];
+ else
+ timeout = timeouts[new_state];
+ spin_unlock_bh(&ct->lock);
+
if (new_state != old_state)
- nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
- if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+ if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
/* If only reply is a RST, we can consider ourselves not to
have an established connection: this is a fairly common
problem case, so we can delete the conntrack
immediately. --RR */
if (th->rst) {
- if (del_timer(&conntrack->timeout))
- conntrack->timeout.function((unsigned long)
- conntrack);
+ nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
- } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
+ /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
+ * pickup with loose=1. Avoid large ESTABLISHED timeout.
+ */
+ if (new_state == TCP_CONNTRACK_ESTABLISHED &&
+ timeout > timeouts[TCP_CONNTRACK_UNACK])
+ timeout = timeouts[TCP_CONNTRACK_UNACK];
+ } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
&& new_state == TCP_CONNTRACK_ESTABLISHED) {
- /* Set ASSURED if we see see valid ack in ESTABLISHED
- after SYN_RECV or a valid answer for a picked up
+ /* Set ASSURED if we see see valid ack in ESTABLISHED
+ after SYN_RECV or a valid answer for a picked up
connection. */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
- nf_conntrack_event_cache(IPCT_STATUS, skb);
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
}
- nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
return NF_ACCEPT;
}
-
+
/* Called when a new connection for this protocol found. */
-static int tcp_new(struct nf_conn *conntrack,
- const struct sk_buff *skb,
- unsigned int dataoff)
+static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
{
enum tcp_conntrack new_state;
- struct tcphdr *th, _tcph;
-#ifdef DEBUGP_VARS
- struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
- struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
-#endif
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
+ const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
/* Don't need lock here: this conntrack not in circulation yet */
- new_state
- = tcp_conntracks[0][get_conntrack_index(th)]
- [TCP_CONNTRACK_NONE];
+ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
- DEBUGP("nf_ct_tcp: invalid new deleting.\n");
- return 0;
+ pr_debug("nf_ct_tcp: invalid new deleting.\n");
+ return false;
}
if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/* SYN packet */
- conntrack->proto.tcp.seen[0].td_end =
+ ct->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
dataoff, th);
- conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
- if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
- conntrack->proto.tcp.seen[0].td_maxwin = 1;
- conntrack->proto.tcp.seen[0].td_maxend =
- conntrack->proto.tcp.seen[0].td_end;
-
- tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]);
- conntrack->proto.tcp.seen[1].flags = 0;
- conntrack->proto.tcp.seen[0].loose =
- conntrack->proto.tcp.seen[1].loose = 0;
- } else if (nf_ct_tcp_loose == 0) {
+ ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (ct->proto.tcp.seen[0].td_maxwin == 0)
+ ct->proto.tcp.seen[0].td_maxwin = 1;
+ ct->proto.tcp.seen[0].td_maxend =
+ ct->proto.tcp.seen[0].td_end;
+
+ tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
+ } else if (tn->tcp_loose == 0) {
/* Don't try to pick up connections. */
- return 0;
+ return false;
} else {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/*
* We are in the middle of a connection,
* its history is lost for us.
* Let's try to use the data from the packet.
*/
- conntrack->proto.tcp.seen[0].td_end =
+ ct->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
dataoff, th);
- conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
- if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
- conntrack->proto.tcp.seen[0].td_maxwin = 1;
- conntrack->proto.tcp.seen[0].td_maxend =
- conntrack->proto.tcp.seen[0].td_end +
- conntrack->proto.tcp.seen[0].td_maxwin;
- conntrack->proto.tcp.seen[0].td_scale = 0;
-
- /* We assume SACK. Should we assume window scaling too? */
- conntrack->proto.tcp.seen[0].flags =
- conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
- conntrack->proto.tcp.seen[0].loose =
- conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose;
+ ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (ct->proto.tcp.seen[0].td_maxwin == 0)
+ ct->proto.tcp.seen[0].td_maxwin = 1;
+ ct->proto.tcp.seen[0].td_maxend =
+ ct->proto.tcp.seen[0].td_end +
+ ct->proto.tcp.seen[0].td_maxwin;
+
+ /* We assume SACK and liberal window checking to handle
+ * window scaling */
+ ct->proto.tcp.seen[0].flags =
+ ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
+ IP_CT_TCP_FLAG_BE_LIBERAL;
}
-
- conntrack->proto.tcp.seen[1].td_end = 0;
- conntrack->proto.tcp.seen[1].td_maxend = 0;
- conntrack->proto.tcp.seen[1].td_maxwin = 1;
- conntrack->proto.tcp.seen[1].td_scale = 0;
/* tcp_packet will set them */
- conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
- conntrack->proto.tcp.last_index = TCP_NONE_SET;
-
- DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
- return 1;
+ ct->proto.tcp.last_index = TCP_NONE_SET;
+
+ pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+ return true;
}
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
-static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
- const struct nf_conn *ct)
+static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+ struct nf_conn *ct)
{
- struct nfattr *nest_parms;
-
- read_lock_bh(&tcp_lock);
- nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
- NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
- &ct->proto.tcp.state);
- read_unlock_bh(&tcp_lock);
-
- NFA_NEST_END(skb, nest_parms);
+ struct nlattr *nest_parms;
+ struct nf_ct_tcp_flags tmp = {};
+
+ spin_lock_bh(&ct->lock);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
+ nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ ct->proto.tcp.seen[0].td_scale) ||
+ nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
+ ct->proto.tcp.seen[1].td_scale))
+ goto nla_put_failure;
+
+ tmp.flags = ct->proto.tcp.seen[0].flags;
+ if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
+ sizeof(struct nf_ct_tcp_flags), &tmp))
+ goto nla_put_failure;
+
+ tmp.flags = ct->proto.tcp.seen[1].flags;
+ if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
+ sizeof(struct nf_ct_tcp_flags), &tmp))
+ goto nla_put_failure;
+ spin_unlock_bh(&ct->lock);
+
+ nla_nest_end(skb, nest_parms);
return 0;
-nfattr_failure:
- read_unlock_bh(&tcp_lock);
+nla_put_failure:
+ spin_unlock_bh(&ct->lock);
return -1;
}
-static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = {
- [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t),
+static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
+ [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) },
+ [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
};
-static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct)
+static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
{
- struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
- struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
+ struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
+ struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
+ int err;
/* updates could not contain anything about the private
* protocol info, in that case skip the parsing */
- if (!attr)
+ if (!pattr)
return 0;
- nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
+ err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy);
+ if (err < 0)
+ return err;
- if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp))
+ if (tb[CTA_PROTOINFO_TCP_STATE] &&
+ nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
return -EINVAL;
- if (!tb[CTA_PROTOINFO_TCP_STATE-1])
- return -EINVAL;
+ spin_lock_bh(&ct->lock);
+ if (tb[CTA_PROTOINFO_TCP_STATE])
+ ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
+
+ if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
+ struct nf_ct_tcp_flags *attr =
+ nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
+ ct->proto.tcp.seen[0].flags &= ~attr->mask;
+ ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
+ }
+
+ if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
+ struct nf_ct_tcp_flags *attr =
+ nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
+ ct->proto.tcp.seen[1].flags &= ~attr->mask;
+ ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
+ }
+
+ if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
+ tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
+ ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+ ct->proto.tcp.seen[0].td_scale =
+ nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
+ ct->proto.tcp.seen[1].td_scale =
+ nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
+ }
+ spin_unlock_bh(&ct->lock);
+
+ return 0;
+}
+
+static int tcp_nlattr_size(void)
+{
+ return nla_total_size(0) /* CTA_PROTOINFO_TCP */
+ + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
+}
+
+static int tcp_nlattr_tuple_size(void)
+{
+ return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- write_lock_bh(&tcp_lock);
- ct->proto.tcp.state =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
- write_unlock_bh(&tcp_lock);
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ int i;
+
+ /* set default TCP timeouts. */
+ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ timeouts[i] = tn->timeouts[i];
+
+ if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
+ timeouts[TCP_CONNTRACK_SYN_SENT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
+ timeouts[TCP_CONNTRACK_SYN_RECV] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
+ timeouts[TCP_CONNTRACK_ESTABLISHED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
+ timeouts[TCP_CONNTRACK_FIN_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
+ timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
+ timeouts[TCP_CONNTRACK_LAST_ACK] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
+ timeouts[TCP_CONNTRACK_TIME_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
+ timeouts[TCP_CONNTRACK_CLOSE] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
+ timeouts[TCP_CONNTRACK_SYN_SENT2] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
+ timeouts[TCP_CONNTRACK_RETRANS] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_UNACK]) {
+ timeouts[TCP_CONNTRACK_UNACK] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
+ }
+ return 0;
+}
+
+static int
+tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
+ htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
+ htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
+ htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
+ htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
+ htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
+ htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
+ htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
+ htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
+ goto nla_put_failure;
return 0;
+
+nla_put_failure:
+ return -ENOSPC;
}
+
+static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
+ [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tcp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_tcp_timeout_syn_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_syn_recv",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_fin_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_close_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_last_ack",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_time_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_close",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_unacknowledged",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_loose",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_tcp_be_liberal",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_tcp_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table tcp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_tcp_timeout_syn_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_syn_sent2",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_syn_recv",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_fin_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_close_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_last_ack",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_time_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_close",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_loose",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_tcp_be_liberal",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_tcp_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_tcp_net *tn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(tcp_sysctl_table,
+ sizeof(tcp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
+ pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
+ pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
+ pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
+ pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
+ pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
+ pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
+ pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
+ pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK];
+ pn->ctl_table[10].data = &tn->tcp_loose;
+ pn->ctl_table[11].data = &tn->tcp_be_liberal;
+ pn->ctl_table[12].data = &tn->tcp_max_retrans;
+#endif
+ return 0;
+}
+
+static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_tcp_net *tn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
+ sizeof(tcp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
+ pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
+ pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
+ pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
+ pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
+ pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
+ pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
+ pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
+ pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
+ pn->ctl_compat_table[10].data = &tn->tcp_loose;
+ pn->ctl_compat_table[11].data = &tn->tcp_be_liberal;
+ pn->ctl_compat_table[12].data = &tn->tcp_max_retrans;
+#endif
#endif
-
-struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 =
+ return 0;
+}
+
+static int tcp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ struct nf_proto_net *pn = &tn->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ tn->timeouts[i] = tcp_timeouts[i];
+
+ tn->tcp_loose = nf_ct_tcp_loose;
+ tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
+ tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
+ }
+
+ if (proto == AF_INET) {
+ ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
+ if (ret < 0)
+ return ret;
+
+ ret = tcp_kmemdup_sysctl_table(pn, tn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = tcp_kmemdup_sysctl_table(pn, tn);
+
+ return ret;
+}
+
+static struct nf_proto_net *tcp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.tcp.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
{
.l3proto = PF_INET,
- .proto = IPPROTO_TCP,
+ .l4proto = IPPROTO_TCP,
.name = "tcp",
.pkt_to_tuple = tcp_pkt_to_tuple,
.invert_tuple = tcp_invert_tuple,
.print_tuple = tcp_print_tuple,
.print_conntrack = tcp_print_conntrack,
.packet = tcp_packet,
+ .get_timeouts = tcp_get_timeouts,
.new = tcp_new,
- .error = tcp_error4,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .to_nfattr = tcp_to_nfattr,
- .from_nfattr = nfattr_to_tcp,
- .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
+ .error = tcp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = tcp_to_nlattr,
+ .nlattr_size = tcp_nlattr_size,
+ .from_nlattr = nlattr_to_tcp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = tcp_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_TCP_MAX,
+ .obj_size = sizeof(unsigned int) *
+ TCP_CONNTRACK_TIMEOUT_MAX,
+ .nla_policy = tcp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = tcp_init_net,
+ .get_net_proto = tcp_get_net_proto,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
-struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
{
.l3proto = PF_INET6,
- .proto = IPPROTO_TCP,
+ .l4proto = IPPROTO_TCP,
.name = "tcp",
.pkt_to_tuple = tcp_pkt_to_tuple,
.invert_tuple = tcp_invert_tuple,
.print_tuple = tcp_print_tuple,
.print_conntrack = tcp_print_conntrack,
.packet = tcp_packet,
+ .get_timeouts = tcp_get_timeouts,
.new = tcp_new,
- .error = tcp_error6,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .to_nfattr = tcp_to_nfattr,
- .from_nfattr = nfattr_to_tcp,
- .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
+ .error = tcp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = tcp_to_nlattr,
+ .nlattr_size = tcp_nlattr_size,
+ .from_nlattr = nlattr_to_tcp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = tcp_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_TCP_MAX,
+ .obj_size = sizeof(unsigned int) *
+ TCP_CONNTRACK_TIMEOUT_MAX,
+ .nla_policy = tcp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = tcp_init_net,
+ .get_net_proto = tcp_get_net_proto,
};
-
-EXPORT_SYMBOL(nf_conntrack_protocol_tcp4);
-EXPORT_SYMBOL(nf_conntrack_protocol_tcp6);
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 831d206344e..9d7721cbce4 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -1,58 +1,65 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - enable working with Layer 3 protocol independent connection tracking.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c
*/
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/module.h>
-#include <linux/netfilter.h>
#include <linux/udp.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
#include <net/checksum.h>
+
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-unsigned int nf_ct_udp_timeout = 30*HZ;
-unsigned int nf_ct_udp_timeout_stream = 180*HZ;
+static unsigned int udp_timeouts[UDP_CT_MAX] = {
+ [UDP_CT_UNREPLIED] = 30*HZ,
+ [UDP_CT_REPLIED] = 180*HZ,
+};
-static int udp_pkt_to_tuple(const struct sk_buff *skb,
+static inline struct nf_udp_net *udp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.udp;
+}
+
+static bool udp_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
struct nf_conntrack_tuple *tuple)
{
- struct udphdr _hdr, *hp;
+ const struct udphdr *hp;
+ struct udphdr _hdr;
/* Actually only need first 8 bytes. */
hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hp == NULL)
- return 0;
+ return false;
tuple->src.u.udp.port = hp->source;
tuple->dst.u.udp.port = hp->dest;
- return 1;
+ return true;
}
-static int udp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
tuple->src.u.udp.port = orig->dst.u.udp.port;
tuple->dst.u.udp.port = orig->src.u.udp.port;
- return 1;
+ return true;
}
/* Print out the per-protocol part of the tuple. */
@@ -64,64 +71,64 @@ static int udp_print_tuple(struct seq_file *s,
ntohs(tuple->dst.u.udp.port));
}
-/* Print out the private part of the conntrack. */
-static int udp_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
+static unsigned int *udp_get_timeouts(struct net *net)
{
- return 0;
+ return udp_pernet(net)->timeouts;
}
/* Returns verdict for packet, and may modify conntracktype */
-static int udp_packet(struct nf_conn *conntrack,
+static int udp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- int pf,
- unsigned int hooknum)
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
{
/* If we've seen traffic both ways, this is some kind of UDP
stream. Extend timeout. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
- nf_ct_refresh_acct(conntrack, ctinfo, skb,
- nf_ct_udp_timeout_stream);
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_REPLIED]);
/* Also, more likely to be important, and not a probe */
- if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
- nf_conntrack_event_cache(IPCT_STATUS, skb);
- } else
- nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout);
-
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ } else {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_UNREPLIED]);
+ }
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
-static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
- unsigned int dataoff)
+static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
{
- return 1;
+ return true;
}
-static int udp_error(struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
- int pf,
- unsigned int hooknum,
- int (*csum)(const struct sk_buff *, unsigned int))
+static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum)
{
unsigned int udplen = skb->len - dataoff;
- struct udphdr _hdr, *hdr;
+ const struct udphdr *hdr;
+ struct udphdr _hdr;
/* Header is too small? */
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
- if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (LOG_INVALID(net, IPPROTO_UDP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: short packet ");
return -NF_ACCEPT;
}
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
- if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (LOG_INVALID(net, IPPROTO_UDP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -132,15 +139,12 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
/* Checksum invalid? Ignore.
* We skip checking packets on the outgoing path
- * because the semantic of CHECKSUM_HW is different there
- * and moreover root might send raw packets.
+ * because the checksum is assumed to be correct.
* FIXME: Source route IP option packets --RR */
- if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
- (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))
- && skb->ip_summed != CHECKSUM_UNNECESSARY
- && csum(skb, dataoff)) {
- if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
+ if (LOG_INVALID(net, IPPROTO_UDP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
@@ -148,81 +152,217 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
return NF_ACCEPT;
}
-static int csum4(const struct sk_buff *skb, unsigned int dataoff)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
{
- return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
- skb->len - dataoff, IPPROTO_UDP,
- skb->ip_summed == CHECKSUM_HW ? skb->csum
- : skb_checksum(skb, dataoff,
- skb->len - dataoff, 0));
+ unsigned int *timeouts = data;
+ struct nf_udp_net *un = udp_pernet(net);
+
+ /* set default timeouts for UDP. */
+ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
+ timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) {
+ timeouts[UDP_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_UDP_REPLIED]) {
+ timeouts[UDP_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ;
+ }
+ return 0;
}
-static int csum6(const struct sk_buff *skb, unsigned int dataoff)
+static int
+udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
{
- return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
- skb->len - dataoff, IPPROTO_UDP,
- skb->ip_summed == CHECKSUM_HW
- ? csum_sub(skb->csum,
- skb_checksum(skb, 0, dataoff, 0))
- : skb_checksum(skb, dataoff, skb->len - dataoff,
- 0));
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED,
+ htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED,
+ htonl(timeouts[UDP_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
}
-static int udp_error4(struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
- int pf,
- unsigned int hooknum)
+static const struct nla_policy
+udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
+ [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table udp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_udp_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table udp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_udp_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_udp_net *un)
{
- return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4);
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+ pn->ctl_table = kmemdup(udp_sysctl_table,
+ sizeof(udp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+ pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
+ pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED];
+#endif
+ return 0;
}
-static int udp_error6(struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info *ctinfo,
- int pf,
- unsigned int hooknum)
+static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_udp_net *un)
{
- return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6);
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
+ sizeof(udp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
+ pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED];
+#endif
+#endif
+ return 0;
}
-struct nf_conntrack_protocol nf_conntrack_protocol_udp4 =
+static int udp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_udp_net *un = udp_pernet(net);
+ struct nf_proto_net *pn = &un->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < UDP_CT_MAX; i++)
+ un->timeouts[i] = udp_timeouts[i];
+ }
+
+ if (proto == AF_INET) {
+ ret = udp_kmemdup_compat_sysctl_table(pn, un);
+ if (ret < 0)
+ return ret;
+
+ ret = udp_kmemdup_sysctl_table(pn, un);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = udp_kmemdup_sysctl_table(pn, un);
+
+ return ret;
+}
+
+static struct nf_proto_net *udp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.udp.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
{
.l3proto = PF_INET,
- .proto = IPPROTO_UDP,
+ .l4proto = IPPROTO_UDP,
.name = "udp",
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
- .print_conntrack = udp_print_conntrack,
.packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
.new = udp_new,
- .error = udp_error4,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
+ .error = udp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
-struct nf_conntrack_protocol nf_conntrack_protocol_udp6 =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
{
.l3proto = PF_INET6,
- .proto = IPPROTO_UDP,
+ .l4proto = IPPROTO_UDP,
.name = "udp",
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
- .print_conntrack = udp_print_conntrack,
.packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
.new = udp_new,
- .error = udp_error6,
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
- .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
+ .error = udp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
};
-
-EXPORT_SYMBOL(nf_conntrack_protocol_udp4);
-EXPORT_SYMBOL(nf_conntrack_protocol_udp6);
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
new file mode 100644
index 00000000000..2750e6c69f8
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -0,0 +1,405 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+
+/* Indices into the UDP-Lite timeout arrays used in this file. */
+enum udplite_conntrack {
+ UDPLITE_CT_UNREPLIED, /* slot used by udplite_packet() while IPS_SEEN_REPLY_BIT is clear */
+ UDPLITE_CT_REPLIED, /* slot used by udplite_packet() once IPS_SEEN_REPLY_BIT is set */
+ UDPLITE_CT_MAX /* number of slots; sizes the timeout arrays */
+};
+
+/*
+ * Built-in default timeouts, expressed in jiffies (multiples of HZ).
+ * udplite_init_net() copies them into the per-namespace array.
+ */
+static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
+ [UDPLITE_CT_UNREPLIED] = 30*HZ,
+ [UDPLITE_CT_REPLIED] = 180*HZ,
+};
+
+/* Id handed to net_generic(); filled in through udplite_net_ops.id. */
+static int udplite_net_id __read_mostly;
+/* Per-network-namespace state of this module. */
+struct udplite_net {
+ struct nf_proto_net pn; /* holds the users count and the duplicated sysctl table */
+ unsigned int timeouts[UDPLITE_CT_MAX]; /* indexed by enum udplite_conntrack */
+};
+
+/* Return this module's per-namespace state for @net. */
+static inline struct udplite_net *udplite_pernet(struct net *net)
+{
+	struct udplite_net *un = net_generic(net, udplite_net_id);
+
+	return un;
+}
+
+/*
+ * Fill the port fields of @tuple from the header found at @dataoff in @skb.
+ * Returns false when sizeof(struct udphdr) bytes cannot be obtained there.
+ */
+static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
+				 unsigned int dataoff,
+				 struct nf_conntrack_tuple *tuple)
+{
+	struct udphdr hdr_copy;
+	const struct udphdr *uh;
+
+	uh = skb_header_pointer(skb, dataoff, sizeof(hdr_copy), &hdr_copy);
+	if (!uh)
+		return false;
+
+	tuple->src.u.udp.port = uh->source;
+	tuple->dst.u.udp.port = uh->dest;
+	return true;
+}
+
+/*
+ * Write the reverse-direction ports of @orig into @tuple: the source port
+ * becomes the destination port and vice versa. Always returns true.
+ */
+static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.udp.port = orig->dst.u.udp.port;
+ tuple->dst.u.udp.port = orig->src.u.udp.port;
+ return true;
+}
+
+/* Emit the source and destination ports of @tuple, in host order, to @s. */
+static int udplite_print_tuple(struct seq_file *s,
+			       const struct nf_conntrack_tuple *tuple)
+{
+	unsigned short sport = ntohs(tuple->src.u.udp.port);
+	unsigned short dport = ntohs(tuple->dst.u.udp.port);
+
+	return seq_printf(s, "sport=%hu dport=%hu ", sport, dport);
+}
+
+/* Return the timeout array kept in @net's per-namespace state. */
+static unsigned int *udplite_get_timeouts(struct net *net)
+{
+	struct udplite_net *un = udplite_pernet(net);
+
+	return un->timeouts;
+}
+
+/*
+ * Per-packet handler: refresh the conntrack timeout and always accept.
+ * The timeout slot depends on whether a reply has been seen on @ct.
+ */
+static int udplite_packet(struct nf_conn *ct,
+			  const struct sk_buff *skb,
+			  unsigned int dataoff,
+			  enum ip_conntrack_info ctinfo,
+			  u_int8_t pf,
+			  unsigned int hooknum,
+			  unsigned int *timeouts)
+{
+	/* Only one direction seen so far: keep the short timeout. */
+	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb,
+				   timeouts[UDPLITE_CT_UNREPLIED]);
+		return NF_ACCEPT;
+	}
+
+	/* Traffic in both directions: treat as a stream, extend timeout. */
+	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDPLITE_CT_REPLIED]);
+
+	/* Report the ASSURED transition only the first time the bit is set. */
+	if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+		nf_conntrack_event_cache(IPCT_ASSURED, ct);
+
+	return NF_ACCEPT;
+}
+
+/*
+ * Called when a new connection for this protocol found.
+ * No per-connection setup is done here; every new connection is accepted.
+ */
+static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ return true;
+}
+
+/*
+ * Validate a UDP-Lite packet. Returns NF_ACCEPT when all checks pass and
+ * -NF_ACCEPT when the header cannot be read, the checksum coverage field is
+ * out of range, the checksum field is zero, or (only on NF_INET_PRE_ROUTING
+ * with net->ct.sysctl_checksum set) the partial checksum does not verify.
+ * Each failure is logged when LOG_INVALID() allows it.
+ */
+static int udplite_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum)
+{
+ unsigned int udplen = skb->len - dataoff;
+ const struct udphdr *hdr;
+ struct udphdr _hdr;
+ unsigned int cscov;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hdr == NULL) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* The len field is read as checksum coverage; 0 means the whole datagram. */
+ cscov = ntohs(hdr->len);
+ if (cscov == 0)
+ cscov = udplen;
+ else if (cscov < sizeof(*hdr) || cscov > udplen) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: invalid checksum coverage ");
+ return -NF_ACCEPT;
+ }
+
+ /* UDPLITE mandates checksums */
+ if (!hdr->check) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: checksum missing ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore. */
+ /* NOTE(review): IPPROTO_UDP (not IPPROTO_UDPLITE) is passed below - confirm intended. */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
+ pf)) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: bad UDPLite checksum ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+/*
+ * Build a timeout object in @data from netlink attributes @tb. Values start
+ * from @net's current timeouts; each attribute present overrides its slot
+ * (attribute value multiplied by HZ). Always returns 0.
+ */
+static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
+					 struct net *net, void *data)
+{
+	struct udplite_net *un = udplite_pernet(net);
+	unsigned int *timeouts = data;
+	struct nlattr *attr;
+
+	/* set default timeouts for UDPlite. */
+	timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
+	timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
+
+	attr = tb[CTA_TIMEOUT_UDPLITE_UNREPLIED];
+	if (attr)
+		timeouts[UDPLITE_CT_UNREPLIED] = ntohl(nla_get_be32(attr)) * HZ;
+
+	attr = tb[CTA_TIMEOUT_UDPLITE_REPLIED];
+	if (attr)
+		timeouts[UDPLITE_CT_REPLIED] = ntohl(nla_get_be32(attr)) * HZ;
+
+	return 0;
+}
+
+/*
+ * Dump the timeout object @data into @skb as two be32 attributes holding
+ * the values divided by HZ. Returns 0, or -ENOSPC if an attribute cannot
+ * be added.
+ */
+static int
+udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+	const unsigned int *timeouts = data;
+
+	if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
+			 htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)))
+		return -ENOSPC;
+
+	if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
+			 htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
+		return -ENOSPC;
+
+	return 0;
+}
+
+/* Netlink policy for the timeout attributes: both are 32-bit values. */
+static const struct nla_policy
+udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
+ [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Template sysctl table. The .data pointers are left unset here;
+ * udplite_kmemdup_sysctl_table() duplicates the table and points entry 0
+ * and entry 1 at the per-namespace timeouts.
+ */
+static struct ctl_table udplite_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_udplite_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_udplite_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Give @pn its own copy of the sysctl table, wired to @un's timeouts.
+ * Does nothing when @pn already has a table or CONFIG_SYSCTL is off.
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated.
+ */
+static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct udplite_net *un)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(udplite_sysctl_table,
+ sizeof(udplite_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ /* Entry order must match udplite_sysctl_table. */
+ pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
+ pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
+#endif
+ return 0;
+}
+
+/*
+ * Per-namespace protocol init: seed the timeouts from the built-in defaults
+ * while pn.users is still zero, then make sure the sysctl table exists.
+ * Returns the result of udplite_kmemdup_sysctl_table().
+ */
+static int udplite_init_net(struct net *net, u_int16_t proto)
+{
+	struct udplite_net *un = udplite_pernet(net);
+	int idx;
+
+	if (un->pn.users == 0) {
+		for (idx = 0; idx < UDPLITE_CT_MAX; idx++)
+			un->timeouts[idx] = udplite_timeouts[idx];
+	}
+
+	return udplite_kmemdup_sysctl_table(&un->pn, un);
+}
+
+/*
+ * UDP-Lite over IPv4 protocol descriptor. Differs from the IPv6 descriptor
+ * in this file only in .l3proto; both share the callbacks and udplite_net_id.
+ */
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_UDPLITE,
+ .name = "udplite",
+ .pkt_to_tuple = udplite_pkt_to_tuple,
+ .invert_tuple = udplite_invert_tuple,
+ .print_tuple = udplite_print_tuple,
+ .packet = udplite_packet,
+ .get_timeouts = udplite_get_timeouts,
+ .new = udplite_new,
+ .error = udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
+ .obj_size = sizeof(unsigned int) *
+ CTA_TIMEOUT_UDPLITE_MAX,
+ .nla_policy = udplite_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &udplite_net_id,
+ .init_net = udplite_init_net,
+};
+
+/*
+ * UDP-Lite over IPv6 protocol descriptor. Differs from the IPv4 descriptor
+ * in this file only in .l3proto; both share the callbacks and udplite_net_id.
+ */
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+{
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_UDPLITE,
+ .name = "udplite",
+ .pkt_to_tuple = udplite_pkt_to_tuple,
+ .invert_tuple = udplite_invert_tuple,
+ .print_tuple = udplite_print_tuple,
+ .packet = udplite_packet,
+ .get_timeouts = udplite_get_timeouts,
+ .new = udplite_new,
+ .error = udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
+ .obj_size = sizeof(unsigned int) *
+ CTA_TIMEOUT_UDPLITE_MAX,
+ .nla_policy = udplite_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &udplite_net_id,
+ .init_net = udplite_init_net,
+};
+
+/*
+ * Register both UDP-Lite protocol descriptors for @net. If the IPv6 one
+ * fails, the IPv4 registration is rolled back. Returns 0 or the negative
+ * error from the failing registration.
+ */
+static int udplite_net_init(struct net *net)
+{
+	int err;
+
+	err = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4);
+	if (err < 0) {
+		pr_err("nf_conntrack_udplite4: pernet registration failed.\n");
+		return err;
+	}
+
+	err = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6);
+	if (err < 0) {
+		pr_err("nf_conntrack_udplite6: pernet registration failed.\n");
+		nf_ct_l4proto_pernet_unregister(net,
+						&nf_conntrack_l4proto_udplite4);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Undo udplite_net_init() for @net, in reverse registration order. */
+static void udplite_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
+}
+
+/*
+ * Per-namespace hooks. .id/.size describe the struct udplite_net that
+ * udplite_pernet() retrieves via net_generic().
+ */
+static struct pernet_operations udplite_net_ops = {
+ .init = udplite_net_init,
+ .exit = udplite_net_exit,
+ .id = &udplite_net_id,
+ .size = sizeof(struct udplite_net),
+};
+
+/*
+ * Module init: register the pernet ops, then the IPv4 and IPv6 protocol
+ * descriptors. Anything already registered is rolled back on failure.
+ * Returns 0 or the negative error of the step that failed.
+ */
+static int __init nf_conntrack_proto_udplite_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&udplite_net_ops);
+	if (err < 0)
+		return err;
+
+	err = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
+	if (err < 0) {
+		unregister_pernet_subsys(&udplite_net_ops);
+		return err;
+	}
+
+	err = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6);
+	if (err < 0) {
+		nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+		unregister_pernet_subsys(&udplite_net_ops);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Module exit: tear down in the reverse order of nf_conntrack_proto_udplite_init(). */
+static void __exit nf_conntrack_proto_udplite_exit(void)
+{
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+ unregister_pernet_subsys(&udplite_net_ops);
+}
+
+module_init(nf_conntrack_proto_udplite_init);
+module_exit(nf_conntrack_proto_udplite_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
new file mode 100644
index 00000000000..4a2134fd3fc
--- /dev/null
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -0,0 +1,237 @@
+/* SANE connection tracking helper
+ * (SANE = Scanner Access Now Easy)
+ * For documentation about the SANE network protocol see
+ * http://www.sane-project.org/html/doc015.html
+ */
+
+/* Copyright (C) 2007 Red Hat, Inc.
+ * Author: Michal Schmidt <mschmidt@redhat.com>
+ * Based on the FTP conntrack helper (net/netfilter/nf_conntrack_ftp.c):
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2003 Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_sane.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
+MODULE_DESCRIPTION("SANE connection tracking helper");
+MODULE_ALIAS_NFCT_HELPER("sane");
+
+/* Scratch buffer for packet payloads; allocated in module init, freed in fini. */
+static char *sane_buffer;
+
+/* Held by help() around every use of sane_buffer. */
+static DEFINE_SPINLOCK(nf_sane_lock);
+
+#define MAX_PORTS 8
+/* Ports to attach the helper to; ports_c is the number of valid entries. */
+static u_int16_t ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+
+/* Layout help() expects for a client request; fields are big-endian. */
+struct sane_request {
+ __be32 RPC_code;
+#define SANE_NET_START 7 /* RPC code */
+
+ __be32 handle;
+};
+
+/* Leading part of the reply to SANE_NET_START as parsed by help(). */
+struct sane_reply_net_start {
+ __be32 status;
+#define SANE_STATUS_SUCCESS 0
+
+ __be16 zero; /* help() ignores the reply unless this is 0 */
+ __be16 port; /* used by help() as the expected connection's port */
+ /* other fields aren't interesting for conntrack */
+};
+
+/*
+ * Conntrack helper callback for SANE control connections.
+ *
+ * Requests in the original direction are inspected for SANE_NET_START; the
+ * next reply is then parsed and, if successful, an expectation for a TCP
+ * connection to the port it announces is registered.
+ *
+ * Returns NF_ACCEPT, or NF_DROP when the expectation cannot be allocated
+ * or added.
+ */
+static int help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ void *sb_ptr;
+ int ret = NF_ACCEPT;
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ struct sane_request *req;
+ struct sane_reply_net_start *reply;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* Not a full tcp header? */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ /* No data? */
+ dataoff = protoff + th->doff * 4;
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ datalen = skb->len - dataoff;
+
+ /* sane_buffer is shared module-wide, so it is only used under the lock. */
+ spin_lock_bh(&nf_sane_lock);
+ sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
+ BUG_ON(sb_ptr == NULL);
+
+ if (dir == IP_CT_DIR_ORIGINAL) {
+ /* Only packets that are exactly one request long are examined. */
+ if (datalen != sizeof(struct sane_request))
+ goto out;
+
+ req = sb_ptr;
+ if (req->RPC_code != htonl(SANE_NET_START)) {
+ /* Not an interesting command */
+ ct_sane_info->state = SANE_STATE_NORMAL;
+ goto out;
+ }
+
+ /* We're interested in the next reply */
+ ct_sane_info->state = SANE_STATE_START_REQUESTED;
+ goto out;
+ }
+
+ /* Is it a reply to an uninteresting command? */
+ if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
+ goto out;
+
+ /* It's a reply to SANE_NET_START. */
+ ct_sane_info->state = SANE_STATE_NORMAL;
+
+ if (datalen < sizeof(struct sane_reply_net_start)) {
+ pr_debug("nf_ct_sane: NET_START reply too short\n");
+ goto out;
+ }
+
+ reply = sb_ptr;
+ if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
+ /* saned refused the command */
+ pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n",
+ ntohl(reply->status));
+ goto out;
+ }
+
+ /* Invalid saned reply? Ignore it. */
+ if (reply->zero != 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+
+ /* Same addresses as the original direction; only the port from the reply is given. */
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &tuple->src.u3, &tuple->dst.u3,
+ IPPROTO_TCP, NULL, &reply->port);
+
+ pr_debug("nf_ct_sane: expect: ");
+ nf_ct_dump_tuple(&exp->tuple);
+
+ /* Can't expect this? Best to drop packet now. */
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ }
+
+ /* Drop the reference taken by nf_ct_expect_alloc(). */
+ nf_ct_expect_put(exp);
+
+out:
+ spin_unlock_bh(&nf_sane_lock);
+ return ret;
+}
+
+/* One helper per configured port and address family ([i][0] IPv4, [i][1] IPv6). */
+static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly;
+
+/* Expectation policy shared by every helper instance. */
+static const struct nf_conntrack_expect_policy sane_exp_policy = {
+ .max_expected = 1,
+ .timeout = 5 * 60,
+};
+
+/* don't make this __exit, since it's called from __init ! */
+/*
+ * Unregister the helpers for all ports_c ports (both families) and free
+ * sane_buffer.
+ * NOTE(review): every sane[i][j] with i < ports_c is unregistered whether or
+ * not it was registered - confirm this is safe for the init failure path.
+ */
+static void nf_conntrack_sane_fini(void)
+{
+ int i, j;
+
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < 2; j++) {
+ pr_debug("nf_ct_sane: unregistering helper for pf: %d "
+ "port: %d\n",
+ sane[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_helper_unregister(&sane[i][j]);
+ }
+ }
+
+ kfree(sane_buffer);
+}
+
+/*
+ * Module init: allocate the shared payload buffer and register one helper
+ * per configured port and address family (SANE_PORT when none is given).
+ *
+ * If a registration fails, only the helpers that were successfully
+ * registered before it are unregistered again. Unregistering the failed
+ * helper or helpers that were never set up would operate on entries that
+ * were never linked into the helper table.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
+ * error from nf_conntrack_helper_register().
+ */
+static int __init nf_conntrack_sane_init(void)
+{
+	int i, j = -1, ret = 0;
+
+	sane_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!sane_buffer)
+		return -ENOMEM;
+
+	if (ports_c == 0)
+		ports[ports_c++] = SANE_PORT;
+
+	/* FIXME should be configurable whether IPv4 and IPv6 connections
+		 are tracked or not - YK */
+	for (i = 0; i < ports_c; i++) {
+		sane[i][0].tuple.src.l3num = PF_INET;
+		sane[i][1].tuple.src.l3num = PF_INET6;
+		for (j = 0; j < 2; j++) {
+			sane[i][j].data_len = sizeof(struct nf_ct_sane_master);
+			sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
+			sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
+			sane[i][j].expect_policy = &sane_exp_policy;
+			sane[i][j].me = THIS_MODULE;
+			sane[i][j].help = help;
+			if (ports[i] == SANE_PORT)
+				sprintf(sane[i][j].name, "sane");
+			else
+				sprintf(sane[i][j].name, "sane-%d", ports[i]);
+
+			pr_debug("nf_ct_sane: registering helper for pf: %d "
+				 "port: %d\n",
+				 sane[i][j].tuple.src.l3num, ports[i]);
+			ret = nf_conntrack_helper_register(&sane[i][j]);
+			if (ret) {
+				printk(KERN_ERR "nf_ct_sane: failed to "
+				       "register helper for pf: %d port: %d\n",
+				       sane[i][j].tuple.src.l3num, ports[i]);
+				goto err_unwind;
+			}
+		}
+	}
+
+	return 0;
+
+err_unwind:
+	/*
+	 * sane[i][j] is the helper that failed. Walk backwards over the
+	 * entries before it, which are exactly the ones that registered.
+	 */
+	while (i > 0 || j > 0) {
+		if (j == 0) {
+			i--;
+			j = 2;
+		}
+		j--;
+		nf_conntrack_helper_unregister(&sane[i][j]);
+	}
+	kfree(sane_buffer);
+	return ret;
+}
+
+module_init(nf_conntrack_sane_init);
+module_exit(nf_conntrack_sane_fini);
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
new file mode 100644
index 00000000000..f6e2ae91a80
--- /dev/null
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -0,0 +1,243 @@
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+
+/*
+ * Set the initial sequence offset @off for the direction given by @ctinfo.
+ * Both offset_before and offset_after are set to @off. A zero @off is a
+ * no-op. Always returns 0.
+ *
+ * As in nf_ct_seqadj_set(), a missing seqadj extension is reported once and
+ * otherwise ignored instead of being dereferenced.
+ */
+int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		      s32 off)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct nf_conn_seqadj *seqadj;
+	struct nf_ct_seqadj *this_way;
+
+	if (off == 0)
+		return 0;
+
+	seqadj = nfct_seqadj(ct);
+	if (unlikely(!seqadj)) {
+		WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
+		return 0;
+	}
+
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+	this_way = &seqadj->seq[dir];
+	this_way->offset_before = off;
+	this_way->offset_after = off;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);
+
+/*
+ * Record that payload at sequence number @seq (network order) changed size
+ * by @off bytes in the direction given by @ctinfo. A zero @off is a no-op;
+ * a missing seqadj extension triggers a one-time warning. Always returns 0.
+ */
+int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ __be32 seq, s32 off)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_seqadj *this_way;
+
+ if (off == 0)
+ return 0;
+
+ if (unlikely(!seqadj)) {
+ WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
+ return 0;
+ }
+
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+ spin_lock_bh(&ct->lock);
+ this_way = &seqadj->seq[dir];
+ /*
+ * Update only if no adjustment is pending (offsets equal) or @seq lies
+ * after the recorded correction position.
+ */
+ if (this_way->offset_before == this_way->offset_after ||
+ before(this_way->correction_pos, ntohl(seq))) {
+ this_way->correction_pos = ntohl(seq);
+ this_way->offset_before = this_way->offset_after;
+ this_way->offset_after += off;
+ }
+ spin_unlock_bh(&ct->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);
+
+/*
+ * Convenience wrapper: for TCP conntracks, take the sequence number from
+ * the TCP header of @skb and pass it to nf_ct_seqadj_set(). Does nothing
+ * for other protocols.
+ * NOTE(review): the TCP header is located with ip_hdrlen(), which looks
+ * IPv4-specific - confirm callers never pass IPv6 packets.
+ */
+void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ s32 off)
+{
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP)
+ return;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
+ nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);
+
+/*
+ * Adjust one found SACK option including checksum correction.
+ *
+ * Walks the SACK blocks in skb->data from @sackoff up to @sackend and
+ * subtracts the offset from @seq that applies to each edge: offset_after
+ * when the edge, less offset_before, lies after correction_pos, otherwise
+ * offset_before. The TCP checksum in @tcph is updated for every rewritten
+ * field.
+ */
+static void nf_ct_sack_block_adjust(struct sk_buff *skb,
+				    struct tcphdr *tcph,
+				    unsigned int sackoff,
+				    unsigned int sackend,
+				    struct nf_ct_seqadj *seq)
+{
+	while (sackoff < sackend) {
+		struct tcp_sack_block_wire *sack;
+		__be32 new_start_seq, new_end_seq;
+
+		sack = (void *)skb->data + sackoff;
+		if (after(ntohl(sack->start_seq) - seq->offset_before,
+			  seq->correction_pos))
+			new_start_seq = htonl(ntohl(sack->start_seq) -
+					      seq->offset_after);
+		else
+			new_start_seq = htonl(ntohl(sack->start_seq) -
+					      seq->offset_before);
+
+		if (after(ntohl(sack->end_seq) - seq->offset_before,
+			  seq->correction_pos))
+			new_end_seq = htonl(ntohl(sack->end_seq) -
+					    seq->offset_after);
+		else
+			new_end_seq = htonl(ntohl(sack->end_seq) -
+					    seq->offset_before);
+
+		/* Print old and new values alike in host order, unsigned. */
+		pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n",
+			 ntohl(sack->start_seq), ntohl(new_start_seq),
+			 ntohl(sack->end_seq), ntohl(new_end_seq));
+
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->start_seq, new_start_seq, 0);
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->end_seq, new_end_seq, 0);
+		sack->start_seq = new_start_seq;
+		sack->end_seq = new_end_seq;
+		sackoff += sizeof(*sack);
+	}
+}
+
+/*
+ * TCP SACK sequence number adjustment.
+ *
+ * Scans the TCP options between the fixed header and the end of the header
+ * (tcph->doff) and rewrites every well-formed SACK option using the offsets
+ * of the opposite direction. Returns 1 on success, 0 if the header cannot
+ * be made writable or an option is truncated or malformed.
+ */
+static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
+				      unsigned int protoff,
+				      struct tcphdr *tcph,
+				      struct nf_conn *ct,
+				      enum ip_conntrack_info ctinfo)
+{
+	unsigned int dir, optoff, optend;
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+
+	optoff = protoff + sizeof(struct tcphdr);
+	optend = protoff + tcph->doff * 4;
+
+	if (!skb_make_writable(skb, optend))
+		return 0;
+
+	/*
+	 * skb_make_writable() may have reallocated the packet head, leaving
+	 * the caller's tcph pointing at freed memory. Reload it before it is
+	 * used for checksum updates.
+	 */
+	tcph = (void *)skb->data + protoff;
+
+	dir = CTINFO2DIR(ctinfo);
+
+	while (optoff < optend) {
+		/* Usually: option, length. */
+		unsigned char *op = skb->data + optoff;
+
+		switch (op[0]) {
+		case TCPOPT_EOL:
+			return 1;
+		case TCPOPT_NOP:
+			optoff++;
+			continue;
+		default:
+			/* no partial options */
+			if (optoff + 1 == optend ||
+			    optoff + op[1] > optend ||
+			    op[1] < 2)
+				return 0;
+			if (op[0] == TCPOPT_SACK &&
+			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
+			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+				nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
+							optoff+op[1],
+							&seqadj->seq[!dir]);
+			optoff += op[1];
+		}
+	}
+	return 1;
+}
+
+/*
+ * TCP sequence number adjustment. Returns 1 on success, 0 on failure.
+ *
+ * Rewrites seq using this direction's offsets and ack_seq using the other
+ * direction's offsets, fixes the TCP checksum, then adjusts any SACK
+ * options. The offset selection and rewriting are done under ct->lock.
+ * NOTE(review): seqadj is dereferenced without a NULL check - confirm that
+ * callers guarantee the extension is present.
+ */
+int nf_ct_seq_adjust(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff)
+{
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct tcphdr *tcph;
+ __be32 newseq, newack;
+ s32 seqoff, ackoff;
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *this_way, *other_way;
+ int res;
+
+ this_way = &seqadj->seq[dir];
+ other_way = &seqadj->seq[!dir];
+
+ if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+ return 0;
+
+ tcph = (void *)skb->data + protoff;
+ spin_lock_bh(&ct->lock);
+ /* Sequence numbers after the correction point get the newer offset. */
+ if (after(ntohl(tcph->seq), this_way->correction_pos))
+ seqoff = this_way->offset_after;
+ else
+ seqoff = this_way->offset_before;
+
+ /* offset_before is subtracted from the ack before comparing it with correction_pos. */
+ if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+ other_way->correction_pos))
+ ackoff = other_way->offset_after;
+ else
+ ackoff = other_way->offset_before;
+
+ newseq = htonl(ntohl(tcph->seq) + seqoff);
+ newack = htonl(ntohl(tcph->ack_seq) - ackoff);
+
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+
+ pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+ ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+ ntohl(newack));
+
+ tcph->seq = newseq;
+ tcph->ack_seq = newack;
+
+ res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+ spin_unlock_bh(&ct->lock);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);
+
+/*
+ * Return the sequence offset that applies to @seq in direction @dir:
+ * offset_after when @seq lies after the correction position, otherwise
+ * offset_before. Returns 0 when @ct has no seqadj extension.
+ */
+s32 nf_ct_seq_offset(const struct nf_conn *ct,
+		     enum ip_conntrack_dir dir,
+		     u32 seq)
+{
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *adj;
+
+	if (seqadj == NULL)
+		return 0;
+
+	adj = &seqadj->seq[dir];
+	if (after(seq, adj->correction_pos))
+		return adj->offset_after;
+
+	return adj->offset_before;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
+
+/* Conntrack extension descriptor for struct nf_conn_seqadj. */
+static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_seqadj),
+ .align = __alignof__(struct nf_conn_seqadj),
+ .id = NF_CT_EXT_SEQADJ,
+};
+
+/* Register the seqadj extension; returns nf_ct_extend_register()'s result. */
+int nf_conntrack_seqadj_init(void)
+{
+ return nf_ct_extend_register(&nf_ct_seqadj_extend);
+}
+
+/* Unregister the seqadj extension registered by nf_conntrack_seqadj_init(). */
+void nf_conntrack_seqadj_fini(void)
+{
+ nf_ct_extend_unregister(&nf_ct_seqadj_extend);
+}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
new file mode 100644
index 00000000000..4c3ba1c8d68
--- /dev/null
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -0,0 +1,1680 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+MODULE_ALIAS("ip_conntrack_sip");
+MODULE_ALIAS_NFCT_HELPER("sip");
+
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of SIP servers");
+
+static unsigned int sip_timeout __read_mostly = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+
+static int sip_direct_signalling __read_mostly = 1;
+module_param(sip_direct_signalling, int, 0600);
+MODULE_PARM_DESC(sip_direct_signalling, "expect incoming calls from registrar "
+ "only (default 1)");
+
+static int sip_direct_media __read_mostly = 1;
+module_param(sip_direct_media, int, 0600);
+MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling "
+ "endpoints only (default 1)");
+
+const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
+
+/* Length of the run of alphabetic characters starting at @dptr, bounded by @limit. */
+static int string_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	const char *p = dptr;
+
+	while (p < limit && isalpha(*p))
+		p++;
+
+	return p - dptr;
+}
+
+/* Length of the run of decimal digits starting at @dptr, bounded by @limit. */
+static int digits_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	const char *p = dptr;
+
+	while (p < limit && isdigit(*p))
+		p++;
+
+	return p - dptr;
+}
+
+/* Return 1 if @c is one of the characters word_len() accepts, else 0. */
+static int iswordc(const char c)
+{
+	return isalnum(c) || c == '!' || c == '"' || c == '%' ||
+	       (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	       c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+	       c == '{' || c == '}' || c == '~';
+}
+
+/* Length of the run of iswordc() characters starting at @dptr, bounded by @limit. */
+static int word_len(const char *dptr, const char *limit)
+{
+	const char *p = dptr;
+
+	while (p < limit && iswordc(*p))
+		p++;
+
+	return p - dptr;
+}
+
+/*
+ * Length of a Call-ID of the form word or word@word. Returns the length of
+ * the first word when no '@' follows it, and 0 when an '@' is present but
+ * not followed by a word.
+ */
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	int local_len = word_len(dptr, limit);
+	const char *at = dptr + local_len;
+	int domain_len;
+
+	if (!local_len || at == limit || *at != '@')
+		return local_len;
+
+	domain_len = word_len(at + 1, limit);
+	if (!domain_len)
+		return 0;
+
+	/* local part + '@' + domain part */
+	return local_len + 1 + domain_len;
+}
+
+/*
+ * Length of "<alphabetic media type> <digits>". Returns 0 when the media
+ * type is not followed by a single space inside the buffer.
+ */
+static int media_len(const struct nf_conn *ct, const char *dptr,
+		     const char *limit, int *shift)
+{
+	int type_len = string_len(ct, dptr, limit, shift);
+	const char *sep = dptr + type_len;
+
+	if (sep >= limit || *sep != ' ')
+		return 0;
+
+	return type_len + 1 + digits_len(ct, sep + 1, limit, shift);
+}
+
+/*
+ * Parse an IP address of @ct's address family from [@cp, @limit) into
+ * @addr. For IPv6 the address may be enclosed in '[' ']'; when @delim is
+ * true the brackets are required. On success returns 1 and, if @endp is
+ * non-NULL, stores the position just past the address (and closing
+ * bracket). Returns 0 on parse failure or when @ct is NULL; *@endp is not
+ * written in that case.
+ */
+static int sip_parse_addr(const struct nf_conn *ct, const char *cp,
+ const char **endp, union nf_inet_addr *addr,
+ const char *limit, bool delim)
+{
+ const char *end;
+ int ret;
+
+ if (!ct)
+ return 0;
+
+ memset(addr, 0, sizeof(*addr));
+ switch (nf_ct_l3num(ct)) {
+ case AF_INET:
+ ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
+ if (ret == 0)
+ return 0;
+ break;
+ case AF_INET6:
+ if (cp < limit && *cp == '[')
+ cp++;
+ else if (delim)
+ return 0;
+
+ ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
+ if (ret == 0)
+ return 0;
+
+ if (end < limit && *end == ']')
+ end++;
+ else if (delim)
+ return 0;
+ break;
+ default:
+ BUG();
+ }
+
+ if (endp)
+ *endp = end;
+ return 1;
+}
+
+/*
+ * skip ip address. returns its length.
+ *
+ * The length covers the address (with brackets for IPv6) and an optional
+ * ":port" suffix. Returns 0 when no address can be parsed at @dptr.
+ */
+static int epaddr_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	union nf_inet_addr addr;
+	const char *aux = dptr;
+
+	if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) {
+		pr_debug("ip: %s parse failed.!\n", dptr);
+		return 0;
+	}
+
+	/*
+	 * Port number. The address may end exactly at @limit, so make sure
+	 * there is a byte left before looking at it.
+	 */
+	if (dptr < limit && *dptr == ':') {
+		dptr++;
+		dptr += digits_len(ct, dptr, limit, shift);
+	}
+	return dptr - aux;
+}
+
+/*
+ * get address length, skipping user info.
+ *
+ * If an '@' is found before the end of the line, *@shift is advanced past
+ * it and the address after it is measured; otherwise *@shift is restored
+ * and the address is measured from @dptr. Returns epaddr_len()'s result.
+ */
+static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ const char *start = dptr;
+ int s = *shift;
+
+ /* Search for @, but stop at the end of the line.
+ * We are inside a sip: URI, so we don't need to worry about
+ * continuation lines. */
+ while (dptr < limit &&
+ *dptr != '@' && *dptr != '\r' && *dptr != '\n') {
+ (*shift)++;
+ dptr++;
+ }
+
+ if (dptr < limit && *dptr == '@') {
+ dptr++;
+ (*shift)++;
+ } else {
+ dptr = start;
+ *shift = s;
+ }
+
+ return epaddr_len(ct, dptr, limit, shift);
+}
+
+/* Parse a SIP request line of the form:
+ *
+ * Request-Line = Method SP Request-URI SP SIP-Version CRLF
+ *
+ * and return the offset and length of the address contained in the Request-URI.
+ *
+ * Returns 1 on success with *matchoff, *matchlen, *addr and *port filled in
+ * (*port defaults to SIP_PORT when the URI has none), 0 when no method or
+ * address is found, and -1 when the line ends before a "sip:" URI, the
+ * address does not parse, or the port is outside 1024..65535.
+ */
+int ct_sip_parse_request(const struct nf_conn *ct,
+ const char *dptr, unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen,
+ union nf_inet_addr *addr, __be16 *port)
+{
+ const char *start = dptr, *limit = dptr + datalen, *end;
+ unsigned int mlen;
+ unsigned int p;
+ int shift = 0;
+
+ /* Skip method and following whitespace */
+ mlen = string_len(ct, dptr, limit, NULL);
+ if (!mlen)
+ return 0;
+ dptr += mlen;
+ if (++dptr >= limit)
+ return 0;
+
+ /* Find SIP URI */
+ /* NOTE(review): if "sip:" is never found the loop just ends and parsing continues at dptr - confirm intended. */
+ for (; dptr < limit - strlen("sip:"); dptr++) {
+ if (*dptr == '\r' || *dptr == '\n')
+ return -1;
+ if (strnicmp(dptr, "sip:", strlen("sip:")) == 0) {
+ dptr += strlen("sip:");
+ break;
+ }
+ }
+ /* shift receives the number of user-info bytes (through '@') to skip. */
+ if (!skp_epaddr_len(ct, dptr, limit, &shift))
+ return 0;
+ dptr += shift;
+
+ if (!sip_parse_addr(ct, dptr, &end, addr, limit, true))
+ return -1;
+ if (end < limit && *end == ':') {
+ end++;
+ /* NOTE(review): simple_strtoul() is not bounded by limit - confirm the data is terminated. */
+ p = simple_strtoul(end, (char **)&end, 10);
+ if (p < 1024 || p > 65535)
+ return -1;
+ *port = htons(p);
+ } else
+ *port = htons(SIP_PORT);
+
+ if (end == dptr)
+ return 0;
+ *matchoff = dptr - start;
+ *matchlen = end - dptr;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_request);
+
+/* SIP header parsing: SIP headers are located at the beginning of a line, but
+ * may span several lines, in which case the continuation lines begin with a
+ * whitespace character. RFC 2543 allows lines to be terminated with CR, LF or
+ * CRLF, RFC 3261 allows only CRLF, we support both.
+ *
+ * Headers are followed by (optionally) whitespace, a colon, again (optionally)
+ * whitespace and the values. Whitespace in this context means any amount of
+ * tabs, spaces and continuation lines, which are treated as a single whitespace
+ * character.
+ *
+ * Some headers may appear multiple times. A comma separated list of values is
+ * equivalent to multiple headers.
+ *
+ * The table below is indexed by enum sip_header_types. SIP_HDR() is defined
+ * outside this chunk; judging by how ct_sip_get_header() uses the entries,
+ * its arguments are presumably the header name, the compact name, an
+ * optional string to search for inside the value, and the callback that
+ * measures the matched value.
+ */
+static const struct sip_header ct_sip_hdrs[] = {
+ [SIP_HDR_CSEQ] = SIP_HDR("CSeq", NULL, NULL, digits_len),
+ [SIP_HDR_FROM] = SIP_HDR("From", "f", "sip:", skp_epaddr_len),
+ [SIP_HDR_TO] = SIP_HDR("To", "t", "sip:", skp_epaddr_len),
+ [SIP_HDR_CONTACT] = SIP_HDR("Contact", "m", "sip:", skp_epaddr_len),
+ [SIP_HDR_VIA_UDP] = SIP_HDR("Via", "v", "UDP ", epaddr_len),
+ [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len),
+ [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len),
+ [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len),
+ [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len),
+};
+
+/*
+ * dptr points at a CR or LF. If the line that follows is a continuation line
+ * (it starts with a space or tab), return a pointer just past its leading
+ * whitespace; the result may equal limit. Return NULL when the data ends
+ * first or the next line is not a continuation.
+ */
+static const char *sip_follow_continuation(const char *dptr, const char *limit)
+{
+	const char *next = dptr + 1;
+
+	/* Walk past newline */
+	if (next >= limit)
+		return NULL;
+
+	/* Skip '\n' in CR LF */
+	if (*dptr == '\r' && *next == '\n') {
+		next++;
+		if (next >= limit)
+			return NULL;
+	}
+
+	/* Continuation line? */
+	if (*next != ' ' && *next != '\t')
+		return NULL;
+
+	/* skip leading whitespace */
+	while (next < limit && (*next == ' ' || *next == '\t'))
+		next++;
+	return next;
+}
+
+/*
+ * Skip SIP whitespace starting at dptr: spaces, tabs and at most one line
+ * break that is followed by a continuation line.
+ *
+ * Returns a pointer to the first byte that is not whitespace (possibly
+ * limit), or NULL when a line break is not followed by a continuation line.
+ */
+static const char *sip_skip_whitespace(const char *dptr, const char *limit)
+{
+	for (; dptr < limit; dptr++) {
+		if (*dptr == ' ' || *dptr == '\t')
+			continue;
+		if (*dptr != '\r' && *dptr != '\n')
+			break;
+		/* sip_follow_continuation() has already skipped the leading
+		 * whitespace of the continuation line. Return its result
+		 * directly: going through the loop increment again would
+		 * swallow the first byte after that whitespace. */
+		return sip_follow_continuation(dptr, limit);
+	}
+	return dptr;
+}
+
+/*
+ * Case-insensitive search for needle (len bytes) within a SIP header value,
+ * following continuation lines. The search stops at the first line break
+ * that does not introduce a continuation line. Only start positions strictly
+ * below limit - len are examined.
+ *
+ * Returns a pointer to the match or NULL.
+ */
+static const char *ct_sip_header_search(const char *dptr, const char *limit,
+					const char *needle, unsigned int len)
+{
+	const char *stop = limit - len;
+
+	while (dptr < stop) {
+		if (*dptr != '\r' && *dptr != '\n') {
+			if (strnicmp(dptr, needle, len) == 0)
+				return dptr;
+			dptr++;
+			continue;
+		}
+
+		dptr = sip_follow_continuation(dptr, stop);
+		if (dptr == NULL)
+			return NULL;
+		/* The search resumes one byte past the position returned by
+		 * sip_follow_continuation(). */
+		dptr++;
+	}
+	return NULL;
+}
+
+/* Find the first header of the given type at or after dataoff.
+ *
+ * Only text following a CR, LF or CRLF is considered, so a header that starts
+ * exactly at dataoff without a preceding line break is not matched.
+ *
+ * Returns 1 on success, with *matchoff set to the offset of the matched value
+ * from dptr and *matchlen to the length reported by the header's match_len
+ * callback. Returns 0 when no such header is found (including a matching name
+ * that is not followed by a colon). Returns -1 when the header is present but
+ * its search string or value cannot be matched; *matchoff has been written in
+ * that case as well.
+ */
+int ct_sip_get_header(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sip_header_types type,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ const struct sip_header *hdr = &ct_sip_hdrs[type];
+ const char *start = dptr, *limit = dptr + datalen;
+ int shift = 0;
+
+ for (dptr += dataoff; dptr < limit; dptr++) {
+ /* Find beginning of line */
+ if (*dptr != '\r' && *dptr != '\n')
+ continue;
+ if (++dptr >= limit)
+ break;
+ if (*(dptr - 1) == '\r' && *dptr == '\n') {
+ if (++dptr >= limit)
+ break;
+ }
+
+ /* Skip continuation lines */
+ if (*dptr == ' ' || *dptr == '\t')
+ continue;
+
+ /* Find header. Compact headers must be followed by a
+ * non-alphabetic character to avoid mismatches. */
+ if (limit - dptr >= hdr->len &&
+ strnicmp(dptr, hdr->name, hdr->len) == 0)
+ dptr += hdr->len;
+ else if (hdr->cname && limit - dptr >= hdr->clen + 1 &&
+ strnicmp(dptr, hdr->cname, hdr->clen) == 0 &&
+ !isalpha(*(dptr + hdr->clen)))
+ dptr += hdr->clen;
+ else
+ continue;
+
+ /* Find and skip colon */
+ dptr = sip_skip_whitespace(dptr, limit);
+ if (dptr == NULL)
+ break;
+ if (*dptr != ':' || ++dptr >= limit)
+ break;
+
+ /* Skip whitespace after colon */
+ dptr = sip_skip_whitespace(dptr, limit);
+ if (dptr == NULL)
+ break;
+
+ /* Record the start of the value now so that the caller has an
+ * offset to work with even if matching fails below. */
+ *matchoff = dptr - start;
+ if (hdr->search) {
+ dptr = ct_sip_header_search(dptr, limit, hdr->search,
+ hdr->slen);
+ if (!dptr)
+ return -1;
+ dptr += hdr->slen;
+ }
+
+ *matchlen = hdr->match_len(ct, dptr, limit, &shift);
+ if (!*matchlen)
+ return -1;
+ /* shift is whatever the match_len callback asked to skip */
+ *matchoff = dptr - start + shift;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ct_sip_get_header);
+
+/*
+ * Get the next header field in a list of comma separated values.
+ *
+ * Starting at dataoff, look for a ',' and then for the header type's search
+ * string. Returns 1 with *matchoff / *matchlen describing the value, 0 when
+ * either search fails, and -1 when the value has zero length according to
+ * the type's match_len callback (*matchoff is written in that case too).
+ */
+static int ct_sip_next_header(const struct nf_conn *ct, const char *dptr,
+			      unsigned int dataoff, unsigned int datalen,
+			      enum sip_header_types type,
+			      unsigned int *matchoff, unsigned int *matchlen)
+{
+	const struct sip_header *hdr = &ct_sip_hdrs[type];
+	const char *limit = dptr + datalen;
+	const char *pos;
+	int shift = 0;
+
+	pos = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(","));
+	if (pos == NULL)
+		return 0;
+
+	pos = ct_sip_header_search(pos, limit, hdr->search, hdr->slen);
+	if (pos == NULL)
+		return 0;
+	pos += hdr->slen;
+
+	*matchoff = pos - dptr;
+	*matchlen = hdr->match_len(ct, pos, limit, &shift);
+	if (*matchlen == 0)
+		return -1;
+	*matchoff += shift;
+	return 1;
+}
+
+/* Walk through headers until a parsable one is found or no header of the
+ * given type is left.
+ *
+ * When *in_header is set, the remaining comma separated values of the
+ * current header are tried first; after that, subsequent header lines are
+ * searched. *in_header, if given, is set to 1 whenever a header line was
+ * matched so that the next call continues inside it.
+ *
+ * Returns 1 when a value was found (*matchoff / *matchlen are valid) and 0
+ * when no header of this type is left.
+ */
+static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
+			       unsigned int dataoff, unsigned int datalen,
+			       enum sip_header_types type, int *in_header,
+			       unsigned int *matchoff, unsigned int *matchlen)
+{
+	int ret;
+
+	if (in_header && *in_header) {
+		while (1) {
+			ret = ct_sip_next_header(ct, dptr, dataoff, datalen,
+						 type, matchoff, matchlen);
+			if (ret > 0)
+				return ret;
+			if (ret == 0)
+				break;
+			/* Unparsable value: retry behind it. *matchoff is
+			 * an offset from dptr, not from dataoff, so it
+			 * replaces dataoff instead of being added to it. */
+			dataoff = *matchoff;
+		}
+		*in_header = 0;
+	}
+
+	while (1) {
+		ret = ct_sip_get_header(ct, dptr, dataoff, datalen,
+					type, matchoff, matchlen);
+		if (ret > 0)
+			break;
+		if (ret == 0)
+			return ret;
+		/* See above: *matchoff is already relative to dptr. */
+		dataoff = *matchoff;
+	}
+
+	if (in_header)
+		*in_header = 1;
+	return 1;
+}
+
+/* Locate a SIP header, parse the URI and return the offset and length of
+ * the address as well as the address and port themselves. A stream of
+ * headers can be parsed by handing in a non-NULL dataoff and in_header
+ * pointer.
+ *
+ * Returns 1 on success, 0 when no header of the given type is left and -1
+ * when the address cannot be parsed or the port is outside 1024..65535.
+ * When the URI carries no port, *port is set to SIP_PORT. On success
+ * *dataoff (if given) is updated to the offset just past the parsed
+ * address/port.
+ */
+int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
+			    unsigned int *dataoff, unsigned int datalen,
+			    enum sip_header_types type, int *in_header,
+			    unsigned int *matchoff, unsigned int *matchlen,
+			    union nf_inet_addr *addr, __be16 *port)
+{
+	const char *c, *limit = dptr + datalen;
+	unsigned int p;
+	int ret;
+
+	ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen,
+				  type, in_header, matchoff, matchlen);
+	WARN_ON(ret < 0);
+	if (ret == 0)
+		return ret;
+
+	if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true))
+		return -1;
+	/* The address may end exactly at limit; don't look beyond the data
+	 * (same check as in ct_sip_parse_request()). */
+	if (c < limit && *c == ':') {
+		c++;
+		p = simple_strtoul(c, (char **)&c, 10);
+		if (p < 1024 || p > 65535)
+			return -1;
+		*port = htons(p);
+	} else
+		*port = htons(SIP_PORT);
+
+	if (dataoff)
+		*dataoff = c - dptr;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_header_uri);
+
+/*
+ * Locate parameter `name` in the header value starting at dataoff.
+ *
+ * The search is confined to the current value, i.e. it stops at the next
+ * ',' if there is one. The parameter value runs up to the next ';' or, if
+ * none is found, to the end of the searched region.
+ *
+ * Returns 1 with *matchoff / *matchlen describing the parameter value, or 0
+ * when the parameter is not present.
+ */
+static int ct_sip_parse_param(const struct nf_conn *ct, const char *dptr,
+			      unsigned int dataoff, unsigned int datalen,
+			      const char *name,
+			      unsigned int *matchoff, unsigned int *matchlen)
+{
+	const char *data_end = dptr + datalen;
+	const char *from = dptr + dataoff;
+	const char *bound, *val, *val_end;
+
+	bound = ct_sip_header_search(from, data_end, ",", strlen(","));
+	if (bound == NULL)
+		bound = data_end;
+
+	val = ct_sip_header_search(from, bound, name, strlen(name));
+	if (val == NULL)
+		return 0;
+	val += strlen(name);
+
+	val_end = ct_sip_header_search(val, bound, ";", strlen(";"));
+	if (val_end == NULL)
+		val_end = bound;
+
+	*matchoff = val - dptr;
+	*matchlen = val_end - val;
+	return 1;
+}
+
+/*
+ * Parse an address from header parameter `name` and return the address
+ * together with its offset and length.
+ *
+ * The search is confined to the current value (up to the next ',' if any).
+ * Returns 1 on success and 0 when the parameter is missing or does not hold
+ * a parsable address.
+ */
+int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
+			       unsigned int dataoff, unsigned int datalen,
+			       const char *name,
+			       unsigned int *matchoff, unsigned int *matchlen,
+			       union nf_inet_addr *addr, bool delim)
+{
+	const char *data_end = dptr + datalen;
+	const char *from = dptr + dataoff;
+	const char *bound, *val, *val_end;
+
+	bound = ct_sip_header_search(from, data_end, ",", strlen(","));
+	if (bound == NULL)
+		bound = data_end;
+
+	val = ct_sip_header_search(from, bound, name, strlen(name));
+	if (val == NULL)
+		return 0;
+	val += strlen(name);
+
+	if (!sip_parse_addr(ct, val, &val_end, addr, bound, delim))
+		return 0;
+
+	*matchoff = val - dptr;
+	*matchlen = val_end - val;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_address_param);
+
+/* Parse numerical header parameter and return value, offset and length.
+ *
+ * The search is confined to the current value (up to the next ',' if any).
+ * matchoff and matchlen may both be NULL when the caller only wants *val.
+ *
+ * Returns 1 when the parameter was found and parsed, 0 when the parameter
+ * is not present (*val is left untouched), and -1 when the parameter is
+ * present but is not followed by a number. Callers in this file test for
+ * "< 0" to reject such messages, so the failure must be reported as a
+ * negative value rather than as "not present".
+ */
+int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr,
+				 unsigned int dataoff, unsigned int datalen,
+				 const char *name,
+				 unsigned int *matchoff, unsigned int *matchlen,
+				 unsigned int *val)
+{
+	const char *limit = dptr + datalen;
+	const char *start;
+	char *end;
+
+	limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(","));
+	if (!limit)
+		limit = dptr + datalen;
+
+	start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name));
+	if (!start)
+		return 0;
+
+	start += strlen(name);
+	*val = simple_strtoul(start, &end, 0);
+	if (start == end)
+		return -1;
+	if (matchoff && matchlen) {
+		*matchoff = start - dptr;
+		*matchlen = end - start;
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_numerical_param);
+
+/*
+ * Determine the transport protocol of a URI from its "transport="
+ * parameter. Without such a parameter the protocol of the conntrack entry
+ * is assumed.
+ *
+ * Returns 1 with *proto set when the transport equals the protocol of ct,
+ * and 0 when the parameter names something other than TCP/UDP or a
+ * protocol different from the one ct uses.
+ */
+static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
+				  unsigned int dataoff, unsigned int datalen,
+				  u8 *proto)
+{
+	unsigned int matchoff, matchlen;
+	const char *transport;
+
+	if (!ct_sip_parse_param(ct, dptr, dataoff, datalen, "transport=",
+				&matchoff, &matchlen)) {
+		*proto = nf_ct_protonum(ct);
+		return 1;
+	}
+
+	transport = dptr + matchoff;
+	if (strnicmp(transport, "TCP", strlen("TCP")) == 0)
+		*proto = IPPROTO_TCP;
+	else if (strnicmp(transport, "UDP", strlen("UDP")) == 0)
+		*proto = IPPROTO_UDP;
+	else
+		return 0;
+
+	return *proto == nf_ct_protonum(ct);
+}
+
+/*
+ * Parse the textual IPv4/IPv6 address at cp (the family is taken from ct)
+ * into *addr, which is zeroed first. No bytes at or beyond limit are handed
+ * to the parser.
+ *
+ * Returns 1 on success, with *endp (if given) pointing behind the address,
+ * and 0 on failure, in which case *endp is left untouched.
+ */
+static int sdp_parse_addr(const struct nf_conn *ct, const char *cp,
+			  const char **endp, union nf_inet_addr *addr,
+			  const char *limit)
+{
+	const char *addr_end;
+	int parsed;
+
+	memset(addr, 0, sizeof(*addr));
+
+	switch (nf_ct_l3num(ct)) {
+	case AF_INET:
+		parsed = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1,
+				  &addr_end);
+		break;
+	case AF_INET6:
+		parsed = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1,
+				  &addr_end);
+		break;
+	default:
+		BUG();
+	}
+
+	if (parsed == 0)
+		return 0;
+
+	if (endp != NULL)
+		*endp = addr_end;
+	return 1;
+}
+
+/* Skip an IP address and return its length, or 0 if none can be parsed.
+ *
+ * shift is unused; the parameter is presumably there to match the signature
+ * of the other match_len callbacks.
+ */
+static int sdp_addr_len(const struct nf_conn *ct, const char *dptr,
+			const char *limit, int *shift)
+{
+	union nf_inet_addr addr;
+	const char *end;
+
+	if (!sdp_parse_addr(ct, dptr, &end, &addr, limit)) {
+		/* The text being parsed is bounded by limit rather than by
+		 * a NUL terminator, so cap what gets printed. */
+		pr_debug("ip: %.*s parse failed.!\n",
+			 (int)(limit - dptr), dptr);
+		return 0;
+	}
+
+	return end - dptr;
+}
+
+/* SDP header parsing: a SDP session description contains an ordered set of
+ * headers, starting with a section containing general session parameters,
+ * optionally followed by multiple media descriptions.
+ *
+ * SDP headers always start at the beginning of a line. According to RFC 2327:
+ * "The sequence CRLF (0x0d0a) is used to end a record, although parsers should
+ * be tolerant and also accept records terminated with a single newline
+ * character". We handle both cases.
+ *
+ * There is one table per address family; they differ only in the "IN IP4 " /
+ * "IN IP6 " string searched for in the o= and c= lines.
+ * ct_sip_get_sdp_header() picks the table from the conntrack's L3 protocol
+ * and indexes it by enum sdp_header_types.
+ */
+static const struct sip_header ct_sdp_hdrs_v4[] = {
+ [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
+ [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len),
+ [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len),
+ [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
+};
+
+static const struct sip_header ct_sdp_hdrs_v6[] = {
+ [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
+ [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len),
+ [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len),
+ [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
+};
+
+/*
+ * Linear, case-sensitive string search within an SDP header value. The
+ * search never leaves the current line and only examines start positions
+ * strictly below limit - len.
+ *
+ * Returns a pointer to the match or NULL.
+ */
+static const char *ct_sdp_header_search(const char *dptr, const char *limit,
+					const char *needle, unsigned int len)
+{
+	const char *stop = limit - len;
+
+	while (dptr < stop && *dptr != '\r' && *dptr != '\n') {
+		if (strncmp(dptr, needle, len) == 0)
+			return dptr;
+		dptr++;
+	}
+	return NULL;
+}
+
+/* Locate a SDP header (optionally a substring within the header value),
+ * optionally stopping at the first occurrence of the term header, parse
+ * it and return the offset and length of the data we're interested in.
+ *
+ * Only text following a CR, LF or CRLF at or after dataoff is considered.
+ * Pass SDP_HDR_UNSPEC as term to search without a terminating header.
+ *
+ * Returns 1 on success, 0 when the header is not found (or the term header
+ * is reached first) and -1 when the header is present but its search string
+ * or value cannot be matched; *matchoff has been written in that case too.
+ */
+int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ const struct sip_header *hdrs, *hdr, *thdr;
+ const char *start = dptr, *limit = dptr + datalen;
+ int shift = 0;
+
+ hdrs = nf_ct_l3num(ct) == NFPROTO_IPV4 ? ct_sdp_hdrs_v4 : ct_sdp_hdrs_v6;
+ hdr = &hdrs[type];
+ thdr = &hdrs[term];
+
+ for (dptr += dataoff; dptr < limit; dptr++) {
+ /* Find beginning of line */
+ if (*dptr != '\r' && *dptr != '\n')
+ continue;
+ if (++dptr >= limit)
+ break;
+ if (*(dptr - 1) == '\r' && *dptr == '\n') {
+ if (++dptr >= limit)
+ break;
+ }
+
+ /* The term header is checked first, so it wins if type and
+ * term name the same header. */
+ if (term != SDP_HDR_UNSPEC &&
+ limit - dptr >= thdr->len &&
+ strnicmp(dptr, thdr->name, thdr->len) == 0)
+ break;
+ else if (limit - dptr >= hdr->len &&
+ strnicmp(dptr, hdr->name, hdr->len) == 0)
+ dptr += hdr->len;
+ else
+ continue;
+
+ *matchoff = dptr - start;
+ if (hdr->search) {
+ dptr = ct_sdp_header_search(dptr, limit, hdr->search,
+ hdr->slen);
+ if (!dptr)
+ return -1;
+ dptr += hdr->slen;
+ }
+
+ *matchlen = hdr->match_len(ct, dptr, limit, &shift);
+ if (!*matchlen)
+ return -1;
+ *matchoff = dptr - start + shift;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ct_sip_get_sdp_header);
+
+/*
+ * Locate an SDP header with ct_sip_get_sdp_header() and parse the address
+ * it matched into *addr.
+ *
+ * Returns the (non-positive) result of the header lookup when it fails, -1
+ * when the matched text is not a valid address, and 1 on success.
+ */
+static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr,
+				 unsigned int dataoff, unsigned int datalen,
+				 enum sdp_header_types type,
+				 enum sdp_header_types term,
+				 unsigned int *matchoff, unsigned int *matchlen,
+				 union nf_inet_addr *addr)
+{
+	const char *addr_start;
+	int ret;
+
+	ret = ct_sip_get_sdp_header(ct, dptr, dataoff, datalen, type, term,
+				    matchoff, matchlen);
+	if (ret <= 0)
+		return ret;
+
+	addr_start = dptr + *matchoff;
+	if (sdp_parse_addr(ct, addr_start, NULL, addr,
+			   addr_start + *matchlen))
+		return 1;
+	return -1;
+}
+
+/*
+ * Re-arm the first signalling expectation of ct whose destination matches
+ * addr/proto/port: clear its INACTIVE flag and restart its timer so that it
+ * fires `expires` seconds from now. Expectations whose timer can no longer
+ * be deleted are skipped.
+ *
+ * Returns 1 when an expectation was refreshed, 0 otherwise.
+ */
+static int refresh_signalling_expectation(struct nf_conn *ct,
+					  union nf_inet_addr *addr,
+					  u8 proto, __be16 port,
+					  unsigned int expires)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *next;
+	int refreshed = 0;
+
+	spin_lock_bh(&nf_conntrack_expect_lock);
+	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
+		bool match = exp->class == SIP_EXPECT_SIGNALLING &&
+			     nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) &&
+			     exp->tuple.dst.protonum == proto &&
+			     exp->tuple.dst.u.udp.port == port;
+
+		if (!match || !del_timer(&exp->timeout))
+			continue;
+
+		exp->flags &= ~NF_CT_EXPECT_INACTIVE;
+		exp->timeout.expires = jiffies + expires * HZ;
+		add_timer(&exp->timeout);
+		refreshed = 1;
+		break;
+	}
+	spin_unlock_bh(&nf_conntrack_expect_lock);
+
+	return refreshed;
+}
+
+/*
+ * Remove expectations of ct. With media == true every media expectation
+ * (any class other than SIP_EXPECT_SIGNALLING) is removed; with
+ * media == false only the first signalling expectation is. Expectations
+ * whose timer can no longer be deleted are left alone.
+ */
+static void flush_expectations(struct nf_conn *ct, bool media)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *next;
+
+	spin_lock_bh(&nf_conntrack_expect_lock);
+	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
+		bool is_media = exp->class != SIP_EXPECT_SIGNALLING;
+
+		if (is_media != media)
+			continue;
+		if (!del_timer(&exp->timeout))
+			continue;
+
+		nf_ct_unlink_expect(exp);
+		nf_ct_expect_put(exp);
+
+		if (!media)
+			break;
+	}
+	spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+
+/* Create the expectations for the RTP and RTCP streams announced by one SDP
+ * media description.
+ *
+ * daddr/port are the media address and port taken from the SDP; daddr may be
+ * overwritten with the saved address of an existing expectation (see the RTP
+ * optimization note below). mediaoff/medialen locate the media description
+ * and are handed through to the NAT hooks.
+ *
+ * Returns NF_ACCEPT when nothing had to be done or both expectations were
+ * registered, the verdict of the sdp_media NAT hook when that hook is used,
+ * and NF_DROP otherwise (allocation failure, sdp_port hook failure, or
+ * failure to register the expectations).
+ */
+static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ union nf_inet_addr *daddr, __be16 port,
+ enum sip_expectation_classes class,
+ unsigned int mediaoff, unsigned int medialen)
+{
+ struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct net *net = nf_ct_net(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ union nf_inet_addr *saddr;
+ struct nf_conntrack_tuple tuple;
+ int direct_rtp = 0, skip_expect = 0, ret = NF_DROP;
+ u_int16_t base_port;
+ __be16 rtp_port, rtcp_port;
+ const struct nf_nat_sip_hooks *hooks;
+
+ saddr = NULL;
+ if (sip_direct_media) {
+ /* Only media addressed to the sender of this message is tracked,
+ * and only from the peer of this connection. */
+ if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3))
+ return NF_ACCEPT;
+ saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ }
+
+ /* We need to check whether the registration exists before attempting
+ * to register it since we can see the same media description multiple
+ * times on different connections in case multiple endpoints receive
+ * the same call.
+ *
+ * RTP optimization: if we find a matching media channel expectation
+ * and both the expectation and this connection are SNATed, we assume
+ * both sides can reach each other directly and use the final
+ * destination address from the expectation. We still need to keep
+ * the NATed expectations for media that might arrive from the
+ * outside, and additionally need to expect the direct RTP stream
+ * in case it passes through us even without NAT.
+ */
+ memset(&tuple, 0, sizeof(tuple));
+ if (saddr)
+ tuple.src.u3 = *saddr;
+ tuple.src.l3num = nf_ct_l3num(ct);
+ tuple.dst.protonum = IPPROTO_UDP;
+ tuple.dst.u3 = *daddr;
+ tuple.dst.u.udp.port = port;
+
+ rcu_read_lock();
+ /* The loop runs a second time only after the tuple has been switched
+ * to the saved (pre-NAT) address, to look that one up as well. */
+ do {
+ exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
+
+ if (!exp || exp->master == ct ||
+ nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
+ exp->class != class)
+ break;
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (!direct_rtp &&
+ (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) ||
+ exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&
+ ct->status & IPS_NAT_MASK) {
+ *daddr = exp->saved_addr;
+ tuple.dst.u3 = exp->saved_addr;
+ tuple.dst.u.udp.port = exp->saved_proto.udp.port;
+ direct_rtp = 1;
+ } else
+#endif
+ skip_expect = 1;
+ } while (!skip_expect);
+
+ /* RTP uses the even port, RTCP the following odd one */
+ base_port = ntohs(tuple.dst.u.udp.port) & ~1;
+ rtp_port = htons(base_port);
+ rtcp_port = htons(base_port + 1);
+
+ if (direct_rtp) {
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks &&
+ !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen,
+ mediaoff, medialen, ntohs(rtp_port)))
+ goto err1;
+ }
+
+ /* A matching expectation already exists on another connection */
+ if (skip_expect) {
+ rcu_read_unlock();
+ return NF_ACCEPT;
+ }
+
+ rtp_exp = nf_ct_expect_alloc(ct);
+ if (rtp_exp == NULL)
+ goto err1;
+ nf_ct_expect_init(rtp_exp, class, nf_ct_l3num(ct), saddr, daddr,
+ IPPROTO_UDP, NULL, &rtp_port);
+
+ rtcp_exp = nf_ct_expect_alloc(ct);
+ if (rtcp_exp == NULL)
+ goto err2;
+ nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr,
+ IPPROTO_UDP, NULL, &rtcp_port);
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp)
+ ret = hooks->sdp_media(skb, protoff, dataoff, dptr,
+ datalen, rtp_exp, rtcp_exp,
+ mediaoff, medialen, daddr);
+ else {
+ /* Register both or neither: undo RTP if RTCP fails */
+ if (nf_ct_expect_related(rtp_exp) == 0) {
+ if (nf_ct_expect_related(rtcp_exp) != 0)
+ nf_ct_unexpect_related(rtp_exp);
+ else
+ ret = NF_ACCEPT;
+ }
+ }
+ /* Drop the references taken by nf_ct_expect_alloc() */
+ nf_ct_expect_put(rtcp_exp);
+err2:
+ nf_ct_expect_put(rtp_exp);
+err1:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Media types recognised at the start of an SDP media description, together
+ * with the expectation class used for each. The names include the space
+ * that separates the media type from the port number. */
+static const struct sdp_media_type sdp_media_types[] = {
+ SDP_MEDIA_TYPE("audio ", SIP_EXPECT_AUDIO),
+ SDP_MEDIA_TYPE("video ", SIP_EXPECT_VIDEO),
+ SDP_MEDIA_TYPE("image ", SIP_EXPECT_IMAGE),
+};
+
+/*
+ * Look up the media type that the text at dptr + matchoff (matchlen bytes
+ * long) starts with. The comparison is case-sensitive.
+ *
+ * Returns the matching sdp_media_types[] entry or NULL.
+ */
+static const struct sdp_media_type *sdp_media_type(const char *dptr,
+						   unsigned int matchoff,
+						   unsigned int matchlen)
+{
+	const char *media = dptr + matchoff;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(sdp_media_types); i++) {
+		const struct sdp_media_type *t = &sdp_media_types[i];
+
+		if (matchlen >= t->len &&
+		    strncmp(media, t->name, t->len) == 0)
+			return t;
+	}
+	return NULL;
+}
+
+/* Process the SDP session description carried in a SIP message.
+ *
+ * For every media description of a known type with a non-zero port, the
+ * connection address is determined (a c= line inside the media description
+ * overrides the one from the session section) and RTP/RTCP expectations are
+ * set up. When NAT hooks are registered and the connection is NATed, the
+ * hooks are called to rewrite the addresses in the message.
+ *
+ * Returns NF_ACCEPT when the message carries no SDP or everything was
+ * processed, NF_DROP on an out-of-range port or a media description without
+ * any usable connection address, and otherwise the verdict of the failing
+ * helper or NAT hook. cseq is not used here.
+ */
+static int process_sdp(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchoff, matchlen;
+ unsigned int mediaoff, medialen;
+ unsigned int sdpoff;
+ unsigned int caddr_len, maddr_len;
+ unsigned int i;
+ union nf_inet_addr caddr, maddr, rtp_addr;
+ const struct nf_nat_sip_hooks *hooks;
+ unsigned int port;
+ const struct sdp_media_type *t;
+ int ret = NF_ACCEPT;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+
+ /* Find beginning of session description */
+ if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+ SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+ &matchoff, &matchlen) <= 0)
+ return NF_ACCEPT;
+ sdpoff = matchoff;
+
+ /* The connection information is contained in the session description
+ * and/or once per media description. The first media description marks
+ * the end of the session description. */
+ caddr_len = 0;
+ if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ &matchoff, &matchlen, &caddr) > 0)
+ caddr_len = matchlen;
+
+ /* i only counts media descriptions for which expectations were set
+ * up; skipped ones advance mediaoff but not i. */
+ mediaoff = sdpoff;
+ for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) {
+ if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen,
+ SDP_HDR_MEDIA, SDP_HDR_UNSPEC,
+ &mediaoff, &medialen) <= 0)
+ break;
+
+ /* Get media type and port number. A media port value of zero
+ * indicates an inactive stream. */
+ t = sdp_media_type(*dptr, mediaoff, medialen);
+ if (!t) {
+ mediaoff += medialen;
+ continue;
+ }
+ mediaoff += t->len;
+ medialen -= t->len;
+
+ port = simple_strtoul(*dptr + mediaoff, NULL, 10);
+ if (port == 0)
+ continue;
+ if (port < 1024 || port > 65535) {
+ nf_ct_helper_log(skb, ct, "wrong port %u", port);
+ return NF_DROP;
+ }
+
+ /* The media description overrides the session description. */
+ maddr_len = 0;
+ if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ &matchoff, &matchlen, &maddr) > 0) {
+ maddr_len = matchlen;
+ memcpy(&rtp_addr, &maddr, sizeof(rtp_addr));
+ } else if (caddr_len)
+ memcpy(&rtp_addr, &caddr, sizeof(rtp_addr));
+ else {
+ nf_ct_helper_log(skb, ct, "cannot parse SDP message");
+ return NF_DROP;
+ }
+
+ ret = set_expected_rtp_rtcp(skb, protoff, dataoff,
+ dptr, datalen,
+ &rtp_addr, htons(port), t->class,
+ mediaoff, medialen);
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct,
+ "cannot add expectation for voice");
+ return ret;
+ }
+
+ /* Update media connection address if present */
+ if (maddr_len && hooks && ct->status & IPS_NAT_MASK) {
+ ret = hooks->sdp_addr(skb, protoff, dataoff,
+ dptr, datalen, mediaoff,
+ SDP_HDR_CONNECTION,
+ SDP_HDR_MEDIA,
+ &rtp_addr);
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SDP");
+ return ret;
+ }
+ }
+ i++;
+ }
+
+ /* Update session connection and owner addresses */
+ /* NOTE(review): rtp_addr is only assigned inside the loop above, so it
+ * looks unset here when no media description was processed - confirm
+ * that the sdp_session hook copes with that. */
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK)
+ ret = hooks->sdp_session(skb, protoff, dataoff,
+ dptr, datalen, sdpoff,
+ &rtp_addr);
+
+ return ret;
+}
+/*
+ * Handle a response to INVITE: 1xx and 2xx responses have their SDP
+ * processed; any other response whose CSeq matches the stored INVITE CSeq
+ * causes the media expectations to be flushed.
+ */
+static int process_invite_response(struct sk_buff *skb, unsigned int protoff,
+				   unsigned int dataoff,
+				   const char **dptr, unsigned int *datalen,
+				   unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+
+	/* provisional (1xx) or success (2xx) */
+	if (code >= 100 && code <= 299)
+		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+
+	if (ct_sip_info->invite_cseq == cseq)
+		flush_expectations(ct, true);
+
+	return NF_ACCEPT;
+}
+
+/*
+ * Handle a response to UPDATE: 1xx and 2xx responses have their SDP
+ * processed; any other response whose CSeq matches the stored INVITE CSeq
+ * causes the media expectations to be flushed.
+ */
+static int process_update_response(struct sk_buff *skb, unsigned int protoff,
+				   unsigned int dataoff,
+				   const char **dptr, unsigned int *datalen,
+				   unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+
+	/* provisional (1xx) or success (2xx) */
+	if (code >= 100 && code <= 299)
+		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+
+	if (ct_sip_info->invite_cseq == cseq)
+		flush_expectations(ct, true);
+
+	return NF_ACCEPT;
+}
+
+/*
+ * Handle a response to PRACK: 1xx and 2xx responses have their SDP
+ * processed; any other response whose CSeq matches the stored INVITE CSeq
+ * causes the media expectations to be flushed.
+ */
+static int process_prack_response(struct sk_buff *skb, unsigned int protoff,
+				  unsigned int dataoff,
+				  const char **dptr, unsigned int *datalen,
+				  unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+
+	/* provisional (1xx) or success (2xx) */
+	if (code >= 100 && code <= 299)
+		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+
+	if (ct_sip_info->invite_cseq == cseq)
+		flush_expectations(ct, true);
+
+	return NF_ACCEPT;
+}
+
+/*
+ * Handle an INVITE request: drop the media expectations of this
+ * connection, process the SDP of the new request and, if that was accepted,
+ * remember the request's CSeq for matching responses.
+ */
+static int process_invite_request(struct sk_buff *skb, unsigned int protoff,
+				  unsigned int dataoff,
+				  const char **dptr, unsigned int *datalen,
+				  unsigned int cseq)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+	unsigned int verdict;
+
+	flush_expectations(ct, true);
+
+	verdict = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+	if (verdict != NF_ACCEPT)
+		return verdict;
+
+	ct_sip_info->invite_cseq = cseq;
+	return verdict;
+}
+
+/* Handle a BYE request: flush the media expectations of this connection.
+ * Always returns NF_ACCEPT. */
+static int process_bye_request(struct sk_buff *skb, unsigned int protoff,
+			       unsigned int dataoff,
+			       const char **dptr, unsigned int *datalen,
+			       unsigned int cseq)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *conn;
+
+	conn = nf_ct_get(skb, &ctinfo);
+	flush_expectations(conn, true);
+
+	return NF_ACCEPT;
+}
+
+/* Parse a REGISTER request and create a permanent expectation for incoming
+ * signalling connections. The expectation is marked inactive and is activated
+ * when receiving a response indicating success from the registrar.
+ *
+ * Returns NF_ACCEPT when the request needs no expectation or one was set up
+ * (the request's CSeq is then stored for matching the response), NF_DROP when
+ * the Contact header cannot be parsed or the expectation cannot be allocated
+ * or registered, and otherwise the verdict of the NAT expect hook.
+ */
+static int process_register_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned int matchoff, matchlen;
+ struct nf_conntrack_expect *exp;
+ union nf_inet_addr *saddr, daddr;
+ const struct nf_nat_sip_hooks *hooks;
+ __be16 port;
+ u8 proto;
+ unsigned int expires = 0;
+ int ret;
+
+ /* Expected connections can not register again. */
+ if (ct->status & IPS_EXPECTED)
+ return NF_ACCEPT;
+
+ /* We must check the expiration time: a value of zero signals the
+ * registrar to release the binding. We'll remove our expectation
+ * when receiving the new bindings in the response, but we don't
+ * want to create new ones.
+ *
+ * The expiration time may be contained in Expires: header, the
+ * Contact: header parameters or the URI parameters.
+ */
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES,
+ &matchoff, &matchlen) > 0)
+ expires = simple_strtoul(*dptr + matchoff, NULL, 10);
+
+ ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ SIP_HDR_CONTACT, NULL,
+ &matchoff, &matchlen, &daddr, &port);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse contact");
+ return NF_DROP;
+ } else if (ret == 0)
+ return NF_ACCEPT;
+
+ /* We don't support third-party registrations */
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &daddr))
+ return NF_ACCEPT;
+
+ if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, *datalen,
+ &proto) == 0)
+ return NF_ACCEPT;
+
+ /* An "expires=" parameter after the contact address overrides the
+ * value taken from the Expires: header above. */
+ if (ct_sip_parse_numerical_param(ct, *dptr,
+ matchoff + matchlen, *datalen,
+ "expires=", NULL, NULL, &expires) < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse expires");
+ return NF_DROP;
+ }
+
+ if (expires == 0) {
+ ret = NF_ACCEPT;
+ goto store_cseq;
+ }
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ return NF_DROP;
+ }
+
+ /* With sip_direct_signalling the expectation only matches
+ * connections coming from the peer of this connection. */
+ saddr = NULL;
+ if (sip_direct_signalling)
+ saddr = &ct->tuplehash[!dir].tuple.src.u3;
+
+ nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
+ saddr, &daddr, proto, NULL, &port);
+ exp->timeout.expires = sip_timeout * HZ;
+ exp->helper = nfct_help(ct)->helper;
+ exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK)
+ ret = hooks->expect(skb, protoff, dataoff, dptr, datalen,
+ exp, matchoff, matchlen);
+ else {
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ } else
+ ret = NF_ACCEPT;
+ }
+ /* Drop the reference taken by nf_ct_expect_alloc() */
+ nf_ct_expect_put(exp);
+
+store_cseq:
+ if (ret == NF_ACCEPT)
+ ct_sip_info->register_cseq = cseq;
+ return ret;
+}
+
+/* Handle a response to REGISTER.
+ *
+ * Responses whose CSeq differs from the stored REGISTER CSeq, and 1xx
+ * responses, are accepted without further action. For a 2xx response the
+ * Contact headers are walked and the signalling expectation matching the
+ * first suitable contact with a non-zero expiry is refreshed. In all other
+ * cases (other status codes, no suitable contact, zero expiry) a signalling
+ * expectation of this connection is flushed.
+ *
+ * Returns NF_ACCEPT, or NF_DROP when a Contact header or its expires
+ * parameter cannot be parsed.
+ */
+static int process_register_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq, unsigned int code)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ union nf_inet_addr addr;
+ __be16 port;
+ u8 proto;
+ unsigned int matchoff, matchlen, coff = 0;
+ unsigned int expires = 0;
+ int in_contact = 0, ret;
+
+ /* According to RFC 3261, "UAs MUST NOT send a new registration until
+ * they have received a final response from the registrar for the
+ * previous one or the previous REGISTER request has timed out".
+ *
+ * However, some servers fail to detect retransmissions and send late
+ * responses, so we store the sequence number of the last valid
+ * request and compare it here.
+ */
+ if (ct_sip_info->register_cseq != cseq)
+ return NF_ACCEPT;
+
+ if (code >= 100 && code <= 199)
+ return NF_ACCEPT;
+ if (code < 200 || code > 299)
+ goto flush;
+
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES,
+ &matchoff, &matchlen) > 0)
+ expires = simple_strtoul(*dptr + matchoff, NULL, 10);
+
+ /* coff and in_contact carry the parse position from one iteration to
+ * the next, so each pass looks at the next contact. */
+ while (1) {
+ /* Default to the Expires: header value for every contact */
+ unsigned int c_expires = expires;
+
+ ret = ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+ SIP_HDR_CONTACT, &in_contact,
+ &matchoff, &matchlen,
+ &addr, &port);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse contact");
+ return NF_DROP;
+ } else if (ret == 0)
+ break;
+
+ /* We don't support third-party registrations */
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &addr))
+ continue;
+
+ if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen,
+ *datalen, &proto) == 0)
+ continue;
+
+ ret = ct_sip_parse_numerical_param(ct, *dptr,
+ matchoff + matchlen,
+ *datalen, "expires=",
+ NULL, NULL, &c_expires);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse expires");
+ return NF_DROP;
+ }
+ if (c_expires == 0)
+ break;
+ if (refresh_signalling_expectation(ct, &addr, proto, port,
+ c_expires))
+ return NF_ACCEPT;
+ }
+
+flush:
+ flush_expectations(ct, false);
+ return NF_ACCEPT;
+}
+
+/* Per-method handlers. Each SIP_HANDLER() entry presumably names the method
+ * followed by its request and response handlers (the macro is defined outside
+ * this chunk); NULL means requests or responses of that method are not
+ * processed. The table is scanned in order and the method is compared
+ * case-insensitively over the length of the handler's name. */
+static const struct sip_handler sip_handlers[] = {
+ SIP_HANDLER("INVITE", process_invite_request, process_invite_response),
+ SIP_HANDLER("UPDATE", process_sdp, process_update_response),
+ SIP_HANDLER("ACK", process_sdp, NULL),
+ SIP_HANDLER("PRACK", process_sdp, process_prack_response),
+ SIP_HANDLER("BYE", process_bye_request, NULL),
+ SIP_HANDLER("REGISTER", process_register_request, process_register_response),
+};
+
+/*
+ * Process a SIP response: extract the status code and the CSeq header and
+ * hand the message to the response handler registered for the method named
+ * after the CSeq number.
+ *
+ * Returns NF_ACCEPT for messages too short to hold a status line and for
+ * methods without a response handler, NF_DROP when the status code or CSeq
+ * cannot be obtained, and the handler's verdict otherwise.
+ */
+static int process_sip_response(struct sk_buff *skb, unsigned int protoff,
+				unsigned int dataoff,
+				const char **dptr, unsigned int *datalen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int matchoff, matchlen, method_off;
+	unsigned int code, cseq, i;
+
+	if (*datalen < strlen("SIP/2.0 200"))
+		return NF_ACCEPT;
+
+	code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10);
+	if (code == 0) {
+		nf_ct_helper_log(skb, ct, "cannot get code");
+		return NF_DROP;
+	}
+
+	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
+			      &matchoff, &matchlen) <= 0) {
+		nf_ct_helper_log(skb, ct, "cannot parse cseq");
+		return NF_DROP;
+	}
+
+	cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
+	if (cseq == 0) {
+		nf_ct_helper_log(skb, ct, "cannot get cseq");
+		return NF_DROP;
+	}
+
+	/* The method name is expected one byte past the CSeq number */
+	method_off = matchoff + matchlen + 1;
+
+	for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
+		const struct sip_handler *handler = &sip_handlers[i];
+
+		if (handler->response != NULL &&
+		    *datalen >= method_off + handler->len &&
+		    strnicmp(*dptr + method_off, handler->method,
+			     handler->len) == 0)
+			return handler->response(skb, protoff, dataoff, dptr,
+						 datalen, cseq, code);
+	}
+	return NF_ACCEPT;
+}
+/*
+ * Handle a SIP request (any message that does not start with "SIP/2.0 ").
+ *
+ * First inspects the Via header: when its address equals the source address
+ * of this direction's tuple but its port differs from the tuple's source
+ * port, the Via port is stored in the helper data as forced_dport.
+ *
+ * Then compares the start of the message with each method name in
+ * sip_handlers[] and, for the first match that has a request handler,
+ * parses the CSeq number and calls the handler.
+ *
+ * Returns NF_ACCEPT when no handler matches, NF_DROP when a handler
+ * matches but the CSeq header is missing or its value parses to zero,
+ * otherwise the verdict of the handler.
+ */
+static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned int matchoff, matchlen;
+ unsigned int cseq, i;
+ union nf_inet_addr addr;
+ __be16 port;
+
+ /* Many Cisco IP phones use a high source port for SIP requests, but
+ * listen for the response on port 5060. If we are the local
+ * router for one of these phones, save the port number from the
+ * Via: header so that nf_nat_sip can redirect the responses to
+ * the correct port.
+ */
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ SIP_HDR_VIA_UDP, NULL, &matchoff,
+ &matchlen, &addr, &port) > 0 &&
+ port != ct->tuplehash[dir].tuple.src.u.udp.port &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3))
+ ct_sip_info->forced_dport = port;
+
+ for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
+ const struct sip_handler *handler;
+
+ handler = &sip_handlers[i];
+ if (handler->request == NULL)
+ continue;
+ if (*datalen < handler->len || /* message too short to hold this method name */
+ strnicmp(*dptr, handler->method, handler->len))
+ continue;
+
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
+ &matchoff, &matchlen) <= 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse cseq");
+ return NF_DROP;
+ }
+ cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
+ if (!cseq) {
+ nf_ct_helper_log(skb, ct, "cannot get cseq");
+ return NF_DROP;
+ }
+
+ return handler->request(skb, protoff, dataoff, dptr, datalen,
+ cseq);
+ }
+ return NF_ACCEPT;
+}
+/*
+ * Process one complete SIP message located at *dptr with length *datalen.
+ *
+ * A message whose first bytes compare equal (ignoring case) to "SIP/2.0 "
+ * is treated as a response, anything else as a request. When the message
+ * was accepted and the conntrack has any IPS_NAT_MASK status bit set, the
+ * msg() callback of the registered nf_nat_sip_hooks is called as well; a
+ * zero return from it turns the verdict into NF_DROP.
+ *
+ * Returns the resulting netfilter verdict.
+ */
+static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,
+ unsigned int protoff, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ const struct nf_nat_sip_hooks *hooks;
+ int ret;
+
+ if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0)
+ ret = process_sip_request(skb, protoff, dataoff, dptr, datalen);
+ else
+ ret = process_sip_response(skb, protoff, dataoff, dptr, datalen);
+
+ if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && !hooks->msg(skb, protoff, dataoff, /* hooks may be NULL when no NAT hooks are registered */
+ dptr, datalen)) {
+ nf_ct_helper_log(skb, ct, "cannot NAT SIP message");
+ ret = NF_DROP;
+ }
+ }
+
+ return ret;
+}
+/*
+ * Conntrack helper callback for SIP over TCP.
+ *
+ * Only packets in the IP_CT_ESTABLISHED / IP_CT_ESTABLISHED_REPLY states
+ * are examined. The conntrack timeout is refreshed to sip_timeout * HZ and
+ * the skb is linearized so the payload can be scanned as one buffer.
+ *
+ * One segment may carry several SIP messages, so the payload is walked in
+ * a loop: the Content-Length header gives the body size, the end of the
+ * headers is found by searching for "\r\n\r\n", and each framed message is
+ * passed to process_sip_msg(). msglen is passed by pointer, so the callee
+ * may change it; the accumulated change (tdiff) is handed to the
+ * seq_adjust() NAT hook after the loop.
+ *
+ * Returns NF_ACCEPT for packets that are not examined or cannot be framed,
+ * NF_DROP when linearizing fails, otherwise the verdict of the last
+ * processed message.
+ *
+ * NOTE(review): the "msglen > datalen" case returns directly and therefore
+ * skips the seq_adjust() call even if earlier messages in the same segment
+ * changed size - confirm that this is intended.
+ */
+static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ struct tcphdr *th, _tcph;
+ unsigned int dataoff, datalen;
+ unsigned int matchoff, matchlen, clen;
+ unsigned int msglen, origlen;
+ const char *dptr, *end;
+ s16 diff, tdiff = 0;
+ int ret = NF_ACCEPT;
+ bool term;
+
+ if (ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* No Data ? */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+ dataoff = protoff + th->doff * 4; /* doff counts 32-bit words */
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ nf_ct_refresh(ct, skb, sip_timeout * HZ);
+
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+ if (datalen < strlen("SIP/2.0 200"))
+ return NF_ACCEPT;
+
+ while (1) {
+ if (ct_sip_get_header(ct, dptr, 0, datalen,
+ SIP_HDR_CONTENT_LENGTH,
+ &matchoff, &matchlen) <= 0)
+ break;
+
+ clen = simple_strtoul(dptr + matchoff, (char **)&end, 10);
+ if (dptr + matchoff == end) /* no digits were consumed */
+ break;
+
+ term = false;
+ for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) { /* search for the blank line that ends the headers */
+ if (end[0] == '\r' && end[1] == '\n' &&
+ end[2] == '\r' && end[3] == '\n') {
+ term = true;
+ break;
+ }
+ }
+ if (!term)
+ break;
+ end += strlen("\r\n\r\n") + clen; /* one past the message: header terminator plus clen body bytes */
+
+ msglen = origlen = end - dptr;
+ if (msglen > datalen) /* message extends beyond the data in this packet */
+ return NF_ACCEPT;
+
+ ret = process_sip_msg(skb, ct, protoff, dataoff,
+ &dptr, &msglen);
+ /* process_sip_* functions report why this packet is dropped */
+ if (ret != NF_ACCEPT)
+ break;
+ diff = msglen - origlen; /* size change made while processing this message */
+ tdiff += diff;
+
+ dataoff += msglen;
+ dptr += msglen;
+ datalen = datalen + diff - msglen; /* remaining bytes, accounting for the size change */
+ }
+
+ if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+ const struct nf_nat_sip_hooks *hooks;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks)
+ hooks->seq_adjust(skb, protoff, tdiff);
+ }
+
+ return ret;
+}
+/*
+ * Conntrack helper callback for SIP over UDP.
+ *
+ * Treats the whole UDP payload as a single SIP message: refreshes the
+ * conntrack timeout to sip_timeout * HZ, linearizes the skb and passes the
+ * payload to process_sip_msg().
+ *
+ * Returns NF_ACCEPT when there is no payload or it is shorter than
+ * "SIP/2.0 200", NF_DROP when linearizing fails, otherwise the verdict of
+ * process_sip_msg().
+ */
+static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ const char *dptr;
+
+ /* No Data ? */
+ dataoff = protoff + sizeof(struct udphdr);
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ nf_ct_refresh(ct, skb, sip_timeout * HZ);
+
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+ if (datalen < strlen("SIP/2.0 200"))
+ return NF_ACCEPT;
+
+ return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen);
+}
+/* Helpers per configured port; nf_conntrack_sip_init() fills the four slots as UDP/IPv4, TCP/IPv4, UDP/IPv6, TCP/IPv6. */
+static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
+/*
+ * Expectation policy for each SIP expectation class, indexed by the
+ * SIP_EXPECT_* constants; installed on every helper by
+ * nf_conntrack_sip_init(). The timeout values are presumably in seconds
+ * (3 * 60 = three minutes) - confirm against struct
+ * nf_conntrack_expect_policy.
+ */
+static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
+ [SIP_EXPECT_SIGNALLING] = {
+ .name = "signalling",
+ .max_expected = 1,
+ .timeout = 3 * 60,
+ },
+ [SIP_EXPECT_AUDIO] = {
+ .name = "audio",
+ .max_expected = 2 * IP_CT_DIR_MAX,
+ .timeout = 3 * 60,
+ },
+ [SIP_EXPECT_VIDEO] = {
+ .name = "video",
+ .max_expected = 2 * IP_CT_DIR_MAX,
+ .timeout = 3 * 60,
+ },
+ [SIP_EXPECT_IMAGE] = {
+ .name = "image",
+ .max_expected = IP_CT_DIR_MAX,
+ .timeout = 3 * 60,
+ },
+};
+/*
+ * Unregister every SIP helper whose .me field has been set, for all
+ * configured ports. Used as the module exit handler and also called from
+ * the error path of nf_conntrack_sip_init().
+ */
+static void nf_conntrack_sip_fini(void)
+{
+ int i, j;
+
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+ if (sip[i][j].me == NULL) /* slot was never filled in by init */
+ continue;
+ nf_conntrack_helper_unregister(&sip[i][j]);
+ }
+ }
+}
+/*
+ * Module init: register the SIP conntrack helpers.
+ *
+ * Falls back to the single port SIP_PORT when no ports were configured.
+ * For every port four helpers are set up (UDP and TCP, each for AF_INET
+ * and AF_INET6) and registered one by one.
+ *
+ * Returns 0 on success. When a registration fails, the error is logged,
+ * nf_conntrack_sip_fini() is called to undo the earlier registrations and
+ * the error code is returned.
+ *
+ * NOTE(review): .me is assigned before nf_conntrack_helper_register() is
+ * called, so on failure nf_conntrack_sip_fini() will also call
+ * nf_conntrack_helper_unregister() on the helper that was never
+ * registered - confirm that this is safe.
+ */
+static int __init nf_conntrack_sip_init(void)
+{
+ int i, j, ret;
+
+ if (ports_c == 0)
+ ports[ports_c++] = SIP_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ memset(&sip[i], 0, sizeof(sip[i]));
+
+ sip[i][0].tuple.src.l3num = AF_INET;
+ sip[i][0].tuple.dst.protonum = IPPROTO_UDP;
+ sip[i][0].help = sip_help_udp;
+ sip[i][1].tuple.src.l3num = AF_INET;
+ sip[i][1].tuple.dst.protonum = IPPROTO_TCP;
+ sip[i][1].help = sip_help_tcp;
+
+ sip[i][2].tuple.src.l3num = AF_INET6;
+ sip[i][2].tuple.dst.protonum = IPPROTO_UDP;
+ sip[i][2].help = sip_help_udp;
+ sip[i][3].tuple.src.l3num = AF_INET6;
+ sip[i][3].tuple.dst.protonum = IPPROTO_TCP;
+ sip[i][3].help = sip_help_tcp;
+
+ for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { /* fields common to all four helpers of this port */
+ sip[i][j].data_len = sizeof(struct nf_ct_sip_master);
+ sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
+ sip[i][j].expect_policy = sip_exp_policy;
+ sip[i][j].expect_class_max = SIP_EXPECT_MAX;
+ sip[i][j].me = THIS_MODULE;
+
+ if (ports[i] == SIP_PORT)
+ sprintf(sip[i][j].name, "sip");
+ else
+ sprintf(sip[i][j].name, "sip-%u", i); /* suffix is the index into ports[], not the port number */
+
+ pr_debug("port #%u: %u\n", i, ports[i]);
+
+ ret = nf_conntrack_helper_register(&sip[i][j]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_sip: failed to register"
+ " helper for pf: %u port: %u\n",
+ sip[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_sip_fini();
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+module_init(nf_conntrack_sip_init);
+module_exit(nf_conntrack_sip_fini);
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 00000000000..87b95a2c270
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,78 @@
+/*
+ * SNMP service broadcast connection tracking helper
+ *
+ * (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+
+#define SNMP_PORT 161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+/*
+ * Optional NAT callback, exported so that another module can install it.
+ * snmp_conntrack_help() reads it with rcu_dereference() and calls it only
+ * when it is non-NULL and the conntrack has NAT status bits set.
+ */
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+/*
+ * Conntrack helper callback for SNMP.
+ *
+ * Always runs nf_conntrack_broadcast_help() with the module's timeout
+ * (its return value is not used here), then hands the packet to the NAT
+ * hook when one is installed and the conntrack has any IPS_NAT_MASK bit
+ * set.
+ *
+ * Returns the verdict of the NAT hook when it is called, otherwise
+ * NF_ACCEPT.
+ */
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ typeof(nf_nat_snmp_hook) nf_nat_snmp;
+
+ nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+ nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
+ if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
+ return nf_nat_snmp(skb, protoff, ct, ctinfo);
+
+ return NF_ACCEPT;
+}
+/* Expectation policy of the SNMP helper; .timeout is filled in from the module parameter by nf_conntrack_snmp_init(). */
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+/* The SNMP helper definition: matches IPv4 UDP with port SNMP_PORT in the tuple's src port field and uses snmp_conntrack_help(). */
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "snmp",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = snmp_conntrack_help,
+ .expect_policy = &exp_policy,
+};
+/* Module init: copy the timeout module parameter into the expectation policy, then register the helper; returns the registration result. */
+static int __init nf_conntrack_snmp_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+/* Module exit: unregister the SNMP helper. */
+static void __exit nf_conntrack_snmp_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 617599aeeea..f641751dba9 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,127 +1,94 @@
-/* This file contains all the functions required for the standalone
- nf_conntrack module.
-
- These are not required by the compatibility layer.
-*/
-
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
- * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- * - generalize L3 protocol dependent part.
- *
- * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/netfilter.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/percpu.h>
#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_protocol.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <linux/rculist_nulls.h>
MODULE_LICENSE("GPL");
-extern atomic_t nf_conntrack_count;
-DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
-
-static int kill_l3proto(struct nf_conn *i, void *data)
-{
- return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
- ((struct nf_conntrack_l3proto *)data)->l3proto);
-}
-
-static int kill_proto(struct nf_conn *i, void *data)
-{
- struct nf_conntrack_protocol *proto;
- proto = (struct nf_conntrack_protocol *)data;
- return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
- proto->proto) &&
- (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
- proto->l3proto);
-}
-
-#ifdef CONFIG_PROC_FS
-static int
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+int
print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_l3proto *l3proto,
- struct nf_conntrack_protocol *proto)
-{
- return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple);
-}
-
-#ifdef CONFIG_NF_CT_ACCT
-static unsigned int
-seq_print_counters(struct seq_file *s,
- const struct ip_conntrack_counter *counter)
+ const struct nf_conntrack_l3proto *l3proto,
+ const struct nf_conntrack_l4proto *l4proto)
{
- return seq_printf(s, "packets=%llu bytes=%llu ",
- (unsigned long long)counter->packets,
- (unsigned long long)counter->bytes);
+ return l3proto->print_tuple(s, tuple) || l4proto->print_tuple(s, tuple);
}
-#else
-#define seq_print_counters(x, y) 0
-#endif
+EXPORT_SYMBOL_GPL(print_tuple);
struct ct_iter_state {
+ struct seq_net_private p;
unsigned int bucket;
+ u_int64_t time_now;
};
-static struct list_head *ct_get_first(struct seq_file *seq)
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
+ struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
+ struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < nf_conntrack_htable_size;
+ st->bucket < net->ct.htable_size;
st->bucket++) {
- if (!list_empty(&nf_conntrack_hash[st->bucket]))
- return nf_conntrack_hash[st->bucket].next;
+ n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ if (!is_a_nulls(n))
+ return n;
}
return NULL;
}
-static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+ struct hlist_nulls_node *head)
{
+ struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = head->next;
- while (head == &nf_conntrack_hash[st->bucket]) {
- if (++st->bucket >= nf_conntrack_htable_size)
- return NULL;
- head = nf_conntrack_hash[st->bucket].next;
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
+ while (is_a_nulls(head)) {
+ if (likely(get_nulls_value(head) == st->bucket)) {
+ if (++st->bucket >= net->ct.htable_size)
+ return NULL;
+ }
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(
+ &net->ct.hash[st->bucket]));
}
return head;
}
-static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
{
- struct list_head *head = ct_get_first(seq);
+ struct hlist_nulls_node *head = ct_get_first(seq);
if (head)
while (pos && (head = ct_get_next(seq, head)))
@@ -130,8 +97,12 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
}
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
- read_lock_bh(&nf_conntrack_lock);
+ struct ct_iter_state *st = seq->private;
+
+ st->time_now = ktime_to_ns(ktime_get_real());
+ rcu_read_lock();
return ct_get_idx(seq, *pos);
}
@@ -142,84 +113,143 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void ct_seq_stop(struct seq_file *s, void *v)
+ __releases(RCU)
{
- read_unlock_bh(&nf_conntrack_lock);
+ rcu_read_unlock();
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{ /* Print "secctx=<context> " for ct->secmark; returns the seq_printf() result, or 0 when nothing is printed. */
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return 0; /* no context string obtained: print nothing and do not report an error */
+
+ ret = seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len); /* hand the string back to the security layer */
+ return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0; /* CONFIG_NF_CONNTRACK_SECMARK is off: nothing to print */
+}
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{ /* Print "delta-time=<seconds> " when the conntrack has a timestamp extension; returns the seq_printf() result, or 0 when nothing is printed. */
+ struct ct_iter_state *st = s->private;
+ struct nf_conn_tstamp *tstamp;
+ s64 delta_time;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ delta_time = st->time_now - tstamp->start; /* time_now is sampled in ct_seq_start() */
+ if (delta_time > 0)
+ delta_time = div_s64(delta_time, NSEC_PER_SEC); /* nanoseconds to whole seconds */
+ else
+ delta_time = 0; /* clamp a zero or negative difference to zero */
+
+ return seq_printf(s, "delta-time=%llu ",
+ (unsigned long long)delta_time);
+ }
+ return 0;
+}
+#else
+static inline int
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0; /* CONFIG_NF_CONNTRACK_TIMESTAMP is off: nothing to print */
+}
+#endif
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
- const struct nf_conntrack_tuple_hash *hash = v;
- const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash);
- struct nf_conntrack_l3proto *l3proto;
- struct nf_conntrack_protocol *proto;
+ struct nf_conntrack_tuple_hash *hash = v;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
- ASSERT_READ_LOCK(&nf_conntrack_lock);
- NF_CT_ASSERT(conntrack);
+ NF_CT_ASSERT(ct);
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
/* we only want to print DIR_ORIGINAL */
if (NF_CT_DIRECTION(hash))
- return 0;
-
- l3proto = __nf_ct_l3proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.src.l3num);
+ goto release;
+ l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
NF_CT_ASSERT(l3proto);
- proto = __nf_ct_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.src.l3num,
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.protonum);
- NF_CT_ASSERT(proto);
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ NF_CT_ASSERT(l4proto);
+ ret = -ENOSPC;
if (seq_printf(s, "%-8s %u %-8s %u %ld ",
- l3proto->name,
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num,
- proto->name,
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
- timer_pending(&conntrack->timeout)
- ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0)
- return -ENOSPC;
+ l3proto->name, nf_ct_l3num(ct),
+ l4proto->name, nf_ct_protonum(ct),
+ timer_pending(&ct->timeout)
+ ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
+ goto release;
- if (l3proto->print_conntrack(s, conntrack))
- return -ENOSPC;
+ if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
+ goto release;
- if (proto->print_conntrack(s, conntrack))
- return -ENOSPC;
+ if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ l3proto, l4proto))
+ goto release;
- if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- l3proto, proto))
- return -ENOSPC;
+ if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
+ goto release;
- if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
- return -ENOSPC;
-
- if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
+ if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
if (seq_printf(s, "[UNREPLIED] "))
- return -ENOSPC;
+ goto release;
- if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
- l3proto, proto))
- return -ENOSPC;
+ if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ l3proto, l4proto))
+ goto release;
- if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
- return -ENOSPC;
+ if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
+ goto release;
- if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
+ if (test_bit(IPS_ASSURED_BIT, &ct->status))
if (seq_printf(s, "[ASSURED] "))
- return -ENOSPC;
+ goto release;
#if defined(CONFIG_NF_CONNTRACK_MARK)
- if (seq_printf(s, "mark=%u ", conntrack->mark))
- return -ENOSPC;
+ if (seq_printf(s, "mark=%u ", ct->mark))
+ goto release;
#endif
- if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
- return -ENOSPC;
-
- return 0;
+ if (ct_show_secctx(s, ct))
+ goto release;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
+ goto release;
+#endif
+
+ if (ct_show_delta_time(s, ct))
+ goto release;
+
+ if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
+ goto release;
+
+ ret = 0;
+release:
+ nf_ct_put(ct);
+ return ret;
}
-static struct seq_operations ct_seq_ops = {
+static const struct seq_operations ct_seq_ops = {
.start = ct_seq_start,
.next = ct_seq_next,
.stop = ct_seq_stop,
@@ -228,123 +258,31 @@ static struct seq_operations ct_seq_ops = {
static int ct_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- struct ct_iter_state *st;
- int ret;
-
- st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
- if (st == NULL)
- return -ENOMEM;
- ret = seq_open(file, &ct_seq_ops);
- if (ret)
- goto out_free;
- seq = file->private_data;
- seq->private = st;
- memset(st, 0, sizeof(struct ct_iter_state));
- return ret;
-out_free:
- kfree(st);
- return ret;
+ return seq_open_net(inode, file, &ct_seq_ops,
+ sizeof(struct ct_iter_state));
}
-static struct file_operations ct_file_ops = {
+static const struct file_operations ct_file_ops = {
.owner = THIS_MODULE,
.open = ct_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-/* expects */
-static void *exp_seq_start(struct seq_file *s, loff_t *pos)
-{
- struct list_head *e = &nf_conntrack_expect_list;
- loff_t i;
-
- /* strange seq_file api calls stop even if we fail,
- * thus we need to grab lock since stop unlocks */
- read_lock_bh(&nf_conntrack_lock);
-
- if (list_empty(e))
- return NULL;
-
- for (i = 0; i <= *pos; i++) {
- e = e->next;
- if (e == &nf_conntrack_expect_list)
- return NULL;
- }
- return e;
-}
-
-static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct list_head *e = v;
-
- ++*pos;
- e = e->next;
-
- if (e == &nf_conntrack_expect_list)
- return NULL;
-
- return e;
-}
-
-static void exp_seq_stop(struct seq_file *s, void *v)
-{
- read_unlock_bh(&nf_conntrack_lock);
-}
-
-static int exp_seq_show(struct seq_file *s, void *v)
-{
- struct nf_conntrack_expect *expect = v;
-
- if (expect->timeout.function)
- seq_printf(s, "%ld ", timer_pending(&expect->timeout)
- ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
- else
- seq_printf(s, "- ");
- seq_printf(s, "l3proto = %u proto=%u ",
- expect->tuple.src.l3num,
- expect->tuple.dst.protonum);
- print_tuple(s, &expect->tuple,
- __nf_ct_l3proto_find(expect->tuple.src.l3num),
- __nf_ct_proto_find(expect->tuple.src.l3num,
- expect->tuple.dst.protonum));
- return seq_putc(s, '\n');
-}
-
-static struct seq_operations exp_seq_ops = {
- .start = exp_seq_start,
- .next = exp_seq_next,
- .stop = exp_seq_stop,
- .show = exp_seq_show
-};
-
-static int exp_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &exp_seq_ops);
-}
-
-static struct file_operations exp_file_ops = {
- .owner = THIS_MODULE,
- .open = exp_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
+ .release = seq_release_net,
};
static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
int cpu;
if (*pos == 0)
return SEQ_START_TOKEN;
- for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu + 1;
- return &per_cpu(nf_conntrack_stat, cpu);
+ return per_cpu_ptr(net->ct.stat, cpu);
}
return NULL;
@@ -352,13 +290,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
int cpu;
- for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu + 1;
- return &per_cpu(nf_conntrack_stat, cpu);
+ return per_cpu_ptr(net->ct.stat, cpu);
}
return NULL;
@@ -370,16 +309,17 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
static int ct_cpu_seq_show(struct seq_file *seq, void *v)
{
- unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
- struct ip_conntrack_stat *st = v;
+ struct net *net = seq_file_net(seq);
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ const struct ip_conntrack_stat *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
- "%08x %08x %08x %08x %08x %08x %08x %08x \n",
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
st->searched,
st->found,
@@ -396,12 +336,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
st->expect_new,
st->expect_create,
- st->expect_delete
+ st->expect_delete,
+ st->search_restart
);
return 0;
}
-static struct seq_operations ct_cpu_seq_ops = {
+static const struct seq_operations ct_cpu_seq_ops = {
.start = ct_cpu_seq_start,
.next = ct_cpu_seq_next,
.stop = ct_cpu_seq_stop,
@@ -410,482 +351,266 @@ static struct seq_operations ct_cpu_seq_ops = {
static int ct_cpu_seq_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ct_cpu_seq_ops);
+ return seq_open_net(inode, file, &ct_cpu_seq_ops,
+ sizeof(struct seq_net_private));
}
-static struct file_operations ct_cpu_seq_fops = {
+static const struct file_operations ct_cpu_seq_fops = {
.owner = THIS_MODULE,
.open = ct_cpu_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-#endif /* CONFIG_PROC_FS */
-/* Sysctl support */
+static int nf_conntrack_standalone_init_proc(struct net *net)
+{
+ struct proc_dir_entry *pde;
-#ifdef CONFIG_SYSCTL
+ pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
+ if (!pde)
+ goto out_nf_conntrack;
-/* From nf_conntrack_core.c */
-extern int nf_conntrack_max;
-extern unsigned int nf_conntrack_htable_size;
-
-/* From nf_conntrack_proto_tcp.c */
-extern unsigned int nf_ct_tcp_timeout_syn_sent;
-extern unsigned int nf_ct_tcp_timeout_syn_recv;
-extern unsigned int nf_ct_tcp_timeout_established;
-extern unsigned int nf_ct_tcp_timeout_fin_wait;
-extern unsigned int nf_ct_tcp_timeout_close_wait;
-extern unsigned int nf_ct_tcp_timeout_last_ack;
-extern unsigned int nf_ct_tcp_timeout_time_wait;
-extern unsigned int nf_ct_tcp_timeout_close;
-extern unsigned int nf_ct_tcp_timeout_max_retrans;
-extern int nf_ct_tcp_loose;
-extern int nf_ct_tcp_be_liberal;
-extern int nf_ct_tcp_max_retrans;
-
-/* From nf_conntrack_proto_udp.c */
-extern unsigned int nf_ct_udp_timeout;
-extern unsigned int nf_ct_udp_timeout_stream;
-
-/* From nf_conntrack_proto_generic.c */
-extern unsigned int nf_ct_generic_timeout;
+ pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
+ &ct_cpu_seq_fops);
+ if (!pde)
+ goto out_stat_nf_conntrack;
+ return 0;
+out_stat_nf_conntrack:
+ remove_proc_entry("nf_conntrack", net->proc_net);
+out_nf_conntrack:
+ return -ENOMEM;
+}
+/* Remove both "nf_conntrack" proc entries of this namespace: the one under proc_net_stat, then the one under proc_net. */
+static void nf_conntrack_standalone_fini_proc(struct net *net)
+{
+ remove_proc_entry("nf_conntrack", net->proc_net_stat);
+ remove_proc_entry("nf_conntrack", net->proc_net);
+}
+#else
+static int nf_conntrack_standalone_init_proc(struct net *net)
+{
+ return 0; /* CONFIG_NF_CONNTRACK_PROCFS is off: no proc entries to create */
+}
+
+static void nf_conntrack_standalone_fini_proc(struct net *net)
+{ /* CONFIG_NF_CONNTRACK_PROCFS is off: no proc entries to remove */
+}
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+
+/* Sysctl support */
+
+#ifdef CONFIG_SYSCTL
/* Log invalid packets of a given protocol */
static int log_invalid_proto_min = 0;
static int log_invalid_proto_max = 255;
-static struct ctl_table_header *nf_ct_sysctl_header;
+static struct ctl_table_header *nf_ct_netfilter_header;
-static ctl_table nf_ct_sysctl_table[] = {
+static struct ctl_table nf_ct_sysctl_table[] = {
{
- .ctl_name = NET_NF_CONNTRACK_MAX,
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_NF_CONNTRACK_COUNT,
.procname = "nf_conntrack_count",
- .data = &nf_conntrack_count,
+ .data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_NF_CONNTRACK_BUCKETS,
.procname = "nf_conntrack_buckets",
- .data = &nf_conntrack_htable_size,
+ .data = &init_net.ct.htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
- .procname = "nf_conntrack_tcp_timeout_syn_sent",
- .data = &nf_ct_tcp_timeout_syn_sent,
+ .procname = "nf_conntrack_checksum",
+ .data = &init_net.ct.sysctl_checksum,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
- .procname = "nf_conntrack_tcp_timeout_syn_recv",
- .data = &nf_ct_tcp_timeout_syn_recv,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
- .procname = "nf_conntrack_tcp_timeout_established",
- .data = &nf_ct_tcp_timeout_established,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
- .procname = "nf_conntrack_tcp_timeout_fin_wait",
- .data = &nf_ct_tcp_timeout_fin_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
- .procname = "nf_conntrack_tcp_timeout_close_wait",
- .data = &nf_ct_tcp_timeout_close_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
- .procname = "nf_conntrack_tcp_timeout_last_ack",
- .data = &nf_ct_tcp_timeout_last_ack,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
- .procname = "nf_conntrack_tcp_timeout_time_wait",
- .data = &nf_ct_tcp_timeout_time_wait,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
- .procname = "nf_conntrack_tcp_timeout_close",
- .data = &nf_ct_tcp_timeout_close,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT,
- .procname = "nf_conntrack_udp_timeout",
- .data = &nf_ct_udp_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
- .procname = "nf_conntrack_udp_timeout_stream",
- .data = &nf_ct_udp_timeout_stream,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT,
- .procname = "nf_conntrack_generic_timeout",
- .data = &nf_ct_generic_timeout,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_LOG_INVALID,
.procname = "nf_conntrack_log_invalid",
- .data = &nf_ct_log_invalid,
+ .data = &init_net.ct.sysctl_log_invalid,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &log_invalid_proto_min,
.extra2 = &log_invalid_proto_max,
},
{
- .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
- .procname = "nf_conntrack_tcp_timeout_max_retrans",
- .data = &nf_ct_tcp_timeout_max_retrans,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE,
- .procname = "nf_conntrack_tcp_loose",
- .data = &nf_ct_tcp_loose,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
- .procname = "nf_conntrack_tcp_be_liberal",
- .data = &nf_ct_tcp_be_liberal,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
- .procname = "nf_conntrack_tcp_max_retrans",
- .data = &nf_ct_tcp_max_retrans,
- .maxlen = sizeof(unsigned int),
+ .procname = "nf_conntrack_expect_max",
+ .data = &nf_ct_expect_max,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
-
- { .ctl_name = 0 }
+ { }
};
#define NET_NF_CONNTRACK_MAX 2089
-static ctl_table nf_ct_netfilter_table[] = {
- {
- .ctl_name = NET_NETFILTER,
- .procname = "netfilter",
- .mode = 0555,
- .child = nf_ct_sysctl_table,
- },
+static struct ctl_table nf_ct_netfilter_table[] = {
{
- .ctl_name = NET_NF_CONNTRACK_MAX,
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
- { .ctl_name = 0 }
+ { }
};
-static ctl_table nf_ct_net_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = nf_ct_netfilter_table,
- },
- { .ctl_name = 0 }
-};
-EXPORT_SYMBOL(nf_ct_log_invalid);
-#endif /* CONFIG_SYSCTL */
-
-static int init_or_cleanup(int init)
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
{
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *proc, *proc_exp, *proc_stat;
-#endif
- int ret = 0;
-
- if (!init) goto cleanup;
+ struct ctl_table *table;
- ret = nf_conntrack_init();
- if (ret < 0)
- goto cleanup_nothing;
+ table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out_kmemdup;
-#ifdef CONFIG_PROC_FS
- proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops);
- if (!proc) goto cleanup_init;
+ table[1].data = &net->ct.count;
+ table[2].data = &net->ct.htable_size;
+ table[3].data = &net->ct.sysctl_checksum;
+ table[4].data = &net->ct.sysctl_log_invalid;
- proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440,
- &exp_file_ops);
- if (!proc_exp) goto cleanup_proc;
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
- proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat);
- if (!proc_stat)
- goto cleanup_proc_exp;
+ net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
+ if (!net->ct.sysctl_header)
+ goto out_unregister_netfilter;
- proc_stat->proc_fops = &ct_cpu_seq_fops;
- proc_stat->owner = THIS_MODULE;
-#endif
-#ifdef CONFIG_SYSCTL
- nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
- if (nf_ct_sysctl_header == NULL) {
- printk("nf_conntrack: can't register to sysctl.\n");
- ret = -ENOMEM;
- goto cleanup_proc_stat;
- }
-#endif
-
- return ret;
+ return 0;
- cleanup:
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(nf_ct_sysctl_header);
- cleanup_proc_stat:
-#endif
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nf_conntrack", proc_net_stat);
- cleanup_proc_exp:
- proc_net_remove("nf_conntrack_expect");
- cleanup_proc:
- proc_net_remove("nf_conntrack");
- cleanup_init:
-#endif /* CNFIG_PROC_FS */
- nf_conntrack_cleanup();
- cleanup_nothing:
- return ret;
+out_unregister_netfilter:
+ kfree(table);
+out_kmemdup:
+ return -ENOMEM;
}
-int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
- int ret = 0;
+ struct ctl_table *table;
- write_lock_bh(&nf_conntrack_lock);
- if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) {
- ret = -EBUSY;
- goto out;
- }
- nf_ct_l3protos[proto->l3proto] = proto;
-out:
- write_unlock_bh(&nf_conntrack_lock);
-
- return ret;
+ table = net->ct.sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
+{
+ return 0;
}
-void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
- write_lock_bh(&nf_conntrack_lock);
- nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto;
- write_unlock_bh(&nf_conntrack_lock);
-
- /* Somebody could be still looking at the proto in bh. */
- synchronize_net();
-
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_cleanup(kill_l3proto, proto);
}
+#endif /* CONFIG_SYSCTL */
-/* FIXME: Allow NULL functions and sub in pointers to generic for
- them. --RR */
-int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto)
+static int nf_conntrack_pernet_init(struct net *net)
{
- int ret = 0;
+ int ret;
-retry:
- write_lock_bh(&nf_conntrack_lock);
- if (nf_ct_protos[proto->l3proto]) {
- if (nf_ct_protos[proto->l3proto][proto->proto]
- != &nf_conntrack_generic_protocol) {
- ret = -EBUSY;
- goto out_unlock;
- }
- } else {
- /* l3proto may be loaded latter. */
- struct nf_conntrack_protocol **proto_array;
- int i;
-
- write_unlock_bh(&nf_conntrack_lock);
-
- proto_array = (struct nf_conntrack_protocol **)
- kmalloc(MAX_NF_CT_PROTO *
- sizeof(struct nf_conntrack_protocol *),
- GFP_KERNEL);
- if (proto_array == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MAX_NF_CT_PROTO; i++)
- proto_array[i] = &nf_conntrack_generic_protocol;
-
- write_lock_bh(&nf_conntrack_lock);
- if (nf_ct_protos[proto->l3proto]) {
- /* bad timing, but no problem */
- write_unlock_bh(&nf_conntrack_lock);
- kfree(proto_array);
- } else {
- nf_ct_protos[proto->l3proto] = proto_array;
- write_unlock_bh(&nf_conntrack_lock);
- }
+ ret = nf_conntrack_init_net(net);
+ if (ret < 0)
+ goto out_init;
- /*
- * Just once because array is never freed until unloading
- * nf_conntrack.ko
- */
- goto retry;
- }
+ ret = nf_conntrack_standalone_init_proc(net);
+ if (ret < 0)
+ goto out_proc;
+
+ net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_log_invalid = 0;
+ ret = nf_conntrack_standalone_init_sysctl(net);
+ if (ret < 0)
+ goto out_sysctl;
- nf_ct_protos[proto->l3proto][proto->proto] = proto;
+ return 0;
-out_unlock:
- write_unlock_bh(&nf_conntrack_lock);
-out:
+out_sysctl:
+ nf_conntrack_standalone_fini_proc(net);
+out_proc:
+ nf_conntrack_cleanup_net(net);
+out_init:
return ret;
}
-void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto)
+static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
{
- write_lock_bh(&nf_conntrack_lock);
- nf_ct_protos[proto->l3proto][proto->proto]
- = &nf_conntrack_generic_protocol;
- write_unlock_bh(&nf_conntrack_lock);
-
- /* Somebody could be still looking at the proto in bh. */
- synchronize_net();
-
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_cleanup(kill_proto, proto);
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_conntrack_standalone_fini_sysctl(net);
+ nf_conntrack_standalone_fini_proc(net);
+ }
+ nf_conntrack_cleanup_net_list(net_exit_list);
}
-static int __init init(void)
+static struct pernet_operations nf_conntrack_net_ops = {
+ .init = nf_conntrack_pernet_init,
+ .exit_batch = nf_conntrack_pernet_exit,
+};
+
+static int __init nf_conntrack_standalone_init(void)
{
- return init_or_cleanup(1);
+ int ret = nf_conntrack_init_start();
+ if (ret < 0)
+ goto out_start;
+
+#ifdef CONFIG_SYSCTL
+ nf_ct_netfilter_header =
+ register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
+ if (!nf_ct_netfilter_header) {
+ pr_err("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto out_sysctl;
+ }
+#endif
+
+ ret = register_pernet_subsys(&nf_conntrack_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ nf_conntrack_init_end();
+ return 0;
+
+out_pernet:
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(nf_ct_netfilter_header);
+out_sysctl:
+#endif
+ nf_conntrack_cleanup_end();
+out_start:
+ return ret;
}
-static void __exit fini(void)
+static void __exit nf_conntrack_standalone_fini(void)
{
- init_or_cleanup(0);
+ nf_conntrack_cleanup_start();
+ unregister_pernet_subsys(&nf_conntrack_net_ops);
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(nf_ct_netfilter_header);
+#endif
+ nf_conntrack_cleanup_end();
}
-module_init(init);
-module_exit(fini);
+module_init(nf_conntrack_standalone_init);
+module_exit(nf_conntrack_standalone_fini);
/* Some modules need us, but don't depend directly on any symbol.
They should call this. */
void need_conntrack(void)
{
}
-
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-EXPORT_SYMBOL_GPL(nf_conntrack_chain);
-EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain);
-EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
-EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
-EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
-EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache);
-EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
-#endif
-EXPORT_SYMBOL(nf_conntrack_l3proto_register);
-EXPORT_SYMBOL(nf_conntrack_l3proto_unregister);
-EXPORT_SYMBOL(nf_conntrack_protocol_register);
-EXPORT_SYMBOL(nf_conntrack_protocol_unregister);
-EXPORT_SYMBOL(nf_ct_invert_tuplepr);
-EXPORT_SYMBOL(nf_conntrack_alter_reply);
-EXPORT_SYMBOL(nf_conntrack_destroyed);
-EXPORT_SYMBOL(need_conntrack);
-EXPORT_SYMBOL(nf_conntrack_helper_register);
-EXPORT_SYMBOL(nf_conntrack_helper_unregister);
-EXPORT_SYMBOL(nf_ct_iterate_cleanup);
-EXPORT_SYMBOL(__nf_ct_refresh_acct);
-EXPORT_SYMBOL(nf_ct_protos);
-EXPORT_SYMBOL(__nf_ct_proto_find);
-EXPORT_SYMBOL(nf_ct_proto_find_get);
-EXPORT_SYMBOL(nf_ct_proto_put);
-EXPORT_SYMBOL(nf_ct_l3proto_find_get);
-EXPORT_SYMBOL(nf_ct_l3proto_put);
-EXPORT_SYMBOL(nf_ct_l3protos);
-EXPORT_SYMBOL(nf_conntrack_expect_alloc);
-EXPORT_SYMBOL(nf_conntrack_expect_put);
-EXPORT_SYMBOL(nf_conntrack_expect_related);
-EXPORT_SYMBOL(nf_conntrack_unexpect_related);
-EXPORT_SYMBOL(nf_conntrack_tuple_taken);
-EXPORT_SYMBOL(nf_conntrack_htable_size);
-EXPORT_SYMBOL(nf_conntrack_lock);
-EXPORT_SYMBOL(nf_conntrack_hash);
-EXPORT_SYMBOL(nf_conntrack_untracked);
-EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
-#ifdef CONFIG_IP_NF_NAT_NEEDED
-EXPORT_SYMBOL(nf_conntrack_tcp_update);
-#endif
-EXPORT_SYMBOL(__nf_conntrack_confirm);
-EXPORT_SYMBOL(nf_ct_get_tuple);
-EXPORT_SYMBOL(nf_ct_invert_tuple);
-EXPORT_SYMBOL(nf_conntrack_in);
-EXPORT_SYMBOL(__nf_conntrack_attach);
-EXPORT_SYMBOL(nf_conntrack_alloc);
-EXPORT_SYMBOL(nf_conntrack_free);
-EXPORT_SYMBOL(nf_conntrack_flush);
-EXPORT_SYMBOL(nf_ct_remove_expectations);
-EXPORT_SYMBOL(nf_ct_helper_find_get);
-EXPORT_SYMBOL(nf_ct_helper_put);
-EXPORT_SYMBOL(__nf_conntrack_helper_find_byname);
-EXPORT_SYMBOL(__nf_conntrack_find);
-EXPORT_SYMBOL(nf_ct_unlink_expect);
-EXPORT_SYMBOL(nf_conntrack_hash_insert);
-EXPORT_SYMBOL(__nf_conntrack_expect_find);
-EXPORT_SYMBOL(nf_conntrack_expect_find);
-EXPORT_SYMBOL(nf_conntrack_expect_list);
-#if defined(CONFIG_NF_CT_NETLINK) || \
- defined(CONFIG_NF_CT_NETLINK_MODULE)
-EXPORT_SYMBOL(nf_ct_port_tuple_to_nfattr);
-EXPORT_SYMBOL(nf_ct_port_nfattr_to_tuple);
-#endif
+EXPORT_SYMBOL_GPL(need_conntrack);
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
new file mode 100644
index 00000000000..e68ab4fbd71
--- /dev/null
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -0,0 +1,153 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_tftp");
+MODULE_ALIAS_NFCT_HELPER("tftp");
+
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "Port numbers of TFTP servers");
+
+unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_tftp_hook);
+
+static int tftp_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ const struct tftphdr *tfh;
+ struct tftphdr _tftph;
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ unsigned int ret = NF_ACCEPT;
+ typeof(nf_nat_tftp_hook) nf_nat_tftp;
+
+ tfh = skb_header_pointer(skb, protoff + sizeof(struct udphdr),
+ sizeof(_tftph), &_tftph);
+ if (tfh == NULL)
+ return NF_ACCEPT;
+
+ switch (ntohs(tfh->opcode)) {
+ case TFTP_OPCODE_READ:
+ case TFTP_OPCODE_WRITE:
+ /* RRQ and WRQ works the same way */
+ nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ return NF_DROP;
+ }
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &tuple->src.u3, &tuple->dst.u3,
+ IPPROTO_UDP, NULL, &tuple->dst.u.udp.port);
+
+ pr_debug("expect: ");
+ nf_ct_dump_tuple(&exp->tuple);
+
+ nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
+ if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_tftp(skb, ctinfo, exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ }
+ nf_ct_expect_put(exp);
+ break;
+ case TFTP_OPCODE_DATA:
+ case TFTP_OPCODE_ACK:
+ pr_debug("Data/ACK opcode\n");
+ break;
+ case TFTP_OPCODE_ERROR:
+ pr_debug("Error opcode\n");
+ break;
+ default:
+ pr_debug("Unknown opcode\n");
+ }
+ return ret;
+}
+
+static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly;
+
+static const struct nf_conntrack_expect_policy tftp_exp_policy = {
+ .max_expected = 1,
+ .timeout = 5 * 60,
+};
+
+static void nf_conntrack_tftp_fini(void)
+{
+ int i, j;
+
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < 2; j++)
+ nf_conntrack_helper_unregister(&tftp[i][j]);
+ }
+}
+
+static int __init nf_conntrack_tftp_init(void)
+{
+ int i, j, ret;
+
+ if (ports_c == 0)
+ ports[ports_c++] = TFTP_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ memset(&tftp[i], 0, sizeof(tftp[i]));
+
+ tftp[i][0].tuple.src.l3num = AF_INET;
+ tftp[i][1].tuple.src.l3num = AF_INET6;
+ for (j = 0; j < 2; j++) {
+ tftp[i][j].tuple.dst.protonum = IPPROTO_UDP;
+ tftp[i][j].tuple.src.u.udp.port = htons(ports[i]);
+ tftp[i][j].expect_policy = &tftp_exp_policy;
+ tftp[i][j].me = THIS_MODULE;
+ tftp[i][j].help = tftp_help;
+
+ if (ports[i] == TFTP_PORT)
+ sprintf(tftp[i][j].name, "tftp");
+ else
+ sprintf(tftp[i][j].name, "tftp-%u", i);
+
+ ret = nf_conntrack_helper_register(&tftp[i][j]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_tftp: failed to register"
+ " helper for pf: %u port: %u\n",
+ tftp[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_tftp_fini();
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+module_init(nf_conntrack_tftp_init);
+module_exit(nf_conntrack_tftp_fini);
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
new file mode 100644
index 00000000000..93da609d9d2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -0,0 +1,51 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+struct ctnl_timeout *
+(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
+
+void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+
+static struct nf_ct_ext_type timeout_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_timeout),
+ .align = __alignof__(struct nf_conn_timeout),
+ .id = NF_CT_EXT_TIMEOUT,
+};
+
+int nf_conntrack_timeout_init(void)
+{
+ int ret = nf_ct_extend_register(&timeout_extend);
+ if (ret < 0)
+ pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
+ return ret;
+}
+
+void nf_conntrack_timeout_fini(void)
+{
+ nf_ct_extend_unregister(&timeout_extend);
+}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 00000000000..7a394df0deb
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,114 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+static bool nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tstamp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_timestamp",
+ .data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_tstamp),
+ .align = __alignof__(struct nf_conn_tstamp),
+ .id = NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_tstamp;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter",
+ table);
+ if (!net->ct.tstamp_sysctl_header) {
+ printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_tstamp_pernet_init(struct net *net)
+{
+ net->ct.sysctl_tstamp = nf_ct_tstamp;
+ return nf_conntrack_tstamp_init_sysctl(net);
+}
+
+void nf_conntrack_tstamp_pernet_fini(struct net *net)
+{
+ nf_conntrack_tstamp_fini_sysctl(net);
+}
+
+int nf_conntrack_tstamp_init(void)
+{
+ int ret;
+ ret = nf_ct_extend_register(&tstamp_extend);
+ if (ret < 0)
+ pr_err("nf_ct_tstamp: Unable to register extension\n");
+ return ret;
+}
+
+void nf_conntrack_tstamp_fini(void)
+{
+ nf_ct_extend_unregister(&tstamp_extend);
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 6bdee291061..61a3c927e63 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -1,39 +1,32 @@
#ifndef _NF_INTERNALS_H
#define _NF_INTERNALS_H
-#include <linux/config.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NETFILTER_DEBUG
-#define NFDEBUG(format, args...) printk(format , ## args)
+#define NFDEBUG(format, args...) printk(KERN_DEBUG format , ## args)
#else
#define NFDEBUG(format, args...)
#endif
/* core.c */
-extern unsigned int nf_iterate(struct list_head *head,
- struct sk_buff **skb,
- int hook,
- const struct net_device *indev,
- const struct net_device *outdev,
- struct list_head **i,
- int (*okfn)(struct sk_buff *),
- int hook_thresh);
+unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
+ unsigned int hook, const struct net_device *indev,
+ const struct net_device *outdev,
+ struct nf_hook_ops **elemp,
+ int (*okfn)(struct sk_buff *), int hook_thresh);
/* nf_queue.c */
-extern int nf_queue(struct sk_buff **skb,
- struct list_head *elem,
- int pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- unsigned int queuenum);
-extern int __init netfilter_queue_init(void);
+int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf,
+ unsigned int hook, struct net_device *indev,
+ struct net_device *outdev, int (*okfn)(struct sk_buff *),
+ unsigned int queuenum);
+int __init netfilter_queue_init(void);
/* nf_log.c */
-extern int __init netfilter_log_init(void);
+int __init netfilter_log_init(void);
#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3e76bd0824a..85296d4eac0 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -1,4 +1,3 @@
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -7,95 +6,149 @@
#include <linux/netfilter.h>
#include <linux/seq_file.h>
#include <net/protocol.h>
+#include <net/netfilter/nf_log.h>
#include "nf_internals.h"
-/* Internal logging interface, which relies on the real
+/* Internal logging interface, which relies on the real
LOG target modules */
#define NF_LOG_PREFIXLEN 128
+#define NFLOGGER_NAME_LEN 64
-static struct nf_logger *nf_logging[NPROTO]; /* = NULL */
-static DEFINE_SPINLOCK(nf_log_lock);
+static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
+static DEFINE_MUTEX(nf_log_mutex);
-/* return EBUSY if somebody else is registered, EEXIST if the same logger
- * is registred, 0 on success. */
-int nf_log_register(int pf, struct nf_logger *logger)
+static struct nf_logger *__find_logger(int pf, const char *str_logger)
{
- int ret = -EBUSY;
+ struct nf_logger *t;
- if (pf >= NPROTO)
- return -EINVAL;
+ list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) {
+ if (!strnicmp(str_logger, t->name, strlen(t->name)))
+ return t;
+ }
- /* Any setup of logging members must be done before
- * substituting pointer. */
- spin_lock(&nf_log_lock);
- if (!nf_logging[pf]) {
- rcu_assign_pointer(nf_logging[pf], logger);
- ret = 0;
- } else if (nf_logging[pf] == logger)
- ret = -EEXIST;
+ return NULL;
+}
- spin_unlock(&nf_log_lock);
- return ret;
-}
-EXPORT_SYMBOL(nf_log_register);
+void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+{
+ const struct nf_logger *log;
-int nf_log_unregister_pf(int pf)
+ if (pf == NFPROTO_UNSPEC)
+ return;
+
+ mutex_lock(&nf_log_mutex);
+ log = rcu_dereference_protected(net->nf.nf_loggers[pf],
+ lockdep_is_held(&nf_log_mutex));
+ if (log == NULL)
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_set);
+
+void nf_log_unset(struct net *net, const struct nf_logger *logger)
{
- if (pf >= NPROTO)
+ int i;
+ const struct nf_logger *log;
+
+ mutex_lock(&nf_log_mutex);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ log = rcu_dereference_protected(net->nf.nf_loggers[i],
+ lockdep_is_held(&nf_log_mutex));
+ if (log == logger)
+ RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL);
+ }
+ mutex_unlock(&nf_log_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unset);
+
+/* return EEXIST if the same logger is registered, 0 on success. */
+int nf_log_register(u_int8_t pf, struct nf_logger *logger)
+{
+ int i;
+
+ if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))
return -EINVAL;
- spin_lock(&nf_log_lock);
- nf_logging[pf] = NULL;
- spin_unlock(&nf_log_lock);
+ for (i = 0; i < ARRAY_SIZE(logger->list); i++)
+ INIT_LIST_HEAD(&logger->list[i]);
+
+ mutex_lock(&nf_log_mutex);
- /* Give time to concurrent readers. */
- synchronize_net();
+ if (pf == NFPROTO_UNSPEC) {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+ list_add_tail(&(logger->list[i]), &(nf_loggers_l[i]));
+ } else {
+ /* register at end of list to honor first register win */
+ list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
+ }
+
+ mutex_unlock(&nf_log_mutex);
return 0;
}
-EXPORT_SYMBOL(nf_log_unregister_pf);
+EXPORT_SYMBOL(nf_log_register);
-void nf_log_unregister_logger(struct nf_logger *logger)
+void nf_log_unregister(struct nf_logger *logger)
{
int i;
- spin_lock(&nf_log_lock);
- for (i = 0; i < NPROTO; i++) {
- if (nf_logging[i] == logger)
- nf_logging[i] = NULL;
+ mutex_lock(&nf_log_mutex);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ list_del(&logger->list[i]);
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_unregister);
+
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+ const struct nf_logger *logger)
+{
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return -EINVAL;
+ mutex_lock(&nf_log_mutex);
+ if (__find_logger(pf, logger->name) == NULL) {
+ mutex_unlock(&nf_log_mutex);
+ return -ENOENT;
}
- spin_unlock(&nf_log_lock);
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+ mutex_unlock(&nf_log_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(nf_log_bind_pf);
- synchronize_net();
+void nf_log_unbind_pf(struct net *net, u_int8_t pf)
+{
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return;
+ mutex_lock(&nf_log_mutex);
+ RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
+ mutex_unlock(&nf_log_mutex);
}
-EXPORT_SYMBOL(nf_log_unregister_logger);
+EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_log_packet(int pf,
+void nf_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- struct nf_loginfo *loginfo,
+ const struct nf_loginfo *loginfo,
const char *fmt, ...)
{
va_list args;
char prefix[NF_LOG_PREFIXLEN];
- struct nf_logger *logger;
-
+ const struct nf_logger *logger;
+
rcu_read_lock();
- logger = rcu_dereference(nf_logging[pf]);
+ logger = rcu_dereference(net->nf.nf_loggers[pf]);
if (logger) {
va_start(args, fmt);
vsnprintf(prefix, sizeof(prefix), fmt, args);
va_end(args);
- /* We must read logging before nf_logfn[pf] */
- logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
- } else if (net_ratelimit()) {
- printk(KERN_WARNING "nf_log_packet: can\'t log since "
- "no backend logging module loaded in! Please either "
- "load one, or disable logging explicitly\n");
+ logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
}
rcu_read_unlock();
}
@@ -104,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet);
#ifdef CONFIG_PROC_FS
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
- rcu_read_lock();
+ struct net *net = seq_file_net(seq);
- if (*pos >= NPROTO)
+ mutex_lock(&nf_log_mutex);
+
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -114,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- if (*pos >= NPROTO)
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -124,23 +181,43 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
static void seq_stop(struct seq_file *s, void *v)
{
- rcu_read_unlock();
+ mutex_unlock(&nf_log_mutex);
}
static int seq_show(struct seq_file *s, void *v)
{
loff_t *pos = v;
const struct nf_logger *logger;
+ struct nf_logger *t;
+ int ret;
+ struct net *net = seq_file_net(s);
- logger = rcu_dereference(nf_logging[*pos]);
+ logger = rcu_dereference_protected(net->nf.nf_loggers[*pos],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
- return seq_printf(s, "%2lld NONE\n", *pos);
-
- return seq_printf(s, "%2lld %s\n", *pos, logger->name);
+ ret = seq_printf(s, "%2lld NONE (", *pos);
+ else
+ ret = seq_printf(s, "%2lld %s (", *pos, logger->name);
+
+ if (ret < 0)
+ return ret;
+
+ list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) {
+ ret = seq_printf(s, "%s", t->name);
+ if (ret < 0)
+ return ret;
+ if (&t->list[*pos] != nf_loggers_l[*pos].prev) {
+ ret = seq_printf(s, ",");
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return seq_printf(s, ")\n");
}
-static struct seq_operations nflog_seq_ops = {
+static const struct seq_operations nflog_seq_ops = {
.start = seq_start,
.next = seq_next,
.stop = seq_stop,
@@ -149,30 +226,177 @@ static struct seq_operations nflog_seq_ops = {
static int nflog_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &nflog_seq_ops);
+ return seq_open_net(inode, file, &nflog_seq_ops,
+ sizeof(struct seq_net_private));
}
-static struct file_operations nflog_file_ops = {
+static const struct file_operations nflog_file_ops = {
.owner = THIS_MODULE,
.open = nflog_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
+
#endif /* PROC_FS */
+#ifdef CONFIG_SYSCTL
+static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
+static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
-int __init netfilter_log_init(void)
+static int nf_log_proc_dostring(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ const struct nf_logger *logger;
+ char buf[NFLOGGER_NAME_LEN];
+ size_t size = *lenp;
+ int r = 0;
+ int tindex = (unsigned long)table->extra1;
+ struct net *net = current->nsproxy->net_ns;
+
+ if (write) {
+ if (size > sizeof(buf))
+ size = sizeof(buf);
+ if (copy_from_user(buf, buffer, size))
+ return -EFAULT;
+
+ if (!strcmp(buf, "NONE")) {
+ nf_log_unbind_pf(net, tindex);
+ return 0;
+ }
+ mutex_lock(&nf_log_mutex);
+ logger = __find_logger(tindex, buf);
+ if (logger == NULL) {
+ mutex_unlock(&nf_log_mutex);
+ return -ENOENT;
+ }
+ rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
+ mutex_unlock(&nf_log_mutex);
+ } else {
+ mutex_lock(&nf_log_mutex);
+ logger = rcu_dereference_protected(net->nf.nf_loggers[tindex],
+ lockdep_is_held(&nf_log_mutex));
+ if (!logger)
+ table->data = "NONE";
+ else
+ table->data = logger->name;
+ r = proc_dostring(table, write, buffer, lenp, ppos);
+ mutex_unlock(&nf_log_mutex);
+ }
+
+ return r;
+}
+
+static int netfilter_log_sysctl_init(struct net *net)
{
+ int i;
+ struct ctl_table *table;
+
+ table = nf_log_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_log_sysctl_table,
+ sizeof(nf_log_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ } else {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+ snprintf(nf_log_sysctl_fnames[i],
+ 3, "%d", i);
+ nf_log_sysctl_table[i].procname =
+ nf_log_sysctl_fnames[i];
+ nf_log_sysctl_table[i].data = NULL;
+ nf_log_sysctl_table[i].maxlen =
+ NFLOGGER_NAME_LEN * sizeof(char);
+ nf_log_sysctl_table[i].mode = 0644;
+ nf_log_sysctl_table[i].proc_handler =
+ nf_log_proc_dostring;
+ nf_log_sysctl_table[i].extra1 =
+ (void *)(unsigned long) i;
+ }
+ }
+
+ net->nf.nf_log_dir_header = register_net_sysctl(net,
+ "net/netfilter/nf_log",
+ table);
+ if (!net->nf.nf_log_dir_header)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->nf.nf_log_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_log_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+#else
+static int netfilter_log_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Per-namespace init: create the "nf_log" proc entry (CONFIG_PROC_FS)
+ * and the nf_log sysctl directory. Returns 0 on success; -ENOMEM if the
+ * proc entry cannot be created, otherwise the sysctl init error.
+ */
+static int __net_init nf_log_net_init(struct net *net)
+{
+ int ret = -ENOMEM;
+
#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *pde;
+ if (!proc_create("nf_log", S_IRUGO,
+ net->nf.proc_netfilter, &nflog_file_ops))
+ return ret;
+#endif
+ ret = netfilter_log_sysctl_init(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
- pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter);
- if (!pde)
- return -1;
+ /* sysctl setup failed: undo the proc entry created above. */
+out_sysctl:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+#endif
+ return ret;
+}
- pde->proc_fops = &nflog_file_ops;
+/* Per-namespace exit: remove the sysctl directory, then the proc entry. */
+static void __net_exit nf_log_net_exit(struct net *net)
+{
+ netfilter_log_sysctl_exit(net);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
#endif
+}
+
+/* Per-network-namespace init/exit callbacks for nf_log. */
+static struct pernet_operations nf_log_net_ops = {
+ .init = nf_log_net_init,
+ .exit = nf_log_net_exit,
+};
+
+/*
+ * Boot-time initialisation of the nf_log core.
+ *
+ * The per-family logger lists are initialised before the pernet
+ * subsystem is registered: registration runs nf_log_net_init(), which
+ * publishes the proc and sysctl entries, and nothing reachable through
+ * them should ever observe a zeroed, uninitialised list head.
+ *
+ * Returns 0 on success or the negative error from
+ * register_pernet_subsys().
+ */
+int __init netfilter_log_init(void)
+{
+	int i, ret;
+
+	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+		INIT_LIST_HEAD(&(nf_loggers_l[i]));
+
+	ret = register_pernet_subsys(&nf_log_net_ops);
+	if (ret < 0)
+		return ret;
+
return 0;
}
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
new file mode 100644
index 00000000000..eb772380a20
--- /dev/null
+++ b/net/netfilter/nf_nat_amanda.c
@@ -0,0 +1,90 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_amanda");
+
+/*
+ * Amanda NAT helper: set up the expectation for the announced data
+ * connection and rewrite the port number carried in the UDP payload.
+ *
+ * Starting at the port originally announced, successive ports are tried
+ * until nf_ct_expect_related() accepts one. Returns NF_DROP when no
+ * port could be reserved, otherwise the result of
+ * nf_nat_mangle_udp_packet().
+ */
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct nf_conntrack_expect *exp)
+{
+	char buffer[sizeof("65535")];
+	unsigned int ret;
+	u_int16_t port;
+
+	/* The expected connection is initiated by the client side. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_ORIGINAL;
+
+	/* The expected (TCP) connection gets the same NAT as its UDP master. */
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Prefer the announced port; on -EBUSY walk upwards until the
+	 * 16-bit counter wraps to 0, which doubles as the failure marker.
+	 */
+	port = ntohs(exp->saved_proto.tcp.port);
+	while (port != 0) {
+		int res;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		res = nf_ct_expect_related(exp);
+		if (res == 0)
+			break;
+		if (res != -EBUSY) {
+			port = 0;
+			break;
+		}
+		port++;
+	}
+
+	if (port == 0) {
+		nf_ct_helper_log(skb, exp->master, "all ports in use");
+		return NF_DROP;
+	}
+
+	sprintf(buffer, "%u", port);
+	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
+				       protoff, matchoff, matchlen,
+				       buffer, strlen(buffer));
+	if (ret == NF_ACCEPT)
+		return ret;
+
+	/* Mangling failed: withdraw the expectation registered above. */
+	nf_ct_helper_log(skb, exp->master, "cannot mangle packet");
+	nf_ct_unexpect_related(exp);
+	return ret;
+}
+
+/* Module exit: clear the Amanda NAT hook, then wait for an RCU grace period. */
+static void __exit nf_nat_amanda_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
+ synchronize_rcu();
+}
+
+/* Module init: install help() as the Amanda NAT hook; the hook must be unset. */
+static int __init nf_nat_amanda_init(void)
+{
+ BUG_ON(nf_nat_amanda_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_amanda_hook, help);
+ return 0;
+}
+
+module_init(nf_nat_amanda_init);
+module_exit(nf_nat_amanda_fini);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
new file mode 100644
index 00000000000..a49907b1dab
--- /dev/null
+++ b/net/netfilter/nf_nat_core.c
@@ -0,0 +1,898 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <net/xfrm.h>
+#include <linux/jhash.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_nat.h>
+
+static DEFINE_SPINLOCK(nf_nat_lock);
+
+static DEFINE_MUTEX(nf_nat_proto_mutex);
+static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
+ __read_mostly;
+static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
+ __read_mostly;
+
+
+/*
+ * Return the RCU-protected NAT L3 protocol pointer stored for @family,
+ * or NULL if none is currently stored (nf_nat_l3proto_unregister()
+ * stores NULL). @family indexes nf_nat_l3protos[] without a range check.
+ */
+inline const struct nf_nat_l3proto *
+__nf_nat_l3proto_find(u8 family)
+{
+ return rcu_dereference(nf_nat_l3protos[family]);
+}
+
+/*
+ * Return the RCU-protected NAT L4 protocol pointer for
+ * (@family, @protonum). The per-family table nf_nat_l4protos[family] is
+ * dereferenced unconditionally, so it must already have been allocated
+ * (see nf_nat_l4proto_register()).
+ */
+inline const struct nf_nat_l4proto *
+__nf_nat_l4proto_find(u8 family, u8 protonum)
+{
+ return rcu_dereference(nf_nat_l4protos[family][protonum]);
+}
+EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);
+
+#ifdef CONFIG_XFRM
+/*
+ * Let the NAT L3 protocol of the skb's conntrack adjust flow @fl.
+ * Does nothing when the skb has no conntrack or when no NAT L3 protocol
+ * is registered for the conntrack's address family.
+ */
+static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	const struct nf_nat_l3proto *l3proto;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	enum ip_conntrack_dir dir;
+	unsigned long statusbit;
+	u8 family;
+
+	if (!ct)
+		return;
+
+	family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+
+	rcu_read_lock();
+	l3proto = __nf_nat_l3proto_find(family);
+	if (l3proto != NULL) {
+		dir = CTINFO2DIR(ctinfo);
+		/* Original direction is checked against DST NAT, reply
+		 * direction against SRC NAT.
+		 */
+		statusbit = (dir == IP_CT_DIR_ORIGINAL) ? IPS_DST_NAT
+							: IPS_SRC_NAT;
+		l3proto->decode_session(skb, ct, dir, statusbit, fl);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Redo the xfrm lookup for @skb and attach the resulting dst to it.
+ *
+ * Returns 0 on success, the error from xfrm_decode_session() or
+ * xfrm_lookup(), or -ENOMEM if the headroom cannot be grown to fit the
+ * new device's hard header.
+ */
+int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
+{
+	struct dst_entry *dst;
+	unsigned int needed;
+	struct flowi fl;
+	int err;
+
+	err = xfrm_decode_session(skb, &fl, family);
+	if (err < 0)
+		return err;
+
+	/* Start the lookup from the underlying route when the current dst
+	 * is an xfrm bundle.
+	 */
+	dst = skb_dst(skb);
+	if (dst->xfrm)
+		dst = ((struct xfrm_dst *)dst)->route;
+	dst_hold(dst);
+
+	dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+	if (IS_ERR(dst))
+		return PTR_ERR(dst);
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	/* Change in oif may mean change in hh_len. */
+	needed = skb_dst(skb)->dev->hard_header_len;
+	if (skb_headroom(skb) >= needed)
+		return 0;
+	if (pskb_expand_head(skb, needed - skb_headroom(skb), 0, GFP_ATOMIC))
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL(nf_xfrm_me_harder);
+#endif /* CONFIG_XFRM */
+
+/*
+ * Bucket index into the per-netns nat_bysource table, derived from the
+ * source part of @tuple, its protocol number, @zone and the conntrack
+ * hash seed.
+ */
+static inline unsigned int
+hash_by_src(const struct net *net, u16 zone,
+	    const struct nf_conntrack_tuple *tuple)
+{
+	const u32 seed = tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd;
+	unsigned int h;
+
+	/* Original src, to ensure we map it consistently if poss. */
+	h = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), seed);
+
+	/* Scale the 32-bit hash onto [0, nat_htable_size). */
+	return ((u64)h * net->ct.nat_htable_size) >> 32;
+}
+
+/*
+ * Is this tuple already taken by a conntrack other than
+ * @ignored_conntrack?
+ *
+ * Conntrack only tracks incoming tuples, and with NAT the outgoing
+ * tuple has no fixed mapping, so the tuple is inverted and the lookup
+ * is done on the reply it would produce. A separate hash could be kept
+ * if this proves too slow.
+ */
+int
+nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
+		  const struct nf_conn *ignored_conntrack)
+{
+	struct nf_conntrack_tuple inverted;
+
+	nf_ct_invert_tuplepr(&inverted, tuple);
+
+	return nf_conntrack_tuple_taken(&inverted, ignored_conntrack);
+}
+EXPORT_SYMBOL(nf_nat_used_tuple);
+
+/*
+ * Would source-mapping to @tuple satisfy the constraints of @range?
+ * Returns 1 if so, 0 otherwise.
+ */
+static int in_range(const struct nf_nat_l3proto *l3proto,
+		    const struct nf_nat_l4proto *l4proto,
+		    const struct nf_conntrack_tuple *tuple,
+		    const struct nf_nat_range *range)
+{
+	/* When IPs are being mapped the address must lie inside the
+	 * range; otherwise the caller is pushed onto a new source IP.
+	 */
+	if ((range->flags & NF_NAT_RANGE_MAP_IPS) &&
+	    !l3proto->in_range(tuple, range))
+		return 0;
+
+	/* Without a protocol constraint, any per-protocol part is fine. */
+	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
+		return 1;
+
+	return l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
+				 &range->min_proto, &range->max_proto) ? 1 : 0;
+}
+
+/*
+ * Does the original-direction tuple of @ct have the same protocol,
+ * source address and source protocol part as @tuple? Returns 1 or 0.
+ */
+static inline int
+same_src(const struct nf_conn *ct,
+	 const struct nf_conntrack_tuple *tuple)
+{
+	const struct nf_conntrack_tuple *orig =
+		&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+	if (orig->dst.protonum != tuple->dst.protonum)
+		return 0;
+	if (!nf_inet_addr_cmp(&orig->src.u3, &tuple->src.u3))
+		return 0;
+
+	return orig->src.u.all == tuple->src.u.all;
+}
+
+/*
+ * Only called for SRC manip.
+ *
+ * Search the bysource hash for a conntrack in @zone with the same
+ * source as @tuple whose existing mapping falls inside @range. On a
+ * match @result holds the mapped tuple and 1 is returned; otherwise 0
+ * (in which case @result may have been overwritten by a rejected
+ * candidate).
+ */
+static int
+find_appropriate_src(struct net *net, u16 zone,
+		     const struct nf_nat_l3proto *l3proto,
+		     const struct nf_nat_l4proto *l4proto,
+		     const struct nf_conntrack_tuple *tuple,
+		     struct nf_conntrack_tuple *result,
+		     const struct nf_nat_range *range)
+{
+	const unsigned int bucket = hash_by_src(net, zone, tuple);
+	const struct nf_conn_nat *nat;
+	const struct nf_conn *ct;
+
+	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[bucket], bysource) {
+		ct = nat->ct;
+		if (!same_src(ct, tuple) || nf_ct_zone(ct) != zone)
+			continue;
+
+		/* Copy source part from reply tuple. */
+		nf_ct_invert_tuplepr(result,
+				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+		result->dst = tuple->dst;
+
+		if (in_range(l3proto, l4proto, result, range))
+			return 1;
+	}
+
+	return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+ * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
+ * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+ * 1-65535, we don't do pro-rata allocation based on ports; we choose
+ * the ip with the lowest src-ip/dst-ip/proto usage.
+ *
+ * As implemented here, the address is chosen by hashing: the manipulated
+ * address in @tuple (source for NF_NAT_MANIP_SRC, destination otherwise)
+ * is overwritten with an address from @range, one 32-bit word at a time.
+ * Nothing is done unless NF_NAT_RANGE_MAP_IPS is set.
+ */
+static void
+find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ const struct nf_conn *ct,
+ enum nf_nat_manip_type maniptype)
+{
+ union nf_inet_addr *var_ipp;
+ unsigned int i, max;
+ /* Host order */
+ u32 minip, maxip, j, dist;
+ bool full_range;
+
+ /* No IP mapping? Do nothing. */
+ if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
+ return;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ var_ipp = &tuple->src.u3;
+ else
+ var_ipp = &tuple->dst.u3;
+
+ /* Fast path: only one choice. */
+ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
+ *var_ipp = range->min_addr;
+ return;
+ }
+
+ /* max is the index of the last 32-bit word of the address. */
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
+ else
+ max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;
+
+ /* Hashing source and destination IPs gives a fairly even
+ * spread in practice (if there are a small number of IPs
+ * involved, there usually aren't that many connections
+ * anyway). The consistency means that servers see the same
+ * client coming from the same IP (some Internet Banking sites
+ * like this), even across reboots.
+ */
+ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
+ range->flags & NF_NAT_RANGE_PERSISTENT ?
+ 0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
+
+ full_range = false;
+ for (i = 0; i <= max; i++) {
+ /* If first bytes of the address are at the maximum, use the
+ * distance. Otherwise use the full range.
+ */
+ if (!full_range) {
+ minip = ntohl((__force __be32)range->min_addr.all[i]);
+ maxip = ntohl((__force __be32)range->max_addr.all[i]);
+ dist = maxip - minip + 1;
+ } else {
+ minip = 0;
+ dist = ~0;
+ }
+
+ /* Scale the 32-bit hash j onto [minip, minip + dist). */
+ var_ipp->all[i] = (__force __u32)
+ htonl(minip + (((u64)j * dist) >> 32));
+ /* Once a word differs from the range maximum, the remaining
+ * words are no longer bounded by the range.
+ */
+ if (var_ipp->all[i] != range->max_addr.all[i])
+ full_range = true;
+
+ /* Mix the matching destination word into the hash for the next
+ * word, unless a persistent mapping was requested.
+ */
+ if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
+ j ^= (__force u32)tuple->dst.u3.all[i];
+ }
+}
+
+/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
+ * we change the source to map into the range. For NF_INET_PRE_ROUTING
+ * and NF_INET_LOCAL_OUT, we change the destination to map into the
+ * range. It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet.
+ *
+ * @tuple receives the result; @orig_tuple is the tuple before this
+ * manipulation. The protocol lookups and the whole selection run under
+ * rcu_read_lock().
+ */
+static void
+get_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig_tuple,
+ const struct nf_nat_range *range,
+ struct nf_conn *ct,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_nat_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
+ u16 zone = nf_ct_zone(ct);
+
+ rcu_read_lock();
+ l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
+ l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
+ orig_tuple->dst.protonum);
+
+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
+ * and that same mapping gives a unique tuple within the given
+ * range, use that.
+ *
+ * This is only required for source (ie. NAT/masq) mappings.
+ * So far, we don't do local source mappings, so multiple
+ * manips not an issue.
+ */
+ if (maniptype == NF_NAT_MANIP_SRC &&
+ !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ /* try the original tuple first */
+ if (in_range(l3proto, l4proto, orig_tuple, range)) {
+ if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ *tuple = *orig_tuple;
+ goto out;
+ }
+ } else if (find_appropriate_src(net, zone, l3proto, l4proto,
+ orig_tuple, tuple, range)) {
+ pr_debug("get_unique_tuple: Found current src map\n");
+ if (!nf_nat_used_tuple(tuple, ct))
+ goto out;
+ }
+ }
+
+ /* 2) Select the least-used IP/proto combination in the given range */
+ *tuple = *orig_tuple;
+ find_best_ips_proto(zone, tuple, range, ct, maniptype);
+
+ /* 3) The per-protocol part of the manip is made to map into
+ * the range to make a unique tuple.
+ */
+
+ /* Only bother mapping if it's not already in range and unique */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ /* A single-value proto range is accepted even if the
+ * tuple is already in use.
+ */
+ if (l4proto->in_range(tuple, maniptype,
+ &range->min_proto,
+ &range->max_proto) &&
+ (range->min_proto.all == range->max_proto.all ||
+ !nf_nat_used_tuple(tuple, ct)))
+ goto out;
+ } else if (!nf_nat_used_tuple(tuple, ct)) {
+ goto out;
+ }
+ }
+
+ /* Last chance: get protocol to try to obtain unique tuple. */
+ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
+out:
+ rcu_read_unlock();
+}
+
+/*
+ * Return the NAT extension of @ct, adding it first if it is missing and
+ * the conntrack is not yet confirmed. Returns NULL when the extension
+ * is missing on a confirmed conntrack or when nf_ct_ext_add() fails.
+ */
+struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
+{
+	struct nf_conn_nat *ext = nfct_nat(ct);
+
+	if (!ext && !nf_ct_is_confirmed(ct))
+		ext = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+
+	return ext;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
+
+/*
+ * Set up the NAT binding of type @maniptype for @ct using @range.
+ *
+ * Picks a tuple via get_unique_tuple(), rewrites the conntrack's reply
+ * tuple if the chosen tuple differs from the current one, links SRC
+ * manips into the bysource hash and finally sets the matching
+ * IPS_*_NAT_DONE bit. Always returns NF_ACCEPT, including when the NAT
+ * extension could not be obtained (in which case nothing is set up).
+ */
+unsigned int
+nf_nat_setup_info(struct nf_conn *ct,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_tuple curr_tuple, new_tuple;
+ struct nf_conn_nat *nat;
+
+ /* nat helper or nfctnetlink also setup binding */
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
+ maniptype == NF_NAT_MANIP_DST);
+ BUG_ON(nf_nat_initialized(ct, maniptype));
+
+ /* What we've got will look like inverse of reply. Normally
+ * this is what is in the conntrack, except for prior
+ * manipulations (future optimization: if num_manips == 0,
+ * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+ */
+ nf_ct_invert_tuplepr(&curr_tuple,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+
+ if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+ struct nf_conntrack_tuple reply;
+
+ /* Alter conntrack table so will recognize replies. */
+ nf_ct_invert_tuplepr(&reply, &new_tuple);
+ nf_conntrack_alter_reply(ct, &reply);
+
+ /* Non-atomic: we own this at the moment. */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ ct->status |= IPS_SRC_NAT;
+ else
+ ct->status |= IPS_DST_NAT;
+
+ /* Conntracks with a helper also get the seqadj extension. */
+ if (nfct_help(ct))
+ nfct_seqadj_ext_add(ct);
+ }
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ unsigned int srchash;
+
+ srchash = hash_by_src(net, nf_ct_zone(ct),
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_lock);
+ /* nf_conntrack_alter_reply might re-allocate extension area */
+ nat = nfct_nat(ct);
+ nat->ct = ct;
+ hlist_add_head_rcu(&nat->bysource,
+ &net->ct.nat_bysource[srchash]);
+ spin_unlock_bh(&nf_nat_lock);
+ }
+
+ /* It's done. */
+ if (maniptype == NF_NAT_MANIP_DST)
+ ct->status |= IPS_DST_NAT_DONE;
+ else
+ ct->status |= IPS_SRC_NAT_DONE;
+
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL(nf_nat_setup_info);
+
+/*
+ * Set up a "null" binding for @ct: the range is pinned to the address
+ * the conntrack already uses, and NF_NAT_RANGE_PROTO_SPECIFIED is left
+ * clear so the protocol decides the per-proto part. The address is
+ * taken from the reply tuple in case the packet was already mangled
+ * (eg local packet).
+ */
+static unsigned int
+__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
+{
+	const struct nf_conntrack_tuple *reply =
+		&ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	struct nf_nat_range range = {
+		.flags = NF_NAT_RANGE_MAP_IPS,
+	};
+	union nf_inet_addr ip;
+
+	if (manip == NF_NAT_MANIP_SRC)
+		ip = reply->dst.u3;
+	else
+		ip = reply->src.u3;
+
+	range.min_addr = ip;
+	range.max_addr = ip;
+
+	return nf_nat_setup_info(ct, &range, manip);
+}
+
+/* Set up a null binding for @ct, with the manip type derived from @hooknum. */
+unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+
+	return __nf_nat_alloc_null_binding(ct, manip);
+}
+EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+
+/*
+ * Do packet manipulations according to nf_nat_setup_info.
+ *
+ * Returns NF_DROP if the L3 protocol fails to rewrite the packet,
+ * NF_ACCEPT otherwise (including when no manipulation applies to this
+ * hook and direction).
+ */
+unsigned int nf_nat_packet(struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned int hooknum,
+			   struct sk_buff *skb)
+{
+	const enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
+	const enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	const struct nf_nat_l3proto *l3proto;
+	const struct nf_nat_l4proto *l4proto;
+	struct nf_conntrack_tuple target;
+	unsigned long statusbit;
+
+	statusbit = (mtype == NF_NAT_MANIP_SRC) ? IPS_SRC_NAT : IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	/* Non-atomic: these bits don't change. */
+	if (!(ct->status & statusbit))
+		return NF_ACCEPT;
+
+	/* We are aiming to look like inverse of other direction. */
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+	l3proto = __nf_nat_l3proto_find(target.src.l3num);
+	l4proto = __nf_nat_l4proto_find(target.src.l3num,
+					target.dst.protonum);
+	if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+		return NF_DROP;
+
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+/*
+ * Filter passed to the conntrack cleanup iterators below. A zero field
+ * matches any protocol (see nf_nat_proto_remove()).
+ */
+struct nf_nat_proto_clean {
+ u8 l3proto;
+ u8 l4proto;
+};
+
+/*
+ * Iterator callback: kill conntracks with affected NAT section.
+ * Returns 1 when @i carries a NAT extension, matches the protocol
+ * filter in @data and has a NAT status bit set; 0 otherwise.
+ */
+static int nf_nat_proto_remove(struct nf_conn *i, void *data)
+{
+	const struct nf_nat_proto_clean *filter = data;
+
+	if (!nfct_nat(i))
+		return 0;
+
+	/* A zero filter field acts as a wildcard. */
+	if (filter->l3proto && nf_ct_l3num(i) != filter->l3proto)
+		return 0;
+	if (filter->l4proto && nf_ct_protonum(i) != filter->l4proto)
+		return 0;
+
+	return (i->status & IPS_NAT_MASK) ? 1 : 0;
+}
+
+/*
+ * Iterator callback used on netns exit. Returns 1 for conntracks that
+ * nf_nat_proto_remove() selects for removal (and for ones whose timer
+ * was not pending); for remaining conntracks that are linked into the
+ * bysource hash, unlinks them and returns 0 so they are kept.
+ */
+static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
+{
+ struct nf_conn_nat *nat = nfct_nat(ct);
+
+ if (nf_nat_proto_remove(ct, data))
+ return 1;
+
+ if (!nat || !nat->ct)
+ return 0;
+
+ /* This netns is being destroyed, and conntrack has nat null binding.
+ * Remove it from bysource hash, as the table will be freed soon.
+ *
+ * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
+ * will delete entry from already-freed table.
+ */
+ /* del_timer() failing means the timer was not pending; request
+ * removal rather than touching the entry.
+ */
+ if (!del_timer(&ct->timeout))
+ return 1;
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&nat->bysource);
+ ct->status &= ~IPS_NAT_DONE_MASK;
+ nat->ct = NULL;
+ spin_unlock_bh(&nf_nat_lock);
+
+ /* Re-arm the timeout that was stopped above. */
+ add_timer(&ct->timeout);
+
+ /* don't delete conntrack. Although that would make things a lot
+ * simpler, we'd end up flushing all conntracks on nat rmmod.
+ */
+ return 0;
+}
+
+/*
+ * In every network namespace, remove the NATed conntracks that match
+ * (@l3proto, @l4proto). The namespace walk runs under the RTNL lock.
+ */
+static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
+{
+	struct nf_nat_proto_clean filter;
+	struct net *net;
+
+	filter.l3proto = l3proto;
+	filter.l4proto = l4proto;
+
+	rtnl_lock();
+	for_each_net(net) {
+		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &filter, 0, 0);
+	}
+	rtnl_unlock();
+}
+
+/*
+ * In every network namespace, remove the NATed conntracks of family
+ * @l3proto, whatever their L4 protocol. The namespace walk runs under
+ * the RTNL lock.
+ */
+static void nf_nat_l3proto_clean(u8 l3proto)
+{
+	struct nf_nat_proto_clean filter;
+	struct net *net;
+
+	filter.l3proto = l3proto;
+	filter.l4proto = 0;	/* wildcard: any L4 protocol */
+
+	rtnl_lock();
+	for_each_net(net) {
+		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &filter, 0, 0);
+	}
+	rtnl_unlock();
+}
+
+/* Protocol registration.
+ *
+ * Register @l4proto for family @l3proto. The per-family table is
+ * allocated on first use with every slot pointing at
+ * nf_nat_l4proto_unknown. Returns 0 on success, -ENOMEM if the table
+ * cannot be allocated, or -EBUSY if the slot is already taken by
+ * another protocol.
+ */
+int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+ const struct nf_nat_l4proto **l4protos;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ if (nf_nat_l4protos[l3proto] == NULL) {
+ l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
+ GFP_KERNEL);
+ if (l4protos == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < IPPROTO_MAX; i++)
+ RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);
+
+ /* Before making proto_array visible to lockless readers,
+ * we must make sure its content is committed to memory.
+ */
+ smp_wmb();
+
+ nf_nat_l4protos[l3proto] = l4protos;
+ }
+
+ /* Only a slot still holding the "unknown" placeholder is free. */
+ if (rcu_dereference_protected(
+ nf_nat_l4protos[l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_nat_proto_mutex)
+ ) != &nf_nat_l4proto_unknown) {
+ ret = -EBUSY;
+ goto out;
+ }
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
+ out:
+ mutex_unlock(&nf_nat_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
+
+/* No one stores the protocol anywhere; simply delete it.
+ *
+ * The slot is reset to nf_nat_l4proto_unknown under the proto mutex;
+ * after an RCU grace period the conntracks NATed with this protocol
+ * are removed in every namespace.
+ */
+void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
+ &nf_nat_l4proto_unknown);
+ mutex_unlock(&nf_nat_proto_mutex);
+ synchronize_rcu();
+
+ nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
+
+/*
+ * Register NAT L3 protocol @l3proto: take a reference on the matching
+ * conntrack L3 module, install the TCP and UDP NAT handlers for the
+ * family and publish the L3 protocol pointer. Returns 0 on success or
+ * the error from nf_ct_l3proto_try_module_get().
+ *
+ * NOTE(review): nf_nat_l4protos[l3proto->l3proto] is dereferenced
+ * without a NULL check, so this appears to rely on an earlier
+ * nf_nat_l4proto_register() call for the same family having allocated
+ * the table - confirm against callers.
+ */
+int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
+{
+ int err;
+
+ err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
+ if (err < 0)
+ return err;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
+ &nf_nat_l4proto_tcp);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
+ &nf_nat_l4proto_udp);
+ mutex_unlock(&nf_nat_proto_mutex);
+
+ RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);
+
+/*
+ * Unregister NAT L3 protocol @l3proto: clear its slot, wait for an RCU
+ * grace period, remove the family's NATed conntracks in every namespace
+ * and drop the conntrack L3 module reference taken at registration.
+ */
+void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
+{
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
+ mutex_unlock(&nf_nat_proto_mutex);
+ synchronize_rcu();
+
+ nf_nat_l3proto_clean(l3proto->l3proto);
+ nf_ct_l3proto_module_put(l3proto->l3proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
+
+/*
+ * Extension destructor. No one is using the conntrack by the time this
+ * is called. Unlinks the NAT extension from the bysource hash if it
+ * was ever linked (nat->ct set).
+ */
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
+{
+	struct nf_conn_nat *ext = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
+
+	if (!ext || !ext->ct)
+		return;
+
+	NF_CT_ASSERT(ext->ct->status & IPS_SRC_NAT_DONE);
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&ext->bysource);
+	spin_unlock_bh(&nf_nat_lock);
+}
+
+/*
+ * Extension move callback: when the NAT extension storage is relocated
+ * from @old to @new, swap the bysource hash node over to the new
+ * location. Nothing to do if the extension was never linked (no ct
+ * back-pointer, or SRC NAT not done).
+ */
+static void nf_nat_move_storage(void *new, void *old)
+{
+	struct nf_conn_nat *dst_nat = new;
+	struct nf_conn_nat *src_nat = old;
+	struct nf_conn *ct = src_nat->ct;
+
+	if (!ct)
+		return;
+	if (!(ct->status & IPS_SRC_NAT_DONE))
+		return;
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_replace_rcu(&src_nat->bysource, &dst_nat->bysource);
+	spin_unlock_bh(&nf_nat_lock);
+}
+
+/* Conntrack extension descriptor for the per-conntrack NAT state. */
+static struct nf_ct_ext_type nat_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_nat),
+ .align = __alignof__(struct nf_conn_nat),
+ .destroy = nf_nat_cleanup_conntrack,
+ .move = nf_nat_move_storage,
+ .id = NF_CT_EXT_NAT,
+ .flags = NF_CT_EXT_F_PREALLOC,
+};
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+/* Netlink policy for the nested CTA_NAT_PROTO attribute: 16-bit port bounds. */
+static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
+ [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
+ [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+/*
+ * Parse the nested protocol NAT attribute @attr into @range using the
+ * L4 protocol of @ct. Returns a negative error from parsing or from
+ * the protocol's nlattr_to_range(); if the protocol has no such
+ * callback, the (non-negative) parse result is returned.
+ */
+static int nfnetlink_parse_nat_proto(struct nlattr *attr,
+				     const struct nf_conn *ct,
+				     struct nf_nat_range *range)
+{
+	struct nlattr *tb[CTA_PROTONAT_MAX+1];
+	const struct nf_nat_l4proto *l4proto;
+	int err;
+
+	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
+	if (err < 0)
+		return err;
+
+	l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	if (!l4proto->nlattr_to_range)
+		return err;
+
+	return l4proto->nlattr_to_range(tb, range);
+}
+
+/* Netlink policy for CTA_NAT_*: IPv4/IPv6 address bounds plus nested proto part. */
+static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
+ [CTA_NAT_V4_MINIP] = { .type = NLA_U32 },
+ [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 },
+ [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_PROTO] = { .type = NLA_NESTED },
+};
+
+/*
+ * Fill @range from the netlink NAT attribute @nat: the address part is
+ * parsed by @l3proto and, when CTA_NAT_PROTO is present, the protocol
+ * part by the conntrack's L4 protocol. @range is zeroed first.
+ * Returns 0 or the result of the failing parser.
+ */
+static int
+nfnetlink_parse_nat(const struct nlattr *nat,
+		    const struct nf_conn *ct, struct nf_nat_range *range,
+		    const struct nf_nat_l3proto *l3proto)
+{
+	struct nlattr *tb[CTA_NAT_MAX+1];
+	int err;
+
+	memset(range, 0, sizeof(*range));
+
+	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
+	if (err < 0)
+		return err;
+
+	err = l3proto->nlattr_to_range(tb, range);
+	if (err < 0)
+		return err;
+
+	if (tb[CTA_NAT_PROTO])
+		return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
+
+	return 0;
+}
+
+/*
+ * ctnetlink entry point for NAT setup on a new conntrack.
+ * This function is called under rcu_read_lock().
+ *
+ * Returns -EEXIST if @manip is already initialised on @ct, -EAGAIN if
+ * no NAT L3 protocol is registered for the conntrack's family, a
+ * negative parse error, or otherwise the result of the binding setup.
+ */
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  const struct nlattr *attr)
+{
+	const struct nf_nat_l3proto *l3proto;
+	struct nf_nat_range range;
+	int err;
+
+	/* Should not happen, restricted to creating new conntracks
+	 * via ctnetlink.
+	 */
+	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
+		return -EEXIST;
+
+	/* Make sure the L3 NAT protocol is present before calling
+	 * nf_nat_setup_info() to attach a binding, otherwise this may oops.
+	 */
+	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+	if (!l3proto)
+		return -EAGAIN;
+
+	if (attr != NULL) {
+		err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
+		if (err < 0)
+			return err;
+
+		return nf_nat_setup_info(ct, &range, manip);
+	}
+
+	/* No NAT information has been passed, allocate the null-binding */
+	return __nf_nat_alloc_null_binding(ct, manip);
+}
+#else
+/* Built without ctnetlink support: NAT setup via netlink is not available. */
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/*
+ * Per-namespace init: allocate the bysource hash table, sized like the
+ * conntrack hash table. Returns 0 or -ENOMEM.
+ */
+static int __net_init nf_nat_net_init(struct net *net)
+{
+	/* Leave them the same for the moment. */
+	net->ct.nat_htable_size = net->ct.htable_size;
+	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
+
+	return net->ct.nat_bysource ? 0 : -ENOMEM;
+}
+
+/*
+ * Per-namespace exit: run nf_nat_proto_clean() over all conntracks with
+ * an all-wildcard filter, wait for an RCU grace period, then free the
+ * bysource hash table.
+ */
+static void __net_exit nf_nat_net_exit(struct net *net)
+{
+ struct nf_nat_proto_clean clean = {};
+
+ nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
+ synchronize_rcu();
+ nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
+}
+
+/* Per-network-namespace init/exit callbacks for the NAT core. */
+static struct pernet_operations nf_nat_net_ops = {
+ .init = nf_nat_net_init,
+ .exit = nf_nat_net_exit,
+};
+
+/* Named expectfn entry that exposes nf_nat_follow_master as "nat-follow-master". */
+static struct nf_ct_helper_expectfn follow_master_nat = {
+ .name = "nat-follow-master",
+ .expectfn = nf_nat_follow_master,
+};
+
+/*
+ * Module init: register the NAT conntrack extension and the pernet
+ * operations, register the follow-master expectfn, mark the untracked
+ * conntrack as NAT-done and install the ctnetlink (and, with
+ * CONFIG_XFRM, decode-session) hooks. Returns 0 or the error from the
+ * failing registration.
+ */
+static int __init nf_nat_init(void)
+{
+ int ret;
+
+ ret = nf_ct_extend_register(&nat_extend);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&nf_nat_net_ops);
+ if (ret < 0)
+ goto cleanup_extend;
+
+ nf_ct_helper_expectfn_register(&follow_master_nat);
+
+ /* Initialize fake conntrack so that NAT will skip it */
+ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
+
+ /* Both hooks must still be unset when this module loads. */
+ BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
+ RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
+ nfnetlink_parse_nat_setup);
+#ifdef CONFIG_XFRM
+ BUG_ON(nf_nat_decode_session_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
+#endif
+ return 0;
+
+ cleanup_extend:
+ nf_ct_extend_unregister(&nat_extend);
+ return ret;
+}
+
+/*
+ * Module exit: unregister the pernet operations, the NAT extension and
+ * the follow-master expectfn, clear the hooks installed by
+ * nf_nat_init(), and release the per-family L4 protocol tables.
+ *
+ * The tables are freed only after synchronize_net(): a reader that
+ * picked up one of the hooks just before it was cleared may still be
+ * inside an RCU read-side section looking protocols up in
+ * nf_nat_l4protos[], so the grace period has to elapse before the
+ * memory goes away.
+ */
+static void __exit nf_nat_cleanup(void)
+{
+	unsigned int i;
+
+	unregister_pernet_subsys(&nf_nat_net_ops);
+	nf_ct_extend_unregister(&nat_extend);
+	nf_ct_helper_expectfn_unregister(&follow_master_nat);
+	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
+#ifdef CONFIG_XFRM
+	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
+#endif
+	/* Let in-flight RCU readers drain before freeing what they use. */
+	synchronize_net();
+
+	for (i = 0; i < NFPROTO_NUMPROTO; i++)
+		kfree(nf_nat_l4protos[i]);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(nf_nat_init);
+module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
new file mode 100644
index 00000000000..e84a578dbe3
--- /dev/null
+++ b/net/netfilter/nf_nat_ftp.c
@@ -0,0 +1,146 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+MODULE_ALIAS("ip_nat_ftp");
+
+/* FIXME: Time out? --RR */
+
+/*
+ * Format the address/port argument of an FTP command of kind @type
+ * into @buffer (at most @buflen bytes). Returns the snprintf() result,
+ * or 0 for a @type not handled here.
+ */
+static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type,
+			      char *buffer, size_t buflen,
+			      union nf_inet_addr *addr, u16 port)
+{
+	const unsigned char *quad = (unsigned char *)&addr->ip;
+
+	switch (type) {
+	case NF_CT_FTP_PORT:
+	case NF_CT_FTP_PASV:
+		/* Four address bytes, then port high byte and low byte. */
+		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+				quad[0], quad[1], quad[2], quad[3],
+				port >> 8,
+				port & 0xFF);
+	case NF_CT_FTP_EPRT:
+		if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+			return snprintf(buffer, buflen, "|1|%pI4|%u|",
+					&addr->ip, port);
+		return snprintf(buffer, buflen, "|2|%pI6|%u|",
+				&addr->ip6, port);
+	case NF_CT_FTP_EPSV:
+		return snprintf(buffer, buflen, "|||%u|", port);
+	}
+
+	return 0;
+}
+
+/*
+ * So, this packet has hit the connection tracking matching code.
+ * Mangle it, and change the expectation to match the new version.
+ *
+ * Returns NF_ACCEPT on success, NF_DROP if no port could be reserved
+ * for the expectation or if the packet could not be rewritten.
+ */
+static unsigned int nf_nat_ftp(struct sk_buff *skb,
+			       enum ip_conntrack_info ctinfo,
+			       enum nf_ct_ftp_type type,
+			       unsigned int protoff,
+			       unsigned int matchoff,
+			       unsigned int matchlen,
+			       struct nf_conntrack_expect *exp)
+{
+	char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
+	struct nf_conn *ct = exp->master;
+	int dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr newaddr;
+	unsigned int buflen;
+	u_int16_t port;
+
+	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+	/* Connection will come from wherever this packet goes, hence !dir */
+	newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = !dir;
+
+	/* The expected connection gets the same NAT as this one. */
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Prefer the announced port; on -EBUSY walk upwards until the
+	 * 16-bit counter wraps to 0, which doubles as the failure marker.
+	 */
+	port = ntohs(exp->saved_proto.tcp.port);
+	while (port != 0) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		if (ret != -EBUSY) {
+			port = 0;
+			break;
+		}
+		port++;
+	}
+
+	if (port == 0) {
+		nf_ct_helper_log(skb, ct, "all ports in use");
+		return NF_DROP;
+	}
+
+	buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer),
+				    &newaddr, port);
+	if (buflen == 0)
+		goto out;
+
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+				      matchlen, buffer, buflen))
+		goto out;
+
+	return NF_ACCEPT;
+
+out:
+	/* Rewrite failed: withdraw the expectation registered above. */
+	nf_ct_helper_log(skb, ct, "cannot mangle packet");
+	nf_ct_unexpect_related(exp);
+	return NF_DROP;
+}
+
+/* Module exit: clear the FTP NAT hook, then wait for an RCU grace period. */
+static void __exit nf_nat_ftp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
+ synchronize_rcu();
+}
+
+/* Module init: install nf_nat_ftp() as the FTP NAT hook; the hook must be unset. */
+static int __init nf_nat_ftp_init(void)
+{
+ BUG_ON(nf_nat_ftp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users.
+ *
+ * Setting "ports" is accepted (returns 0) and only logs a notice; the
+ * value itself is ignored.
+ */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_ftp_init);
+module_exit(nf_nat_ftp_fini);
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
new file mode 100644
index 00000000000..2840abb5bb9
--- /dev/null
+++ b/net/netfilter/nf_nat_helper.c
@@ -0,0 +1,212 @@
+/* nf_nat_helper.c - generic support functions for NAT helpers
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+
+/* Frobs data inside this packet, which is linear: replaces match_len bytes
+ * at match_offset (relative to dataoff, itself relative to the network
+ * header) with rep_len bytes from rep_buffer, then fixes up skb->len and
+ * the IPv4/IPv6 length fields.  skb_put() is used unchecked when growing,
+ * so the callers make sure there is enough tailroom beforehand. */
+static void mangle_contents(struct sk_buff *skb, unsigned int dataoff,
+			    unsigned int match_offset, unsigned int match_len,
+			    const char *rep_buffer, unsigned int rep_len)
+{
+	unsigned char *data;
+
+	BUG_ON(skb_is_nonlinear(skb));
+	data = skb_network_header(skb) + dataoff;
+
+	/* move post-replacement */
+	memmove(data + match_offset + rep_len,
+		data + match_offset + match_len,
+		skb_tail_pointer(skb) - (data + match_offset + match_len));
+
+	/* insert data from buffer */
+	memcpy(data + match_offset, rep_buffer, rep_len);
+
+	/* update skb info */
+	if (rep_len > match_len) {
+		pr_debug("nf_nat_mangle_packet: Extending packet by "
+			 "%u from %u bytes\n", rep_len - match_len, skb->len);
+		skb_put(skb, rep_len - match_len);
+	} else {
+		pr_debug("nf_nat_mangle_packet: Shrinking packet by "
+			 "%u from %u bytes\n", match_len - rep_len, skb->len);
+		__skb_trim(skb, skb->len + rep_len - match_len);
+	}
+
+	if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+		/* fix IP hdr checksum information */
+		ip_hdr(skb)->tot_len = htons(skb->len);
+		ip_send_check(ip_hdr(skb));
+	} else
+		ipv6_hdr(skb)->payload_len =
+			htons(skb->len - sizeof(struct ipv6hdr));
+}
+
+/* Unusual, but possible case: grow the tail of skb so that extra more bytes
+ * fit.  Returns 1 on success, 0 if the packet would exceed 65535 bytes or
+ * pskb_expand_head() fails. */
+static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
+{
+	if (skb->len + extra > 65535)
+		return 0;
+
+	return pskb_expand_head(skb, 0, extra - skb_tailroom(skb),
+				GFP_ATOMIC) ? 0 : 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).  Replaces match_len payload bytes at match_offset with
+ * rep_len bytes of rep_buffer, enlarging the skb if the tailroom is short,
+ * then recalculates the TCP checksum.  If adjust is true and the length
+ * changed, the delta is recorded with nf_ct_seqadj_set().
+ *
+ * Returns 1 on success, 0 if the skb cannot be made writable or enlarged. */
+int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len, bool adjust)
+{
+ const struct nf_nat_l3proto *l3proto;
+ struct tcphdr *tcph;
+ int oldlen, datalen;
+
+ if (!skb_make_writable(skb, skb->len)) /* the whole packet is requested */
+ return 0;
+
+ if (rep_len > match_len &&
+ rep_len - match_len > skb_tailroom(skb) &&
+ !enlarge_skb(skb, rep_len - match_len))
+ return 0;
+
+ SKB_LINEAR_ASSERT(skb);
+
+ tcph = (void *)skb->data + protoff; /* taken after any enlargement above */
+
+ oldlen = skb->len - protoff; /* transport length before mangling */
+ mangle_contents(skb, protoff + tcph->doff*4,
+ match_offset, match_len, rep_buffer, rep_len);
+
+ datalen = skb->len - protoff; /* transport length after mangling */
+
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check,
+ datalen, oldlen);
+
+ if (adjust && rep_len != match_len)
+ nf_ct_seqadj_set(ct, ctinfo, tcph->seq,
+ (int)rep_len - (int)match_len);
+
+ return 1;
+}
+EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol).
+ *
+ * Takes care of skb enlargement, the UDP length field and checksumming;
+ * unlike the TCP variant there are no sequence numbers to adjust.  Returns 1
+ * on success, 0 if the skb cannot be made writable or enlarged.
+ *
+ * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
+ * should be fairly easy to do. */
+int
+nf_nat_mangle_udp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ const struct nf_nat_l3proto *l3proto;
+ struct udphdr *udph;
+ int datalen, oldlen;
+
+ if (!skb_make_writable(skb, skb->len)) /* the whole packet is requested */
+ return 0;
+
+ if (rep_len > match_len &&
+ rep_len - match_len > skb_tailroom(skb) &&
+ !enlarge_skb(skb, rep_len - match_len))
+ return 0;
+
+ udph = (void *)skb->data + protoff; /* taken after any enlargement above */
+
+ oldlen = skb->len - protoff; /* transport length before mangling */
+ mangle_contents(skb, protoff + sizeof(*udph),
+ match_offset, match_len, rep_buffer, rep_len);
+
+ /* update the length of the UDP packet */
+ datalen = skb->len - protoff;
+ udph->len = htons(datalen);
+
+ /* fix udp checksum if udp checksum was previously calculated */
+ if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
+ return 1; /* zero checksum and not CHECKSUM_PARTIAL: nothing to fix */
+
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check,
+ datalen, oldlen);
+
+ return 1;
+}
+EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
+
+/* Set up NAT on this expected conntrack so that it follows its master.
+ * If we fail to get a free NAT slot, we'll get dropped on confirm */
+void nf_nat_follow_master(struct nf_conn *ct,
+			  struct nf_conntrack_expect *exp)
+{
+	const struct nf_conntrack_tuple *master_tuple;
+	struct nf_nat_range range;
+
+	/* This must be a fresh one. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+	master_tuple = &ct->master->tuplehash[!exp->dir].tuple;
+
+	/* Change src to where master sends to */
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = range.max_addr = master_tuple->dst.u3;
+	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+	range.min_proto = range.max_proto = exp->saved_proto;
+	range.min_addr = range.max_addr = master_tuple->src.u3;
+	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
new file mode 100644
index 00000000000..1fb2258c353
--- /dev/null
+++ b/net/netfilter/nf_nat_irc.c
@@ -0,0 +1,119 @@
+/* IRC extension for TCP NAT alteration.
+ *
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_irc");
+
+/* NAT a DCC request: expect the connection and rewrite address and port. */
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int protoff,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct nf_conntrack_expect *exp)
+{
+	char buffer[sizeof("4294967295 65535")];
+	struct nf_conn *ct = exp->master;
+	union nf_inet_addr newaddr;
+	u_int16_t port;
+
+	/* Reply comes from server. */
+	newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3;
+
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_REPLY;
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int err;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		err = nf_ct_expect_related(exp);
+		if (err == 0)
+			break;
+		else if (err != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0) {
+		nf_ct_helper_log(skb, ct, "all ports in use");
+		return NF_DROP;
+	}
+
+	/* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+	 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+	 * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+	 * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+	 * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+	 *
+	 * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+	 *                        255.255.255.255==4294967295, 10 digits)
+	 * P:         bound port (min 1 d, max 5d (65535))
+	 * F:         filename   (min 1 d )
+	 * S:         size       (min 1 d )
+	 * 0x01, \n:  terminators
+	 */
+	/* AAA = "us", ie. where server normally talks to. */
+	snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port);
+	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
+		 buffer, &newaddr.ip, port);
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+				      matchlen, buffer, strlen(buffer))) {
+		nf_ct_helper_log(skb, ct, "cannot mangle packet");
+		nf_ct_unexpect_related(exp);
+		return NF_DROP;
+	}
+
+	return NF_ACCEPT;
+}
+
+static void __exit nf_nat_irc_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_irc_hook, NULL); /* unpublish the IRC NAT hook */
+ synchronize_rcu(); /* wait for an RCU grace period before returning */
+}
+
+static int __init nf_nat_irc_init(void)
+{
+ BUG_ON(nf_nat_irc_hook != NULL); /* the hook must not be set already */
+ RCU_INIT_POINTER(nf_nat_irc_hook, help); /* publish help() as the hook */
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users: the setter below only logs a notice. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0; /* returns 0 unconditionally; val and kp are not used */
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_irc_init);
+module_exit(nf_nat_irc_fini);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
new file mode 100644
index 00000000000..83a72a235ca
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -0,0 +1,114 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/netfilter.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+/* True if the tuple's src (SRC manip) or dst port lies within [min, max]. */
+bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
+			     enum nf_nat_manip_type maniptype,
+			     const union nf_conntrack_man_proto *min,
+			     const union nf_conntrack_man_proto *max)
+{
+	const __be16 port = (maniptype == NF_NAT_MANIP_SRC) ?
+			    tuple->src.u.all : tuple->dst.u.all;
+	const u16 val = ntohs(port);
+
+	if (val < ntohs(min->all))
+		return false;
+
+	return val <= ntohs(max->all);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
+
+void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct,
+ u16 *rover) /* in/out: offset the search starts from when not randomised */
+{
+ unsigned int range_size, min, i;
+ __be16 *portptr;
+ u_int16_t off;
+
+ if (maniptype == NF_NAT_MANIP_SRC) /* select the port field to rewrite */
+ portptr = &tuple->src.u.all;
+ else
+ portptr = &tuple->dst.u.all;
+
+ /* If no range specified, derive one from the port the tuple has now */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == NF_NAT_MANIP_DST)
+ return;
+
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >= 512 is credential passing; stay in the same class */
+ if (ntohs(*portptr) < 512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min_proto.all);
+ range_size = ntohs(range->max_proto.all) - min + 1; /* NOTE(review): 0 if max == min - 1, and used as a divisor below; confirm callers validate the range */
+ }
+
+ if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
+ off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
+ ? tuple->dst.u.all
+ : tuple->src.u.all);
+ } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
+ off = prandom_u32();
+ } else {
+ off = *rover; /* no random flag: start from the stored rover */
+ }
+
+ for (i = 0; ; ++off) { /* off is 16 bits wide, so ++off may wrap */
+ *portptr = htons(min + off % range_size);
+ if (++i != range_size && nf_nat_used_tuple(tuple, ct)) /* on try number range_size the candidate is kept without the in-use check */
+ continue;
+ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) /* presumably RANDOM_ALL covers both random flags; verify */
+ *rover = off;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range) /* updated in place; returns 0 unconditionally */
+{
+ if (tb[CTA_PROTONAT_PORT_MIN]) {
+ range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
+ range->max_proto.all = range->min_proto.all; /* max defaults to min */
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ if (tb[CTA_PROTONAT_PORT_MAX]) { /* an explicit max overrides the default */
+ range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ return 0; /* NOTE(review): min <= max is not checked here */
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range);
+#endif
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
new file mode 100644
index 00000000000..c8be2cdac0b
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -0,0 +1,116 @@
+/*
+ * DCCP NAT protocol helper
+ *
+ * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/dccp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u_int16_t dccp_port_rover;
+
+/* Pick a unique DCCP tuple via the common allocator and the DCCP rover. */
+static void dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+			      struct nf_conntrack_tuple *tuple,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype,
+			      const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &dccp_port_rover);
+}
+
+/* Rewrite the DCCP source or destination port of skb to the one in tuple
+ * and, when the full DCCP header is present, patch the checksum to match.
+ * Returns false only if the header cannot be made writable. */
+static bool dccp_manip_pkt(struct sk_buff *skb,
+			   const struct nf_nat_l3proto *l3proto,
+			   unsigned int iphdroff, unsigned int hdroff,
+			   const struct nf_conntrack_tuple *tuple,
+			   enum nf_nat_manip_type maniptype)
+{
+	const bool src_manip = (maniptype == NF_NAT_MANIP_SRC);
+	struct dccp_hdr *dh;
+	__be16 *field, before, after;
+	int hdrsize;
+
+	/* DCCP connection tracking guarantees this much (8 bytes) */
+	hdrsize = (skb->len >= hdroff + sizeof(struct dccp_hdr)) ?
+		  sizeof(struct dccp_hdr) : 8;
+
+	if (!skb_make_writable(skb, hdroff + hdrsize))
+		return false;
+
+	dh = (struct dccp_hdr *)(skb->data + hdroff);
+	/* select the port to rewrite and its new value */
+	field = src_manip ? &dh->dccph_sport : &dh->dccph_dport;
+	after = src_manip ? tuple->src.u.dccp.port : tuple->dst.u.dccp.port;
+
+	before = *field;
+	*field = after;
+
+	if (hdrsize >= sizeof(*dh)) {
+		/* full header available: fix up the checksum too */
+		l3proto->csum_update(skb, iphdroff, &dh->dccph_checksum,
+				     tuple, maniptype);
+		inet_proto_csum_replace2(&dh->dccph_checksum, skb,
+					 before, after, 0);
+	}
+
+	return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { /* DCCP NAT operations */
+ .l4proto = IPPROTO_DCCP,
+ .manip_pkt = dccp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range, /* shared port-range check */
+ .unique_tuple = dccp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, /* only built with ctnetlink */
+#endif
+};
+
+/* Register the DCCP NAT handler for IPv4 and IPv6; undo IPv4 if IPv6 fails. */
+static int __init nf_nat_proto_dccp_init(void)
+{
+	int err;
+
+	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+	if (err < 0)
+		return err;
+
+	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
+	if (err < 0) {
+		nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+		return err;
+	}
+
+	return 0;
+}
+
+static void __exit nf_nat_proto_dccp_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp); /* reverse of the init order */
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+
+}
+
+module_init(nf_nat_proto_dccp_init);
+module_exit(nf_nat_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP NAT protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
new file mode 100644
index 00000000000..754536f2c67
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sctp.h>
+#include <linux/module.h>
+#include <net/sctp/checksum.h>
+
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u_int16_t nf_sctp_port_rover;
+
+/* Pick a unique SCTP tuple via the common allocator and the SCTP rover. */
+static void sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+			      struct nf_conntrack_tuple *tuple,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype,
+			      const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &nf_sctp_port_rover);
+}
+
+/* Rewrite the SCTP source or destination port of skb to the one in tuple
+ * and recompute the SCTP checksum with sctp_compute_cksum(). */
+static bool sctp_manip_pkt(struct sk_buff *skb,
+			   const struct nf_nat_l3proto *l3proto,
+			   unsigned int iphdroff, unsigned int hdroff,
+			   const struct nf_conntrack_tuple *tuple,
+			   enum nf_nat_manip_type maniptype)
+{
+	sctp_sctphdr_t *sh;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*sh)))
+		return false;
+
+	sh = (struct sctphdr *)(skb->data + hdroff);
+	switch (maniptype) {
+	case NF_NAT_MANIP_SRC:
+		sh->source = tuple->src.u.sctp.port;
+		break;
+	default:
+		sh->dest = tuple->dst.u.sctp.port;
+		break;
+	}
+
+	sh->checksum = sctp_compute_cksum(skb, hdroff);
+	return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { /* SCTP NAT operations */
+ .l4proto = IPPROTO_SCTP,
+ .manip_pkt = sctp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range, /* shared port-range check */
+ .unique_tuple = sctp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, /* only built with ctnetlink */
+#endif
+};
+
+/* Register the SCTP NAT handler for IPv4 and IPv6; undo IPv4 if IPv6 fails. */
+static int __init nf_nat_proto_sctp_init(void)
+{
+	int err;
+
+	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+	if (err < 0)
+		return err;
+
+	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
+	if (err < 0) {
+		nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+		return err;
+	}
+
+	return 0;
+}
+
+static void __exit nf_nat_proto_sctp_exit(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp); /* reverse of the init order */
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+}
+
+module_init(nf_nat_proto_sctp_init);
+module_exit(nf_nat_proto_sctp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SCTP NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
new file mode 100644
index 00000000000..83ec8a6e4c3
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -0,0 +1,85 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+
+static u16 tcp_port_rover;
+
+/* Pick a unique TCP tuple via the common allocator and the TCP rover. */
+static void tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+			     struct nf_conntrack_tuple *tuple,
+			     const struct nf_nat_range *range,
+			     enum nf_nat_manip_type maniptype,
+			     const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &tcp_port_rover);
+}
+
+/* Rewrite the TCP source or destination port of skb to the one in tuple
+ * and, when the full TCP header is present, patch the checksum to match.
+ * Returns false only if the header cannot be made writable. */
+static bool tcp_manip_pkt(struct sk_buff *skb,
+			  const struct nf_nat_l3proto *l3proto,
+			  unsigned int iphdroff, unsigned int hdroff,
+			  const struct nf_conntrack_tuple *tuple,
+			  enum nf_nat_manip_type maniptype)
+{
+	const bool src_manip = (maniptype == NF_NAT_MANIP_SRC);
+	struct tcphdr *th;
+	__be16 *field, before, after;
+	int hdrsize;
+
+	/* this could be an inner header returned in an icmp packet; in
+	 * such cases we cannot update the checksum field since it is
+	 * outside of the 8 bytes of transport layer headers we are
+	 * guaranteed */
+	hdrsize = (skb->len >= hdroff + sizeof(struct tcphdr)) ?
+		  sizeof(struct tcphdr) : 8;
+
+	if (!skb_make_writable(skb, hdroff + hdrsize))
+		return false;
+
+	th = (struct tcphdr *)(skb->data + hdroff);
+
+	/* select the port to rewrite and its new value */
+	field = src_manip ? &th->source : &th->dest;
+	after = src_manip ? tuple->src.u.tcp.port : tuple->dst.u.tcp.port;
+
+	before = *field;
+	*field = after;
+
+	if (hdrsize >= sizeof(*th)) {
+		/* full header available: fix up the checksum too */
+		l3proto->csum_update(skb, iphdroff, &th->check,
+				     tuple, maniptype);
+		inet_proto_csum_replace2(&th->check, skb, before, after, 0);
+	}
+
+	return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_tcp = { /* TCP NAT operations; non-static, no registration call in this file */
+ .l4proto = IPPROTO_TCP,
+ .manip_pkt = tcp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range, /* shared port-range check */
+ .unique_tuple = tcp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, /* only built with ctnetlink */
+#endif
+};
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
new file mode 100644
index 00000000000..7df613fb34a
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -0,0 +1,76 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u16 udp_port_rover;
+
+/* Pick a unique UDP tuple via the common allocator and the UDP rover. */
+static void udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+			     struct nf_conntrack_tuple *tuple,
+			     const struct nf_nat_range *range,
+			     enum nf_nat_manip_type maniptype,
+			     const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &udp_port_rover);
+}
+
+/* Rewrite the UDP source or destination port of skb to the one in tuple.
+ * The checksum is only patched when the checksum field is non-zero or
+ * skb->ip_summed is CHECKSUM_PARTIAL. */
+static bool udp_manip_pkt(struct sk_buff *skb,
+			  const struct nf_nat_l3proto *l3proto,
+			  unsigned int iphdroff, unsigned int hdroff,
+			  const struct nf_conntrack_tuple *tuple,
+			  enum nf_nat_manip_type maniptype)
+{
+	const bool src_manip = (maniptype == NF_NAT_MANIP_SRC);
+	struct udphdr *uh;
+	__be16 *field, after;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*uh)))
+		return false;
+
+	uh = (struct udphdr *)(skb->data + hdroff);
+	field = src_manip ? &uh->source : &uh->dest;
+	after = src_manip ? tuple->src.u.udp.port : tuple->dst.u.udp.port;
+
+	if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		l3proto->csum_update(skb, iphdroff, &uh->check,
+				     tuple, maniptype);
+		inet_proto_csum_replace2(&uh->check, skb, *field, after, 0);
+
+		/* a resulting 0 is stored as CSUM_MANGLED_0 instead */
+		if (!uh->check)
+			uh->check = CSUM_MANGLED_0;
+	}
+
+	/* the old port was still needed above for the checksum delta */
+	*field = after;
+	return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_udp = { /* UDP NAT operations; non-static, no registration call in this file */
+ .l4proto = IPPROTO_UDP,
+ .manip_pkt = udp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range, /* shared port-range check */
+ .unique_tuple = udp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, /* only built with ctnetlink */
+#endif
+};
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
new file mode 100644
index 00000000000..776a0d1317b
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -0,0 +1,106 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u16 udplite_port_rover;
+
+/* Pick a unique UDP-Lite tuple via the common allocator and its rover. */
+static void udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
+				 struct nf_conntrack_tuple *tuple,
+				 const struct nf_nat_range *range,
+				 enum nf_nat_manip_type maniptype,
+				 const struct nf_conn *ct)
+{
+	nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+				    &udplite_port_rover);
+}
+
+/* Rewrite the UDP-Lite source or destination port of skb to the one in
+ * tuple and unconditionally patch the checksum to match.
+ * Returns false only if the header cannot be made writable. */
+static bool udplite_manip_pkt(struct sk_buff *skb,
+			      const struct nf_nat_l3proto *l3proto,
+			      unsigned int iphdroff, unsigned int hdroff,
+			      const struct nf_conntrack_tuple *tuple,
+			      enum nf_nat_manip_type maniptype)
+{
+	const bool src_manip = (maniptype == NF_NAT_MANIP_SRC);
+	struct udphdr *uh;
+	__be16 *field, after;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*uh)))
+		return false;
+
+	uh = (struct udphdr *)(skb->data + hdroff);
+
+	field = src_manip ? &uh->source : &uh->dest;
+	after = src_manip ? tuple->src.u.udp.port : tuple->dst.u.udp.port;
+
+	l3proto->csum_update(skb, iphdroff, &uh->check, tuple, maniptype);
+	inet_proto_csum_replace2(&uh->check, skb, *field, after, 0);
+
+	/* a resulting 0 is stored as CSUM_MANGLED_0 instead */
+	if (!uh->check)
+		uh->check = CSUM_MANGLED_0;
+
+	/* the old port was still needed above for the checksum delta */
+	*field = after;
+
+	return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { /* UDP-Lite NAT operations */
+ .l4proto = IPPROTO_UDPLITE,
+ .manip_pkt = udplite_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range, /* shared port-range check */
+ .unique_tuple = udplite_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, /* only built with ctnetlink */
+#endif
+};
+
+/* Register the UDP-Lite NAT handler for IPv4 and IPv6; undo IPv4 on error. */
+static int __init nf_nat_proto_udplite_init(void)
+{
+	int err;
+
+	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+	if (err < 0)
+		return err;
+	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
+	if (err < 0) {
+		nf_nat_l4proto_unregister(NFPROTO_IPV4,
+					  &nf_nat_l4proto_udplite);
+		return err;
+	}
+
+	return 0;
+}
+
+static void __exit nf_nat_proto_udplite_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite); /* reverse of the init order */
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+}
+
+module_init(nf_nat_proto_udplite_init);
+module_exit(nf_nat_proto_udplite_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
new file mode 100644
index 00000000000..6e494d58441
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -0,0 +1,54 @@
+/* The "unknown" protocol. This is what is used for protocols we
+ * don't understand. It's returned by ip_ct_find_proto().
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type manip_type,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return true; /* every tuple counts as in range; no argument is inspected */
+}
+
+static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ /* Sorry: we can't help you; if it's not unique, we can't frob
+ * anything. The tuple is left exactly as it was passed in.
+ */
+ return;
+}
+
+static bool
+unknown_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ return true; /* the packet is not touched here; always reports success */
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_unknown = { /* no-op handlers; .l4proto is left unset */
+ .manip_pkt = unknown_manip_pkt,
+ .in_range = unknown_in_range,
+ .unique_tuple = unknown_unique_tuple,
+};
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
new file mode 100644
index 00000000000..b4d691db955
--- /dev/null
+++ b/net/netfilter/nf_nat_sip.c
@@ -0,0 +1,653 @@
+/* SIP extension for NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+MODULE_ALIAS("ip_nat_sip");
+
+
+/* Replace @matchlen bytes at @matchoff (relative to @dataoff) with the
+ * @buflen bytes in @buffer.
+ *
+ * TCP conntracks are mangled with __nf_nat_mangle_tcp_packet(), everything
+ * else with nf_nat_mangle_udp_packet(). On success *dptr is reloaded from
+ * skb->data and *datalen is adjusted by the size difference.
+ *
+ * Returns 1 on success, 0 if the mangling helper failed.
+ */
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff, unsigned int matchlen,
+ const char *buffer, unsigned int buflen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct tcphdr *th;
+ unsigned int baseoff;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP) {
+ th = (struct tcphdr *)(skb->data + protoff);
+ /* baseoff: first byte after the TCP header (incl. options) */
+ baseoff = protoff + th->doff * 4;
+ /* rebase matchoff from dataoff-relative to baseoff-relative */
+ matchoff += dataoff - baseoff;
+
+ if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, buflen, false))
+ return 0;
+ } else {
+ /* baseoff: first byte after the fixed-size UDP header */
+ baseoff = protoff + sizeof(struct udphdr);
+ matchoff += dataoff - baseoff;
+
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, buflen))
+ return 0;
+ }
+
+ /* Reload data pointer and adjust datalen value */
+ *dptr = skb->data + dataoff;
+ *datalen += buflen - matchlen;
+ return 1;
+}
+
+/* Format @addr as text into @buffer.
+ *
+ * IPv4 conntracks use "%pI4". Otherwise "%pI6c" is used, wrapped in
+ * square brackets when @delim is true. The write is an unbounded
+ * sprintf(), so the caller must supply a large enough buffer.
+ *
+ * Returns the value returned by sprintf().
+ */
+static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer,
+ const union nf_inet_addr *addr, bool delim)
+{
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return sprintf(buffer, "%pI4", &addr->ip);
+ else {
+ if (delim)
+ return sprintf(buffer, "[%pI6c]", &addr->ip6);
+ else
+ return sprintf(buffer, "%pI6c", &addr->ip6);
+ }
+}
+
+/* Format @addr and @port as "addr:port" into @buffer ("[addr]:port" when
+ * the conntrack is not IPv4). @port is printed as given, so callers pass
+ * it in host byte order. The write is an unbounded sprintf().
+ *
+ * Returns the value returned by sprintf().
+ */
+static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer,
+ const union nf_inet_addr *addr, u16 port)
+{
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return sprintf(buffer, "%pI4:%u", &addr->ip, port);
+ else
+ return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port);
+}
+
+/* Rewrite an address:port found in the message if it belongs to this
+ * connection.
+ *
+ * If (@addr, @port) equals the source of the tuple for the packet's
+ * direction, it is replaced by the destination of the opposite direction's
+ * tuple. If it equals the destination, it is replaced by the source of the
+ * opposite direction's tuple, with the port taken from forced_dport when
+ * that is non-zero.
+ *
+ * Returns 1 without touching the packet when neither case matches or when
+ * the replacement equals the original; otherwise returns the result of
+ * mangle_packet() (1 on success, 0 on failure).
+ */
+static int map_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff, unsigned int matchlen,
+ union nf_inet_addr *addr, __be16 port)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+ unsigned int buflen;
+ union nf_inet_addr newaddr;
+ __be16 newport;
+
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) &&
+ ct->tuplehash[dir].tuple.src.u.udp.port == port) {
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+ newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+ } else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) &&
+ ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
+ newaddr = ct->tuplehash[!dir].tuple.src.u3;
+ /* GNU "?:" - use forced_dport when it is non-zero */
+ newport = ct_sip_info->forced_dport ? :
+ ct->tuplehash[!dir].tuple.src.u.udp.port;
+ } else
+ return 1;
+
+ /* Nothing to rewrite if the mapping is the identity */
+ if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
+ return 1;
+
+ buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen);
+}
+
+/* Look up the URI of the SIP header @type and pass its address:port to
+ * map_addr().
+ *
+ * Returns 1 when ct_sip_parse_header_uri() does not return a positive
+ * value (nothing to do); otherwise returns map_addr()'s result.
+ */
+static int map_sip_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ enum sip_header_types type)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchlen, matchoff;
+ union nf_inet_addr addr;
+ __be16 port;
+
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
+ &matchoff, &matchlen, &addr, &port) <= 0)
+ return 1;
+ return map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port);
+}
+
+/* NAT fix-ups for one SIP message.
+ *
+ * In order: the request URI (only when the message does not start with
+ * "SIP/2.0"), the topmost Via header together with its maddr=, received=
+ * and rport= parameters, every Contact header, and the From and To
+ * headers. For reply-direction packets with forced_dport set, the
+ * transport destination port is overwritten as well.
+ *
+ * Returns NF_ACCEPT, or NF_DROP (after logging) as soon as any rewrite
+ * fails.
+ */
+static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ unsigned int coff, matchoff, matchlen;
+ enum sip_header_types hdr;
+ union nf_inet_addr addr;
+ __be16 port;
+ int request, in_header;
+
+ /* Basic rules: requests and responses. */
+ if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
+ if (ct_sip_parse_request(ct, *dptr, *datalen,
+ &matchoff, &matchlen,
+ &addr, &port) > 0 &&
+ !map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SIP message");
+ return NF_DROP;
+ }
+ request = 1;
+ } else
+ request = 0;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP)
+ hdr = SIP_HDR_VIA_TCP;
+ else
+ hdr = SIP_HDR_VIA_UDP;
+
+ /* Translate topmost Via header and parameters */
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ hdr, NULL, &matchoff, &matchlen,
+ &addr, &port) > 0) {
+ unsigned int olen, matchend, poff, plen, buflen, n;
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+
+ /* We're only interested in headers related to this
+ * connection */
+ if (request) {
+ if (!nf_inet_addr_cmp(&addr,
+ &ct->tuplehash[dir].tuple.src.u3) ||
+ port != ct->tuplehash[dir].tuple.src.u.udp.port)
+ goto next;
+ } else {
+ if (!nf_inet_addr_cmp(&addr,
+ &ct->tuplehash[dir].tuple.dst.u3) ||
+ port != ct->tuplehash[dir].tuple.dst.u.udp.port)
+ goto next;
+ }
+
+ olen = *datalen;
+ if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle Via header");
+ return NF_DROP;
+ }
+
+ /* End of the Via address, corrected for any size change made
+ * by map_addr() above. */
+ matchend = matchoff + matchlen + *datalen - olen;
+
+ /* The maddr= parameter (RFC 3261) specifies where to send
+ * the reply. */
+ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+ "maddr=", &poff, &plen,
+ &addr, true) > 0 &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
+ !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
+ buflen = sip_sprintf_addr(ct, buffer,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ true);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle maddr");
+ return NF_DROP;
+ }
+ }
+
+ /* The received= parameter (RFC 3261) contains the address
+ * from which the server received the request. */
+ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+ "received=", &poff, &plen,
+ &addr, false) > 0 &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
+ !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
+ buflen = sip_sprintf_addr(ct, buffer,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ false);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle received");
+ return NF_DROP;
+ }
+ }
+
+ /* The rport= parameter (RFC 3581) contains the port number
+ * from which the server received the request. */
+ if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
+ "rport=", &poff, &plen,
+ &n) > 0 &&
+ htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
+ htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
+ __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
+ buflen = sprintf(buffer, "%u", ntohs(p));
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle rport");
+ return NF_DROP;
+ }
+ }
+ }
+
+next:
+ /* Translate Contact headers */
+ coff = 0;
+ in_header = 0;
+ while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+ SIP_HDR_CONTACT, &in_header,
+ &matchoff, &matchlen,
+ &addr, &port) > 0) {
+ if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen,
+ &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle contact");
+ return NF_DROP;
+ }
+ }
+
+ if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+ !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to");
+ return NF_DROP;
+ }
+
+ /* Mangle destination port for Cisco phones, then fix up checksums */
+ if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
+ struct udphdr *uh;
+
+ if (!skb_make_writable(skb, skb->len)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+
+ uh = (void *)skb->data + protoff;
+ uh->dest = ct_sip_info->forced_dport;
+
+ /* Zero-length replacement: no payload bytes change here */
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff,
+ 0, 0, NULL, 0)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Record a sequence-number adjustment of @off for the conntrack attached
+ * to @skb, keyed on the sequence number in the TCP header at @protoff.
+ * Does nothing for non-TCP conntracks or when @off is zero.
+ */
+static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
+ s16 off)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+ return;
+
+ th = (struct tcphdr *)(skb->data + protoff);
+ nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+
+/* Handles expected signalling connections and media streams.
+ *
+ * Installed as the expectfn of expectations created in this file. Sets up
+ * a destination mapping to the address/port saved in @exp, and a source
+ * mapping only when the new connection's original source address equals
+ * the master's source address in direction exp->dir.
+ */
+static void nf_nat_sip_expected(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = exp->saved_proto;
+ range.min_addr = range.max_addr = exp->saved_addr;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+
+ /* Change src to where master sends to, but only if the connection
+ * actually came from the same source. */
+ if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
+ &ct->master->tuplehash[exp->dir].tuple.src.u3)) {
+ /* Address-only mapping: PROTO_SPECIFIED is not set here */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ }
+}
+
+/* Register the expectation @exp for a signalling connection and rewrite
+ * the address:port text at @matchoff/@matchlen to match it.
+ *
+ * The original destination address and port of @exp are kept in
+ * exp->saved_addr / exp->saved_proto. Ports are tried in increasing order
+ * while nf_ct_expect_related() returns -EBUSY; any other error, or the
+ * 16-bit port wrapping to 0, ends the search with NF_DROP. If the packet
+ * rewrite then fails, the expectation is removed again.
+ *
+ * Returns NF_ACCEPT or NF_DROP.
+ */
+static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ struct nf_conntrack_expect *exp,
+ unsigned int matchoff,
+ unsigned int matchlen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ __be16 srcport;
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+ unsigned int buflen;
+
+ /* Connection will come from reply */
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3))
+ newaddr = exp->tuple.dst.u3;
+ else
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+
+ /* If the signalling port matches the connection's source port in the
+ * original direction, try to use the destination port in the opposite
+ * direction. */
+ srcport = ct_sip_info->forced_dport ? :
+ ct->tuplehash[dir].tuple.src.u.udp.port;
+ if (exp->tuple.dst.u.udp.port == srcport)
+ port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
+ else
+ port = ntohs(exp->tuple.dst.u.udp.port);
+
+ exp->saved_addr = exp->tuple.dst.u3;
+ exp->tuple.dst.u3 = newaddr;
+ exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+ exp->dir = !dir;
+ exp->expectfn = nf_nat_sip_expected;
+
+ /* port == 0 after this loop means "no usable port found" */
+ for (; port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.udp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use for SIP");
+ return NF_DROP;
+ }
+
+ /* Only touch the packet if the expected tuple actually changed */
+ if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) ||
+ exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
+ buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ goto err;
+ }
+ }
+ return NF_ACCEPT;
+
+err:
+ nf_ct_unexpect_related(exp);
+ return NF_DROP;
+}
+
+/* Recompute the SDP body length and write it into the Content-Length
+ * header.
+ *
+ * The length is taken as the distance from the SDP version header match
+ * to the end of the data plus strlen("v="); NOTE(review): this presumes
+ * matchoff points just past the "v=" prefix - confirm against
+ * ct_sip_get_sdp_header().
+ *
+ * Returns 0 if either header cannot be found, otherwise the result of
+ * mangle_packet() (1 on success, 0 on failure).
+ */
+static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchoff, matchlen;
+ char buffer[sizeof("65536")];
+ int buflen, c_len;
+
+ /* Get actual SDP length */
+ if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+ SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+ &matchoff, &matchlen) <= 0)
+ return 0;
+ c_len = *datalen - matchoff + strlen("v=");
+
+ /* Now, update SDP length */
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
+ &matchoff, &matchlen) <= 0)
+ return 0;
+
+ buflen = sprintf(buffer, "%u", c_len);
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen);
+}
+
+/* Find the SDP header @type (search bounded by @term) starting at @sdpoff
+ * and replace its matched value with @buffer.
+ *
+ * Unlike mangle_packet(), this returns errno-style values: 0 on success,
+ * -ENOENT if the header is not found, -EINVAL if the rewrite fails.
+ */
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ char *buffer, int buflen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchlen, matchoff;
+
+ if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
+ &matchoff, &matchlen) <= 0)
+ return -ENOENT;
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL;
+}
+
+/* Replace the address in SDP header @type with @addr, then refresh
+ * Content-Length.
+ *
+ * Returns 0 if the header is missing or cannot be rewritten; otherwise
+ * returns mangle_content_len()'s result.
+ */
+static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ const union nf_inet_addr *addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ /* mangle_sdp_packet() returns 0 on success, non-zero on error */
+ if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
+ sdpoff, type, term, buffer, buflen))
+ return 0;
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+/* Replace the text at @matchoff/@matchlen with the decimal form of @port,
+ * then refresh Content-Length.
+ *
+ * Returns 0 if the rewrite fails; otherwise returns
+ * mangle_content_len()'s result.
+ */
+static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ u_int16_t port)
+{
+ char buffer[sizeof("nnnnn")];
+ unsigned int buflen;
+
+ buflen = sprintf(buffer, "%u", port);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen))
+ return 0;
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+/* Rewrite the session-level owner and connection addresses to @addr, then
+ * refresh Content-Length.
+ *
+ * The owner header must be present and rewritable. A missing
+ * session-level connection header (-ENOENT) is tolerated; any other
+ * failure there aborts.
+ *
+ * Returns 0 on failure; otherwise returns mangle_content_len()'s result.
+ */
+static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ const union nf_inet_addr *addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ /* Mangle session description owner and contact addresses */
+ buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
+ SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen))
+ return 0;
+
+ switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ buffer, buflen)) {
+ case 0:
+ /*
+ * RFC 2327:
+ *
+ * Session description
+ *
+ * c=* (connection information - not required if included in all media)
+ */
+ /* case 0 deliberately shares the break below with -ENOENT */
+ case -ENOENT:
+ break;
+ default:
+ return 0;
+ }
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+/* So, this packet has hit the connection tracking matching code.
+ * Mangle it, and change the expectation to match the new version.
+ *
+ * Registers @rtp_exp on some port P and @rtcp_exp on P + 1, starting at
+ * the RTP expectation's current destination port and stepping by 2 while
+ * either registration returns -EBUSY. The chosen address is stored in
+ * *rtp_addr; the original addresses and ports are kept in the
+ * expectations' saved_addr / saved_proto. If the RTP port changed, the
+ * media port text at @mediaoff/@medialen is rewritten.
+ *
+ * Returns NF_ACCEPT, or NF_DROP if no port pair could be registered or
+ * the packet rewrite failed (both expectations are removed in the latter
+ * case).
+ */
+static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp,
+ unsigned int mediaoff,
+ unsigned int medialen,
+ union nf_inet_addr *rtp_addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ u_int16_t port;
+
+ /* Connection will come from reply */
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3))
+ *rtp_addr = rtp_exp->tuple.dst.u3;
+ else
+ *rtp_addr = ct->tuplehash[!dir].tuple.dst.u3;
+
+ rtp_exp->saved_addr = rtp_exp->tuple.dst.u3;
+ rtp_exp->tuple.dst.u3 = *rtp_addr;
+ rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+ rtp_exp->dir = !dir;
+ rtp_exp->expectfn = nf_nat_sip_expected;
+
+ rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3;
+ rtcp_exp->tuple.dst.u3 = *rtp_addr;
+ rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+ rtcp_exp->dir = !dir;
+ rtcp_exp->expectfn = nf_nat_sip_expected;
+
+ /* Try to get same pair of ports: if not, try to change them. */
+ for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+ port != 0; port += 2) {
+ int ret;
+
+ rtp_exp->tuple.dst.u.udp.port = htons(port);
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == -EBUSY)
+ continue;
+ else if (ret < 0) {
+ port = 0;
+ break;
+ }
+ rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret == -EBUSY) {
+ /* RTCP port taken: drop the RTP one and try next pair */
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
+ nf_ct_unexpect_related(rtp_exp);
+ port = 0;
+ break;
+ }
+ }
+
+ /* port == 0 here means no pair was registered (nothing to undo) */
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use for SDP media");
+ goto err1;
+ }
+
+ /* Update media port. */
+ if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
+ !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen,
+ mediaoff, medialen, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SDP message");
+ goto err2;
+ }
+
+ return NF_ACCEPT;
+
+err2:
+ nf_ct_unexpect_related(rtp_exp);
+ nf_ct_unexpect_related(rtcp_exp);
+err1:
+ return NF_DROP;
+}
+
+/* Named expectfn record ("sip") for nf_nat_sip_expected; registered in
+ * nf_nat_sip_init() and unregistered in nf_nat_sip_fini().
+ */
+static struct nf_ct_helper_expectfn sip_nat = {
+ .name = "sip",
+ .expectfn = nf_nat_sip_expected,
+};
+
+/* Module exit: clear the nf_nat_sip_hooks pointer, unregister the "sip"
+ * expectfn, then call synchronize_rcu() before returning.
+ */
+static void __exit nf_nat_sip_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_sip_hooks, NULL);
+
+ nf_ct_helper_expectfn_unregister(&sip_nat);
+ synchronize_rcu();
+}
+
+/* Table of the NAT callbacks implemented in this file; published through
+ * the nf_nat_sip_hooks pointer by nf_nat_sip_init().
+ */
+static const struct nf_nat_sip_hooks sip_hooks = {
+ .msg = nf_nat_sip,
+ .seq_adjust = nf_nat_sip_seq_adjust,
+ .expect = nf_nat_sip_expect,
+ .sdp_addr = nf_nat_sdp_addr,
+ .sdp_port = nf_nat_sdp_port,
+ .sdp_session = nf_nat_sdp_session,
+ .sdp_media = nf_nat_sdp_media,
+};
+
+/* Module init: publish sip_hooks through nf_nat_sip_hooks and register the
+ * "sip" expectfn. BUGs if the hooks pointer is already set. Always
+ * returns 0.
+ */
+static int __init nf_nat_sip_init(void)
+{
+ BUG_ON(nf_nat_sip_hooks != NULL);
+ RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks);
+ nf_ct_helper_expectfn_register(&sip_nat);
+ return 0;
+}
+
+module_init(nf_nat_sip_init);
+module_exit(nf_nat_sip_fini);
diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c
new file mode 100644
index 00000000000..7f67e1d5310
--- /dev/null
+++ b/net/netfilter/nf_nat_tftp.c
@@ -0,0 +1,52 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_tftp");
+
+/* TFTP NAT hook: prepare and register the expectation @exp.
+ *
+ * The saved port is set to the master connection's original-direction
+ * source port, the expectation direction to IP_CT_DIR_REPLY and the
+ * expectfn to nf_nat_follow_master. @ctinfo is not used.
+ *
+ * Returns NF_ACCEPT, or NF_DROP (after logging) if the expectation cannot
+ * be registered.
+ */
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_expect *exp)
+{
+ const struct nf_conn *ct = exp->master;
+
+ exp->saved_proto.udp.port
+ = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+ exp->expectfn = nf_nat_follow_master;
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, exp->master, "cannot add expectation");
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+/* Module exit: clear nf_nat_tftp_hook, then call synchronize_rcu() before
+ * returning.
+ */
+static void __exit nf_nat_tftp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
+ synchronize_rcu();
+}
+
+/* Module init: point nf_nat_tftp_hook at help(). BUGs if the hook is
+ * already set. Always returns 0.
+ */
+static int __init nf_nat_tftp_init(void)
+{
+ BUG_ON(nf_nat_tftp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_tftp_hook, help);
+ return 0;
+}
+
+module_init(nf_nat_tftp_init);
+module_exit(nf_nat_tftp_fini);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index d3a4f30a7f2..5d24b1fdb59 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -1,343 +1,228 @@
-#include <linux/config.h>
+/*
+ * Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
+ */
+
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
#include <net/protocol.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
#include "nf_internals.h"
-/*
- * A queue handler may be registered for each protocol. Each is protected by
- * long term mutex. The handler must provide an an outfn() to accept packets
- * for queueing and must reinject all packets it receives, no matter what.
+/*
+ * Hook for nfnetlink_queue to register its queue handler.
+ * We do this so that most of the NFQUEUE code can be modular.
+ *
+ * Once the queue is registered it must reinject all packets it
+ * receives, no matter what.
*/
-static struct nf_queue_handler *queue_handler[NPROTO];
-static struct nf_queue_rerouter *queue_rerouter;
-
-static DEFINE_RWLOCK(queue_handler_lock);
+static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
/* return EBUSY when somebody else is registered, return EEXIST if the
* same handler is registered, return 0 in case of success. */
-int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
-{
- int ret;
-
- if (pf >= NPROTO)
- return -EINVAL;
-
- write_lock_bh(&queue_handler_lock);
- if (queue_handler[pf] == qh)
- ret = -EEXIST;
- else if (queue_handler[pf])
- ret = -EBUSY;
- else {
- queue_handler[pf] = qh;
- ret = 0;
- }
- write_unlock_bh(&queue_handler_lock);
-
- return ret;
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
+{
+ /* should never happen, we only have one queueing backend in kernel */
+ WARN_ON(rcu_access_pointer(queue_handler));
+ rcu_assign_pointer(queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-int nf_unregister_queue_handler(int pf)
+void nf_unregister_queue_handler(void)
{
- if (pf >= NPROTO)
- return -EINVAL;
-
- write_lock_bh(&queue_handler_lock);
- queue_handler[pf] = NULL;
- write_unlock_bh(&queue_handler_lock);
-
- return 0;
+ RCU_INIT_POINTER(queue_handler, NULL);
+ synchronize_rcu();
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer)
+void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
- if (pf >= NPROTO)
- return -EINVAL;
-
- write_lock_bh(&queue_handler_lock);
- memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf]));
- write_unlock_bh(&queue_handler_lock);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_register_queue_rerouter);
-
-int nf_unregister_queue_rerouter(int pf)
-{
- if (pf >= NPROTO)
- return -EINVAL;
+ /* Release those devices we held, or Alexey will kill me. */
+ if (entry->indev)
+ dev_put(entry->indev);
+ if (entry->outdev)
+ dev_put(entry->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (entry->skb->nf_bridge) {
+ struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge;
- write_lock_bh(&queue_handler_lock);
- memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf]));
- write_unlock_bh(&queue_handler_lock);
- return 0;
+ if (nf_bridge->physindev)
+ dev_put(nf_bridge->physindev);
+ if (nf_bridge->physoutdev)
+ dev_put(nf_bridge->physoutdev);
+ }
+#endif
+ /* Drop reference to owner of hook which queued us. */
+ module_put(entry->elem->owner);
}
-EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter);
+EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
-void nf_unregister_queue_handlers(struct nf_queue_handler *qh)
+/* Bump dev refs so they don't vanish while packet is out */
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
- int pf;
+ if (!try_module_get(entry->elem->owner))
+ return false;
- write_lock_bh(&queue_handler_lock);
- for (pf = 0; pf < NPROTO; pf++) {
- if (queue_handler[pf] == qh)
- queue_handler[pf] = NULL;
+ if (entry->indev)
+ dev_hold(entry->indev);
+ if (entry->outdev)
+ dev_hold(entry->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (entry->skb->nf_bridge) {
+ struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge;
+ struct net_device *physdev;
+
+ physdev = nf_bridge->physindev;
+ if (physdev)
+ dev_hold(physdev);
+ physdev = nf_bridge->physoutdev;
+ if (physdev)
+ dev_hold(physdev);
}
- write_unlock_bh(&queue_handler_lock);
+#endif
+
+ return true;
}
-EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
+EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
-/*
- * Any packet that leaves via this function must come back
+/*
+ * Any packet that leaves via this function must come back
* through nf_reinject().
*/
-int nf_queue(struct sk_buff **skb,
- struct list_head *elem,
- int pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- unsigned int queuenum)
+int nf_queue(struct sk_buff *skb,
+ struct nf_hook_ops *elem,
+ u_int8_t pf, unsigned int hook,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct sk_buff *),
+ unsigned int queuenum)
{
- int status;
- struct nf_info *info;
-#ifdef CONFIG_BRIDGE_NETFILTER
- struct net_device *physindev = NULL;
- struct net_device *physoutdev = NULL;
-#endif
+ int status = -ENOENT;
+ struct nf_queue_entry *entry = NULL;
+ const struct nf_afinfo *afinfo;
+ const struct nf_queue_handler *qh;
- /* QUEUE == DROP if noone is waiting, to be safe. */
- read_lock(&queue_handler_lock);
- if (!queue_handler[pf] || !queue_handler[pf]->outfn) {
- read_unlock(&queue_handler_lock);
- kfree_skb(*skb);
- return 1;
- }
+ /* QUEUE == DROP if no one is waiting, to be safe. */
+ rcu_read_lock();
- info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC);
- if (!info) {
- if (net_ratelimit())
- printk(KERN_ERR "OOM queueing packet %p\n",
- *skb);
- read_unlock(&queue_handler_lock);
- kfree_skb(*skb);
- return 1;
+ qh = rcu_dereference(queue_handler);
+ if (!qh) {
+ status = -ESRCH;
+ goto err_unlock;
}
- *info = (struct nf_info) {
- (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
+ afinfo = nf_get_afinfo(pf);
+ if (!afinfo)
+ goto err_unlock;
- /* If it's going away, ignore hook. */
- if (!try_module_get(info->elem->owner)) {
- read_unlock(&queue_handler_lock);
- kfree(info);
- return 0;
+ entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
+ if (!entry) {
+ status = -ENOMEM;
+ goto err_unlock;
}
- /* Bump dev refs so they don't vanish while packet is out */
- if (indev) dev_hold(indev);
- if (outdev) dev_hold(outdev);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
- if ((*skb)->nf_bridge) {
- physindev = (*skb)->nf_bridge->physindev;
- if (physindev) dev_hold(physindev);
- physoutdev = (*skb)->nf_bridge->physoutdev;
- if (physoutdev) dev_hold(physoutdev);
+ *entry = (struct nf_queue_entry) {
+ .skb = skb,
+ .elem = elem,
+ .pf = pf,
+ .hook = hook,
+ .indev = indev,
+ .outdev = outdev,
+ .okfn = okfn,
+ .size = sizeof(*entry) + afinfo->route_key_size,
+ };
+
+ if (!nf_queue_entry_get_refs(entry)) {
+ status = -ECANCELED;
+ goto err_unlock;
}
-#endif
- if (queue_rerouter[pf].save)
- queue_rerouter[pf].save(*skb, info);
+ skb_dst_force(skb);
+ afinfo->saveroute(skb, entry);
+ status = qh->outfn(entry, queuenum);
- status = queue_handler[pf]->outfn(*skb, info, queuenum,
- queue_handler[pf]->data);
-
- if (status >= 0 && queue_rerouter[pf].reroute)
- status = queue_rerouter[pf].reroute(skb, info);
-
- read_unlock(&queue_handler_lock);
+ rcu_read_unlock();
if (status < 0) {
- /* James M doesn't say fuck enough. */
- if (indev) dev_put(indev);
- if (outdev) dev_put(outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (physindev) dev_put(physindev);
- if (physoutdev) dev_put(physoutdev);
-#endif
- module_put(info->elem->owner);
- kfree(info);
- kfree_skb(*skb);
-
- return 1;
+ nf_queue_entry_release_refs(entry);
+ goto err;
}
- return 1;
+ return 0;
+
+err_unlock:
+ rcu_read_unlock();
+err:
+ kfree(entry);
+ return status;
}
-void nf_reinject(struct sk_buff *skb, struct nf_info *info,
- unsigned int verdict)
+void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
- struct list_head *elem = &info->elem->list;
- struct list_head *i;
+ struct sk_buff *skb = entry->skb;
+ struct nf_hook_ops *elem = entry->elem;
+ const struct nf_afinfo *afinfo;
+ int err;
rcu_read_lock();
- /* Release those devices we held, or Alexey will kill me. */
- if (info->indev) dev_put(info->indev);
- if (info->outdev) dev_put(info->outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- if (skb->nf_bridge->physindev)
- dev_put(skb->nf_bridge->physindev);
- if (skb->nf_bridge->physoutdev)
- dev_put(skb->nf_bridge->physoutdev);
- }
-#endif
-
- /* Drop reference to owner of hook which queued us. */
- module_put(info->elem->owner);
-
- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
- if (i == elem)
- break;
- }
-
- if (elem == &nf_hooks[info->pf][info->hook]) {
- /* The module which sent it to userspace is gone. */
- NFDEBUG("%s: module disappeared, dropping packet.\n",
- __FUNCTION__);
- verdict = NF_DROP;
- }
+ nf_queue_entry_release_refs(entry);
/* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT) {
- elem = elem->prev;
+ elem = list_entry(elem->list.prev, struct nf_hook_ops, list);
verdict = NF_ACCEPT;
}
if (verdict == NF_ACCEPT) {
+ afinfo = nf_get_afinfo(entry->pf);
+ if (!afinfo || afinfo->reroute(skb, entry) < 0)
+ verdict = NF_DROP;
+ }
+
+ if (verdict == NF_ACCEPT) {
next_hook:
- verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
- &skb, info->hook,
- info->indev, info->outdev, &elem,
- info->okfn, INT_MIN);
+ verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook],
+ skb, entry->hook,
+ entry->indev, entry->outdev, &elem,
+ entry->okfn, INT_MIN);
}
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
- info->okfn(skb);
+ case NF_STOP:
+ local_bh_disable();
+ entry->okfn(skb);
+ local_bh_enable();
break;
-
case NF_QUEUE:
- if (!nf_queue(&skb, elem, info->pf, info->hook,
- info->indev, info->outdev, info->okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ err = nf_queue(skb, elem, entry->pf, entry->hook,
+ entry->indev, entry->outdev, entry->okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
+ break;
+ case NF_STOLEN:
break;
+ default:
+ kfree_skb(skb);
}
rcu_read_unlock();
-
- if (verdict == NF_DROP)
- kfree_skb(skb);
-
- kfree(info);
- return;
+ kfree(entry);
}
EXPORT_SYMBOL(nf_reinject);
-
-#ifdef CONFIG_PROC_FS
-static void *seq_start(struct seq_file *seq, loff_t *pos)
-{
- if (*pos >= NPROTO)
- return NULL;
-
- return pos;
-}
-
-static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
-
- if (*pos >= NPROTO)
- return NULL;
-
- return pos;
-}
-
-static void seq_stop(struct seq_file *s, void *v)
-{
-
-}
-
-static int seq_show(struct seq_file *s, void *v)
-{
- int ret;
- loff_t *pos = v;
- struct nf_queue_handler *qh;
-
- read_lock_bh(&queue_handler_lock);
- qh = queue_handler[*pos];
- if (!qh)
- ret = seq_printf(s, "%2lld NONE\n", *pos);
- else
- ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
- read_unlock_bh(&queue_handler_lock);
-
- return ret;
-}
-
-static struct seq_operations nfqueue_seq_ops = {
- .start = seq_start,
- .next = seq_next,
- .stop = seq_stop,
- .show = seq_show,
-};
-
-static int nfqueue_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &nfqueue_seq_ops);
-}
-
-static struct file_operations nfqueue_file_ops = {
- .owner = THIS_MODULE,
- .open = nfqueue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-#endif /* PROC_FS */
-
-
-int __init netfilter_queue_init(void)
-{
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *pde;
-#endif
- queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter),
- GFP_KERNEL);
- if (!queue_rerouter)
- return -ENOMEM;
-
-#ifdef CONFIG_PROC_FS
- pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter);
- if (!pde) {
- kfree(queue_rerouter);
- return -1;
- }
- pde->proc_fops = &nfqueue_file_ops;
-#endif
- memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter));
-
- return 0;
-}
-
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 61a833a9caa..f042ae52155 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -1,9 +1,9 @@
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
+#include <linux/mutex.h>
#include <net/sock.h>
#include "nf_internals.h"
@@ -11,7 +11,7 @@
/* Sockopts only registered and called from user context, so
net locking would be overkill. Also, [gs]etsockopt calls may
sleep. */
-static DECLARE_MUTEX(nf_sockopt_mutex);
+static DEFINE_MUTEX(nf_sockopt_mutex);
static LIST_HEAD(nf_sockopts);
/* Do exclusive ranges overlap? */
@@ -23,22 +23,21 @@ static inline int overlap(int min1, int max1, int min2, int max2)
/* Functions to register sockopt ranges (exclusive). */
int nf_register_sockopt(struct nf_sockopt_ops *reg)
{
- struct list_head *i;
+ struct nf_sockopt_ops *ops;
int ret = 0;
- if (down_interruptible(&nf_sockopt_mutex) != 0)
+ if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
return -EINTR;
- list_for_each(i, &nf_sockopts) {
- struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
+ list_for_each_entry(ops, &nf_sockopts, list) {
if (ops->pf == reg->pf
- && (overlap(ops->set_optmin, ops->set_optmax,
+ && (overlap(ops->set_optmin, ops->set_optmax,
reg->set_optmin, reg->set_optmax)
- || overlap(ops->get_optmin, ops->get_optmax,
+ || overlap(ops->get_optmin, ops->get_optmax,
reg->get_optmin, reg->get_optmax))) {
NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
- ops->set_optmin, ops->set_optmax,
- ops->get_optmin, ops->get_optmax,
+ ops->set_optmin, ops->set_optmax,
+ ops->get_optmin, ops->get_optmax,
reg->set_optmin, reg->set_optmax,
reg->get_optmin, reg->get_optmax);
ret = -EBUSY;
@@ -48,85 +47,123 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg)
list_add(&reg->list, &nf_sockopts);
out:
- up(&nf_sockopt_mutex);
+ mutex_unlock(&nf_sockopt_mutex);
return ret;
}
EXPORT_SYMBOL(nf_register_sockopt);
void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
{
- /* No point being interruptible: we're probably in cleanup_module() */
- restart:
- down(&nf_sockopt_mutex);
- if (reg->use != 0) {
- /* To be woken by nf_sockopt call... */
- /* FIXME: Stuart Young's name appears gratuitously. */
- set_current_state(TASK_UNINTERRUPTIBLE);
- reg->cleanup_task = current;
- up(&nf_sockopt_mutex);
- schedule();
- goto restart;
- }
+ mutex_lock(&nf_sockopt_mutex);
list_del(&reg->list);
- up(&nf_sockopt_mutex);
+ mutex_unlock(&nf_sockopt_mutex);
}
EXPORT_SYMBOL(nf_unregister_sockopt);
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, int pf, int val,
- char __user *opt, int *len, int get)
+static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
+ int val, int get)
{
- struct list_head *i;
struct nf_sockopt_ops *ops;
- int ret;
- if (down_interruptible(&nf_sockopt_mutex) != 0)
- return -EINTR;
+ if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
+ return ERR_PTR(-EINTR);
- list_for_each(i, &nf_sockopts) {
- ops = (struct nf_sockopt_ops *)i;
+ list_for_each_entry(ops, &nf_sockopts, list) {
if (ops->pf == pf) {
+ if (!try_module_get(ops->owner))
+ goto out_nosup;
+
if (get) {
- if (val >= ops->get_optmin
- && val < ops->get_optmax) {
- ops->use++;
- up(&nf_sockopt_mutex);
- ret = ops->get(sk, val, opt, len);
+ if (val >= ops->get_optmin &&
+ val < ops->get_optmax)
goto out;
- }
} else {
- if (val >= ops->set_optmin
- && val < ops->set_optmax) {
- ops->use++;
- up(&nf_sockopt_mutex);
- ret = ops->set(sk, val, opt, *len);
+ if (val >= ops->set_optmin &&
+ val < ops->set_optmax)
goto out;
- }
}
+ module_put(ops->owner);
}
}
- up(&nf_sockopt_mutex);
- return -ENOPROTOOPT;
-
- out:
- down(&nf_sockopt_mutex);
- ops->use--;
- if (ops->cleanup_task)
- wake_up_process(ops->cleanup_task);
- up(&nf_sockopt_mutex);
+out_nosup:
+ ops = ERR_PTR(-ENOPROTOOPT);
+out:
+ mutex_unlock(&nf_sockopt_mutex);
+ return ops;
+}
+
+/* Call get/setsockopt() */
+static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
+ char __user *opt, int *len, int get)
+{
+ struct nf_sockopt_ops *ops;
+ int ret;
+
+ ops = nf_sockopt_find(sk, pf, val, get);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ if (get)
+ ret = ops->get(sk, val, opt, len);
+ else
+ ret = ops->set(sk, val, opt, *len);
+
+ module_put(ops->owner);
return ret;
}
-int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
- int len)
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+ unsigned int len)
{
return nf_sockopt(sk, pf, val, opt, &len, 0);
}
EXPORT_SYMBOL(nf_setsockopt);
-int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
+int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+ int *len)
{
return nf_sockopt(sk, pf, val, opt, len, 1);
}
EXPORT_SYMBOL(nf_getsockopt);
+#ifdef CONFIG_COMPAT
+static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
+ char __user *opt, int *len, int get)
+{
+ struct nf_sockopt_ops *ops;
+ int ret;
+
+ ops = nf_sockopt_find(sk, pf, val, get);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ if (get) {
+ if (ops->compat_get)
+ ret = ops->compat_get(sk, val, opt, len);
+ else
+ ret = ops->get(sk, val, opt, len);
+ } else {
+ if (ops->compat_set)
+ ret = ops->compat_set(sk, val, opt, *len);
+ else
+ ret = ops->set(sk, val, opt, *len);
+ }
+
+ module_put(ops->owner);
+ return ret;
+}
+
+int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
+ int val, char __user *opt, unsigned int len)
+{
+ return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(compat_nf_setsockopt);
+
+int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
+ int val, char __user *opt, int *len)
+{
+ return compat_nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(compat_nf_getsockopt);
+#endif
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
new file mode 100644
index 00000000000..52e20c9a46a
--- /dev/null
+++ b/net/netfilter/nf_synproxy_core.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+int synproxy_net_id;
+EXPORT_SYMBOL_GPL(synproxy_net_id);
+
+bool
+synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+ const struct tcphdr *th, struct synproxy_options *opts)
+{
+ int length = (th->doff * 4) - sizeof(*th);
+ u8 buf[40], *ptr;
+
+ ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
+ if (ptr == NULL)
+ return false;
+
+ opts->options = 0;
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return true;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2)
+ return true;
+ if (opsize > length)
+ return true;
+
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS) {
+ opts->mss = get_unaligned_be16(ptr);
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW) {
+ opts->wscale = *ptr;
+ if (opts->wscale > 14)
+ opts->wscale = 14;
+ opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opsize == TCPOLEN_TIMESTAMP) {
+ opts->tsval = get_unaligned_be32(ptr);
+ opts->tsecr = get_unaligned_be32(ptr + 4);
+ opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM)
+ opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+ break;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_parse_options);
+
+unsigned int synproxy_options_size(const struct synproxy_options *opts)
+{
+ unsigned int size = 0;
+
+ if (opts->options & XT_SYNPROXY_OPT_MSS)
+ size += TCPOLEN_MSS_ALIGNED;
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ size += TCPOLEN_SACKPERM_ALIGNED;
+ if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+ size += TCPOLEN_WSCALE_ALIGNED;
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(synproxy_options_size);
+
+void
+synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
+{
+ __be32 *ptr = (__be32 *)(th + 1);
+ u8 options = opts->options;
+
+ if (options & XT_SYNPROXY_OPT_MSS)
+ *ptr++ = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ opts->mss);
+
+ if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
+ if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+ (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ else
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+
+ *ptr++ = htonl(opts->tsval);
+ *ptr++ = htonl(opts->tsecr);
+ } else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) |
+ TCPOLEN_SACK_PERM);
+
+ if (options & XT_SYNPROXY_OPT_WSCALE)
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_WINDOW << 16) |
+ (TCPOLEN_WINDOW << 8) |
+ opts->wscale);
+}
+EXPORT_SYMBOL_GPL(synproxy_build_options);
+
+void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+ struct synproxy_options *opts)
+{
+ opts->tsecr = opts->tsval;
+ opts->tsval = tcp_time_stamp & ~0x3f;
+
+ if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+ opts->tsval |= opts->wscale;
+ opts->wscale = info->wscale;
+ } else
+ opts->tsval |= 0xf;
+
+ if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ opts->tsval |= 1 << 4;
+
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ opts->tsval |= 1 << 5;
+}
+EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
+
+void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+{
+ opts->wscale = opts->tsecr & 0xf;
+ if (opts->wscale != 0xf)
+ opts->options |= XT_SYNPROXY_OPT_WSCALE;
+
+ opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+
+ opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+}
+EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
+
+unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+ unsigned int protoff,
+ struct tcphdr *th,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_conn_synproxy *synproxy)
+{
+ unsigned int optoff, optend;
+ u32 *ptr, old;
+
+ if (synproxy->tsoff == 0)
+ return 1;
+
+ optoff = protoff + sizeof(struct tcphdr);
+ optend = protoff + th->doff * 4;
+
+ if (!skb_make_writable(skb, optend))
+ return 0;
+
+ while (optoff < optend) {
+ unsigned char *op = skb->data + optoff;
+
+ switch (op[0]) {
+ case TCPOPT_EOL:
+ return 1;
+ case TCPOPT_NOP:
+ optoff++;
+ continue;
+ default:
+ if (optoff + 1 == optend ||
+ optoff + op[1] > optend ||
+ op[1] < 2)
+ return 0;
+ if (op[0] == TCPOPT_TIMESTAMP &&
+ op[1] == TCPOLEN_TIMESTAMP) {
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ ptr = (u32 *)&op[2];
+ old = *ptr;
+ *ptr = htonl(ntohl(*ptr) -
+ synproxy->tsoff);
+ } else {
+ ptr = (u32 *)&op[6];
+ old = *ptr;
+ *ptr = htonl(ntohl(*ptr) +
+ synproxy->tsoff);
+ }
+ inet_proto_csum_replace4(&th->check, skb,
+ old, *ptr, 0);
+ return 1;
+ }
+ optoff += op[1];
+ }
+ }
+ return 1;
+}
+EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
+
+static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_synproxy),
+ .align = __alignof__(struct nf_conn_synproxy),
+ .id = NF_CT_EXT_SYNPROXY,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(snet->stats, cpu);
+ }
+
+ return NULL;
+}
+
+static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(snet->stats, cpu);
+ }
+
+ return NULL;
+}
+
+static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct synproxy_stats *stats = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries\t\tsyn_received\t"
+ "cookie_invalid\tcookie_valid\t"
+ "cookie_retrans\tconn_reopened\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
+ stats->syn_received,
+ stats->cookie_invalid,
+ stats->cookie_valid,
+ stats->cookie_retrans,
+ stats->conn_reopened);
+
+ return 0;
+}
+
+static const struct seq_operations synproxy_cpu_seq_ops = {
+ .start = synproxy_cpu_seq_start,
+ .next = synproxy_cpu_seq_next,
+ .stop = synproxy_cpu_seq_stop,
+ .show = synproxy_cpu_seq_show,
+};
+
+static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations synproxy_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = synproxy_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init synproxy_proc_init(struct net *net)
+{
+ if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+ &synproxy_cpu_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+ remove_proc_entry("synproxy", net->proc_net_stat);
+}
+#else
+static int __net_init synproxy_proc_init(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+ return;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init synproxy_net_init(struct net *net)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct nf_conntrack_tuple t;
+ struct nf_conn *ct;
+ int err = -ENOMEM;
+
+ memset(&t, 0, sizeof(t));
+ ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+ if (IS_ERR(ct)) {
+ err = PTR_ERR(ct);
+ goto err1;
+ }
+
+ if (!nfct_seqadj_ext_add(ct))
+ goto err2;
+ if (!nfct_synproxy_ext_add(ct))
+ goto err2;
+
+ nf_conntrack_tmpl_insert(net, ct);
+ snet->tmpl = ct;
+
+ snet->stats = alloc_percpu(struct synproxy_stats);
+ if (snet->stats == NULL)
+ goto err2;
+
+ err = synproxy_proc_init(net);
+ if (err < 0)
+ goto err3;
+
+ return 0;
+
+err3:
+ free_percpu(snet->stats);
+err2:
+ nf_conntrack_free(ct);
+err1:
+ return err;
+}
+
+static void __net_exit synproxy_net_exit(struct net *net)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+
+ nf_ct_put(snet->tmpl);
+ synproxy_proc_exit(net);
+ free_percpu(snet->stats);
+}
+
+static struct pernet_operations synproxy_net_ops = {
+ .init = synproxy_net_init,
+ .exit = synproxy_net_exit,
+ .id = &synproxy_net_id,
+ .size = sizeof(struct synproxy_net),
+};
+
+static int __init synproxy_core_init(void)
+{
+ int err;
+
+ err = nf_ct_extend_register(&nf_ct_synproxy_extend);
+ if (err < 0)
+ goto err1;
+
+ err = register_pernet_subsys(&synproxy_net_ops);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+err1:
+ return err;
+}
+
+static void __exit synproxy_core_exit(void)
+{
+ unregister_pernet_subsys(&synproxy_net_ops);
+ nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+}
+
+module_init(synproxy_core_init);
+module_exit(synproxy_core_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
new file mode 100644
index 00000000000..8746ff9a835
--- /dev/null
+++ b/net/netfilter/nf_tables_api.c
@@ -0,0 +1,4041 @@
+/*
+ * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+static LIST_HEAD(nf_tables_expressions);
+
+/**
+ * nft_register_afinfo - register nf_tables address family info
+ *
+ * @afi: address family info to register
+ *
+ * Register the address family for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
+{
+ INIT_LIST_HEAD(&afi->tables);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_tail_rcu(&afi->list, &net->nft.af_info);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_afinfo);
+
+/**
+ * nft_unregister_afinfo - unregister nf_tables address family info
+ *
+ * @afi: address family info to unregister
+ *
+ * Unregister the address family for use with nf_tables.
+ */
+void nft_unregister_afinfo(struct nft_af_info *afi)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&afi->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
+
+static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
+{
+ struct nft_af_info *afi;
+
+ list_for_each_entry(afi, &net->nft.af_info, list) {
+ if (afi->family == family)
+ return afi;
+ }
+ return NULL;
+}
+
+static struct nft_af_info *
+nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
+{
+ struct nft_af_info *afi;
+
+ afi = nft_afinfo_lookup(net, family);
+ if (afi != NULL)
+ return afi;
+#ifdef CONFIG_MODULES
+ if (autoload) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-afinfo-%u", family);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ afi = nft_afinfo_lookup(net, family);
+ if (afi != NULL)
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
+static void nft_ctx_init(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nft_af_info *afi,
+ struct nft_table *table,
+ struct nft_chain *chain,
+ const struct nlattr * const *nla)
+{
+ ctx->net = sock_net(skb->sk);
+ ctx->afi = afi;
+ ctx->table = table;
+ ctx->chain = chain;
+ ctx->nla = nla;
+ ctx->portid = NETLINK_CB(skb).portid;
+ ctx->report = nlmsg_report(nlh);
+ ctx->seq = nlh->nlmsg_seq;
+}
+
+static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
+ u32 size)
+{
+ struct nft_trans *trans;
+
+ trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL);
+ if (trans == NULL)
+ return NULL;
+
+ trans->msg_type = msg_type;
+ trans->ctx = *ctx;
+
+ return trans;
+}
+
+static void nft_trans_destroy(struct nft_trans *trans)
+{
+ list_del(&trans->list);
+ kfree(trans);
+}
+
+/*
+ * Tables
+ */
+
+static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla)
+{
+ struct nft_table *table;
+
+ list_for_each_entry(table, &afi->tables, list) {
+ if (!nla_strcmp(nla, table->name))
+ return table;
+ }
+ return NULL;
+}
+
+static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla)
+{
+ struct nft_table *table;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ table = nft_table_lookup(afi, nla);
+ if (table != NULL)
+ return table;
+
+ return ERR_PTR(-ENOENT);
+}
+
+static inline u64 nf_tables_alloc_handle(struct nft_table *table)
+{
+ return ++table->hgenerator;
+}
+
+static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+
+static const struct nf_chain_type *
+__nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+{
+ int i;
+
+ for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
+ if (chain_type[family][i] != NULL &&
+ !nla_strcmp(nla, chain_type[family][i]->name))
+ return chain_type[family][i];
+ }
+ return NULL;
+}
+
+static const struct nf_chain_type *
+nf_tables_chain_type_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla,
+ bool autoload)
+{
+ const struct nf_chain_type *type;
+
+ type = __nf_tables_chain_type_lookup(afi->family, nla);
+ if (type != NULL)
+ return type;
+#ifdef CONFIG_MODULES
+ if (autoload) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-chain-%u-%.*s", afi->family,
+ nla_len(nla), (const char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ type = __nf_tables_chain_type_lookup(afi->family, nla);
+ if (type != NULL)
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
+ [NFTA_TABLE_NAME] = { .type = NLA_STRING },
+ [NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, u32 flags, int family,
+ const struct nft_table *table)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
+ nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
+ nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0,
+ ctx->afi->family, ctx->table);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_dump_tables(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_table_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWTABLE,
+ NLM_F_MULTI,
+ afi->family, table) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+/* Internal table flags */
+#define NFT_TABLE_INACTIVE (1 << 15)
+
+static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_tables,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
+ family, table);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static int nf_tables_table_enable(const struct nft_af_info *afi,
+ struct nft_table *table)
+{
+ struct nft_chain *chain;
+ int err, i = 0;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ if (err < 0)
+ goto err;
+
+ i++;
+ }
+ return 0;
+err:
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ if (i-- <= 0)
+ break;
+
+ nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops);
+ }
+ return err;
+}
+
+static void nf_tables_table_disable(const struct nft_af_info *afi,
+ struct nft_table *table)
+{
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (chain->flags & NFT_BASE_CHAIN)
+ nf_unregister_hooks(nft_base_chain(chain)->ops,
+ afi->nops);
+ }
+}
+
+static int nf_tables_updtable(struct nft_ctx *ctx)
+{
+ struct nft_trans *trans;
+ u32 flags;
+ int ret = 0;
+
+ if (!ctx->nla[NFTA_TABLE_FLAGS])
+ return 0;
+
+ flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
+ if (flags & ~NFT_TABLE_F_DORMANT)
+ return -EINVAL;
+
+ if (flags == ctx->table->flags)
+ return 0;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
+ sizeof(struct nft_trans_table));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if ((flags & NFT_TABLE_F_DORMANT) &&
+ !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_trans_table_enable(trans) = false;
+ } else if (!(flags & NFT_TABLE_F_DORMANT) &&
+ ctx->table->flags & NFT_TABLE_F_DORMANT) {
+ ret = nf_tables_table_enable(ctx->afi, ctx->table);
+ if (ret >= 0) {
+ ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+ nft_trans_table_enable(trans) = true;
+ }
+ }
+ if (ret < 0)
+ goto err;
+
+ nft_trans_table_update(trans) = true;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+err:
+ nft_trans_destroy(trans);
+ return ret;
+}
+
+static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWTABLE)
+ ctx->table->flags |= NFT_TABLE_INACTIVE;
+
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+}
+
+static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nlattr *name;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ u32 flags = 0;
+ struct nft_ctx ctx;
+ int err;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ name = nla[NFTA_TABLE_NAME];
+ table = nf_tables_table_lookup(afi, name);
+ if (IS_ERR(table)) {
+ if (PTR_ERR(table) != -ENOENT)
+ return PTR_ERR(table);
+ table = NULL;
+ }
+
+ if (table != NULL) {
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ return nf_tables_updtable(&ctx);
+ }
+
+ if (nla[NFTA_TABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
+ if (flags & ~NFT_TABLE_F_DORMANT)
+ return -EINVAL;
+ }
+
+ if (!try_module_get(afi->owner))
+ return -EAFNOSUPPORT;
+
+ table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL);
+ if (table == NULL) {
+ module_put(afi->owner);
+ return -ENOMEM;
+ }
+
+ nla_strlcpy(table->name, name, nla_len(name));
+ INIT_LIST_HEAD(&table->chains);
+ INIT_LIST_HEAD(&table->sets);
+ table->flags = flags;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
+ if (err < 0) {
+ kfree(table);
+ module_put(afi->owner);
+ return err;
+ }
+ list_add_tail_rcu(&table->list, &afi->tables);
+ return 0;
+}
+
+static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family, err;
+ struct nft_ctx ctx;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ if (table->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE);
+ if (err < 0)
+ return err;
+
+ list_del_rcu(&table->list);
+ return 0;
+}
+
+static void nf_tables_table_destroy(struct nft_ctx *ctx)
+{
+ BUG_ON(ctx->table->use > 0);
+
+ kfree(ctx->table);
+ module_put(ctx->afi->owner);
+}
+
+int nft_register_chain_type(const struct nf_chain_type *ctype)
+{
+ int err = 0;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (chain_type[ctype->family][ctype->type] != NULL) {
+ err = -EBUSY;
+ goto out;
+ }
+ chain_type[ctype->family][ctype->type] = ctype;
+out:
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_register_chain_type);
+
+void nft_unregister_chain_type(const struct nf_chain_type *ctype)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ chain_type[ctype->family][ctype->type] = NULL;
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
+
+/*
+ * Chains
+ */
+
+static struct nft_chain *
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+{
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (chain->handle == handle)
+ return chain;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
+ const struct nlattr *nla)
+{
+ struct nft_chain *chain;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nla_strcmp(nla, chain->name))
+ return chain;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
+ [NFTA_CHAIN_TABLE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
+ [NFTA_CHAIN_NAME] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED },
+ [NFTA_CHAIN_POLICY] = { .type = NLA_U32 },
+ [NFTA_CHAIN_TYPE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
+ [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 },
+ [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 },
+};
+
+/*
+ * Sum the per-cpu packet/byte counters in @stats over all possible CPUs and
+ * emit them into @skb as a nested NFTA_CHAIN_COUNTERS attribute.
+ *
+ * Returns 0 on success or -ENOSPC when an attribute could not be added.
+ */
+static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
+{
+	struct nft_stats sum;
+	struct nlattr *counters;
+	int cpu;
+
+	memset(&sum, 0, sizeof(sum));
+	for_each_possible_cpu(cpu) {
+		struct nft_stats *pcpu = per_cpu_ptr(stats, cpu);
+		unsigned int start;
+		u64 npkts, nbytes;
+
+		/* retry until both values were read without a concurrent update */
+		do {
+			start = u64_stats_fetch_begin_irq(&pcpu->syncp);
+			npkts = pcpu->pkts;
+			nbytes = pcpu->bytes;
+		} while (u64_stats_fetch_retry_irq(&pcpu->syncp, start));
+
+		sum.pkts += npkts;
+		sum.bytes += nbytes;
+	}
+
+	counters = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+	if (!counters)
+		return -ENOSPC;
+
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(sum.pkts)) ||
+	    nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(sum.bytes)))
+		return -ENOSPC;
+
+	nla_nest_end(skb, counters);
+	return 0;
+}
+
+/*
+ * Build one chain netlink message in @skb.
+ *
+ * Emits the table name, chain handle and chain name; for base chains also
+ * the hook (number, priority), policy, chain type name and counters; and
+ * finally the use count.
+ *
+ * Returns the result of nlmsg_end() on success. On failure the message is
+ * trimmed off @skb again and -1 is returned.
+ */
+static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
+				     int event, u32 flags, int family,
+				     const struct nft_table *table,
+				     const struct nft_chain *chain)
+{
+	struct nfgenmsg *hdr;
+	struct nlmsghdr *nlh;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (!nlh)
+		goto nla_put_failure;
+
+	hdr = nlmsg_data(nlh);
+	hdr->nfgen_family = family;
+	hdr->version = NFNETLINK_V0;
+	hdr->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) ||
+	    nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)) ||
+	    nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
+		goto nla_put_failure;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *base = nft_base_chain(chain);
+		const struct nf_hook_ops *hook = &base->ops[0];
+		struct nlattr *hook_nest;
+
+		hook_nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+		if (!hook_nest)
+			goto nla_put_failure;
+		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(hook->hooknum)) ||
+		    nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(hook->priority)))
+			goto nla_put_failure;
+		nla_nest_end(skb, hook_nest);
+
+		if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(base->policy)))
+			goto nla_put_failure;
+
+		if (nla_put_string(skb, NFTA_CHAIN_TYPE, base->type->name))
+			goto nla_put_failure;
+
+		if (nft_dump_stats(skb, base->stats))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+/*
+ * Send a chain @event notification for ctx->chain to the NFNLGRP_NFTABLES
+ * group.
+ *
+ * Nothing is sent (and 0 is returned) when the request did not ask for a
+ * report and the group has no listeners. On any failure the error is also
+ * recorded on the group through nfnetlink_set_err().
+ */
+static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+{
+	struct sk_buff *msg;
+	int err;
+
+	if (!ctx->report &&
+	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+		return 0;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = nf_tables_fill_chain_info(msg, ctx->portid, ctx->seq, event, 0,
+					ctx->afi->family, ctx->table,
+					ctx->chain);
+	if (err < 0) {
+		kfree_skb(msg);
+		goto out;
+	}
+
+	err = nfnetlink_send(msg, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+			     ctx->report, GFP_KERNEL);
+out:
+	if (err < 0)
+		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+				  err);
+	return err;
+}
+
+/*
+ * Netlink dump callback: emit one NFT_MSG_NEWCHAIN message per chain of
+ * every table of the requested family (all families for NFPROTO_UNSPEC).
+ *
+ * cb->args[0] holds the index of the next chain to dump so that the walk
+ * can resume where the previous call stopped filling @skb.
+ */
+static int nf_tables_dump_chains(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct net *net = sock_net(skb->sk);
+	unsigned int start = cb->args[0];
+	int family = nfmsg->nfgen_family;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	unsigned int pos = 0;
+
+	rcu_read_lock();
+	cb->seq = net->nft.base_seq;
+
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(chain, &table->chains, list) {
+				/* entries before the resume point only count */
+				if (pos >= start) {
+					if (pos > start)
+						memset(&cb->args[1], 0,
+						       sizeof(cb->args) - sizeof(cb->args[0]));
+					if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid,
+								      cb->nlh->nlmsg_seq,
+								      NFT_MSG_NEWCHAIN,
+								      NLM_F_MULTI,
+								      afi->family, table, chain) < 0)
+						goto done;
+
+					nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+				}
+				pos++;
+			}
+		}
+	}
+done:
+	rcu_read_unlock();
+	cb->args[0] = pos;
+	return skb->len;
+}
+
+/*
+ * NFT_MSG_GETCHAIN handler.
+ *
+ * With NLM_F_DUMP a dump over all chains is started. Otherwise the chain
+ * named by NFTA_CHAIN_TABLE/NFTA_CHAIN_NAME is looked up and sent back to
+ * the requester as a single NFT_MSG_NEWCHAIN message. Inactive tables and
+ * chains are reported as -ENOENT.
+ */
+static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	struct sk_buff *reply;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_chains,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+	if (chain->flags & NFT_CHAIN_INACTIVE)
+		return -ENOENT;
+
+	reply = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	err = nf_tables_fill_chain_info(reply, NETLINK_CB(skb).portid,
+					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
+					family, table, chain);
+	if (err < 0) {
+		kfree_skb(reply);
+		return err;
+	}
+
+	return nlmsg_unicast(nlsk, reply, NETLINK_CB(skb).portid);
+}
+
+/* Netlink attribute policy for the nested NFTA_COUNTER_* attributes. */
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+/*
+ * Allocate per-cpu chain counters and seed them from the nested
+ * NFTA_COUNTER_* attributes in @attr.
+ *
+ * Both NFTA_COUNTER_BYTES and NFTA_COUNTER_PACKETS must be present.
+ *
+ * Returns the new per-cpu stats, or an ERR_PTR(): the nla_parse_nested()
+ * error, -EINVAL when an attribute is missing, -ENOMEM when the allocation
+ * fails.
+ */
+static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
+{
+	struct nlattr *tb[NFTA_COUNTER_MAX+1];
+	struct nft_stats __percpu *newstats;
+	struct nft_stats *stats;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
+		return ERR_PTR(-EINVAL);
+
+	newstats = netdev_alloc_pcpu_stats(struct nft_stats);
+	if (newstats == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Restore old counters on this cpu, no problem. Per-cpu statistics
+	 * are not exposed to userspace.
+	 *
+	 * this_cpu_ptr() must not be used while preemptible, so keep the
+	 * task on one CPU for the duration of the write.
+	 */
+	preempt_disable();
+	stats = this_cpu_ptr(newstats);
+	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	preempt_enable();
+
+	return newstats;
+}
+
+/*
+ * Install @newstats as the counters of base chain @chain.
+ *
+ * If the chain already had counters, the old ones are freed once an RCU
+ * grace period has elapsed after the pointer was switched.
+ */
+static void nft_chain_stats_replace(struct nft_base_chain *chain,
+				    struct nft_stats __percpu *newstats)
+{
+	struct nft_stats __percpu *oldstats;
+
+	if (!chain->stats) {
+		rcu_assign_pointer(chain->stats, newstats);
+		return;
+	}
+
+	oldstats = nft_dereference(chain->stats);
+	rcu_assign_pointer(chain->stats, newstats);
+	synchronize_rcu();
+	free_percpu(oldstats);
+}
+
+/*
+ * Queue a chain transaction of type @msg_type for ctx->chain on the
+ * per-net commit list.
+ *
+ * For NFT_MSG_NEWCHAIN the chain is additionally flagged
+ * NFT_CHAIN_INACTIVE. Returns 0, or -ENOMEM when the transaction object
+ * cannot be allocated.
+ */
+static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+{
+	struct nft_trans *trans =
+		nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
+
+	if (!trans)
+		return -ENOMEM;
+
+	if (msg_type == NFT_MSG_NEWCHAIN)
+		ctx->chain->flags |= NFT_CHAIN_INACTIVE;
+
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+	return 0;
+}
+
+/*
+ * Free @chain. The chain must no longer be in use (BUG otherwise).
+ *
+ * For a base chain this also drops the reference on the chain type's
+ * module and frees the per-cpu counters before freeing the enclosing
+ * nft_base_chain.
+ */
+static void nf_tables_chain_destroy(struct nft_chain *chain)
+{
+	struct nft_base_chain *base;
+
+	BUG_ON(chain->use > 0);
+
+	if (!(chain->flags & NFT_BASE_CHAIN)) {
+		kfree(chain);
+		return;
+	}
+
+	base = nft_base_chain(chain);
+	module_put(base->type->owner);
+	free_percpu(base->stats);
+	kfree(base);
+}
+
+/*
+ * NFT_MSG_NEWCHAIN handler: update an existing chain or create a new one.
+ *
+ * The chain is located by NFTA_CHAIN_HANDLE if present, otherwise by
+ * NFTA_CHAIN_NAME.
+ *
+ * - Existing chain: a NEWCHAIN "update" transaction is queued carrying the
+ *   optional new counters, policy and name. NLM_F_EXCL yields -EEXIST and
+ *   NLM_F_REPLACE yields -EOPNOTSUPP.
+ * - No such chain: a regular chain, or (with NFTA_CHAIN_HOOK) a base chain
+ *   is allocated, its hooks are registered unless the table is dormant,
+ *   and a NEWCHAIN transaction is queued.
+ *
+ * Returns 0 on success or a negative errno. Every error path releases the
+ * resources taken so far (module reference, base chain, per-cpu counters).
+ */
+static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nlattr * uninitialized_var(name);
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_base_chain *basechain = NULL;
+	struct nlattr *ha[NFTA_HOOK_MAX + 1];
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	u8 policy = NF_ACCEPT;
+	u64 handle = 0;
+	unsigned int i;
+	struct nft_stats __percpu *stats;
+	int err;
+	bool create;
+	struct nft_ctx ctx;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = NULL;
+	name = nla[NFTA_CHAIN_NAME];
+
+	if (nla[NFTA_CHAIN_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+		chain = nf_tables_chain_lookup_byhandle(table, handle);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+	} else {
+		/* by name: "not found" simply means a new chain is wanted */
+		chain = nf_tables_chain_lookup(table, name);
+		if (IS_ERR(chain)) {
+			if (PTR_ERR(chain) != -ENOENT)
+				return PTR_ERR(chain);
+			chain = NULL;
+		}
+	}
+
+	if (nla[NFTA_CHAIN_POLICY]) {
+		/* a policy only makes sense for base chains */
+		if ((chain != NULL &&
+		    !(chain->flags & NFT_BASE_CHAIN)) ||
+		    nla[NFTA_CHAIN_HOOK] == NULL)
+			return -EOPNOTSUPP;
+
+		policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
+		switch (policy) {
+		case NF_DROP:
+		case NF_ACCEPT:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (chain != NULL) {
+		struct nft_stats __percpu *newstats = NULL;
+		struct nft_trans *trans;
+
+		if (chain->flags & NFT_CHAIN_INACTIVE)
+			return -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+
+		/* rename request: the new name must not be taken already */
+		if (nla[NFTA_CHAIN_HANDLE] && name &&
+		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
+			return -EEXIST;
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			newstats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
+			if (IS_ERR(newstats))
+				return PTR_ERR(newstats);
+		}
+
+		nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+		trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
+					sizeof(struct nft_trans_chain));
+		if (trans == NULL) {
+			/* do not leak the counters allocated above */
+			free_percpu(newstats);
+			return -ENOMEM;
+		}
+
+		nft_trans_chain_stats(trans) = newstats;
+		nft_trans_chain_update(trans) = true;
+
+		if (nla[NFTA_CHAIN_POLICY])
+			nft_trans_chain_policy(trans) = policy;
+		else
+			nft_trans_chain_policy(trans) = -1;
+
+		if (nla[NFTA_CHAIN_HANDLE] && name) {
+			nla_strlcpy(nft_trans_chain_name(trans), name,
+				    NFT_CHAIN_MAXNAMELEN);
+		}
+		list_add_tail(&trans->list, &net->nft.commit_list);
+		return 0;
+	}
+
+	if (table->use == UINT_MAX)
+		return -EOVERFLOW;
+
+	if (nla[NFTA_CHAIN_HOOK]) {
+		const struct nf_chain_type *type;
+		struct nf_hook_ops *ops;
+		nf_hookfn *hookfn;
+		u32 hooknum, priority;
+
+		type = chain_type[family][NFT_CHAIN_T_DEFAULT];
+		if (nla[NFTA_CHAIN_TYPE]) {
+			type = nf_tables_chain_type_lookup(afi,
+							   nla[NFTA_CHAIN_TYPE],
+							   create);
+			if (IS_ERR(type))
+				return PTR_ERR(type);
+		}
+
+		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+				       nft_hook_policy);
+		if (err < 0)
+			return err;
+		if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+		    ha[NFTA_HOOK_PRIORITY] == NULL)
+			return -EINVAL;
+
+		hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		if (hooknum >= afi->nhooks)
+			return -EINVAL;
+		priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+
+		if (!(type->hook_mask & (1 << hooknum)))
+			return -EOPNOTSUPP;
+		if (!try_module_get(type->owner))
+			return -ENOENT;
+		hookfn = type->hooks[hooknum];
+
+		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+		if (basechain == NULL) {
+			/* drop the reference taken by try_module_get() */
+			module_put(type->owner);
+			return -ENOMEM;
+		}
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
+			if (IS_ERR(stats)) {
+				module_put(type->owner);
+				kfree(basechain);
+				return PTR_ERR(stats);
+			}
+			basechain->stats = stats;
+		} else {
+			/* returns NULL, not an ERR_PTR(), on failure */
+			stats = netdev_alloc_pcpu_stats(struct nft_stats);
+			if (stats == NULL) {
+				module_put(type->owner);
+				kfree(basechain);
+				return -ENOMEM;
+			}
+			rcu_assign_pointer(basechain->stats, stats);
+		}
+
+		basechain->type = type;
+		chain = &basechain->chain;
+
+		for (i = 0; i < afi->nops; i++) {
+			ops = &basechain->ops[i];
+			ops->pf		= family;
+			ops->owner	= afi->owner;
+			ops->hooknum	= hooknum;
+			ops->priority	= priority;
+			ops->priv	= chain;
+			ops->hook	= afi->hooks[ops->hooknum];
+			/* a chain type specific hook overrides the family one */
+			if (hookfn)
+				ops->hook = hookfn;
+			if (afi->hook_ops_init)
+				afi->hook_ops_init(ops, i);
+		}
+
+		chain->flags |= NFT_BASE_CHAIN;
+		basechain->policy = policy;
+	} else {
+		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+		if (chain == NULL)
+			return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&chain->rules);
+	chain->handle = nf_tables_alloc_handle(table);
+	chain->net = net;
+	chain->table = table;
+	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+		if (err < 0)
+			goto err1;
+	}
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+	err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
+	if (err < 0)
+		goto err2;
+
+	table->use++;
+	list_add_tail_rcu(&chain->list, &table->chains);
+	return 0;
+err2:
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    chain->flags & NFT_BASE_CHAIN) {
+		nf_unregister_hooks(nft_base_chain(chain)->ops,
+				    afi->nops);
+	}
+err1:
+	nf_tables_chain_destroy(chain);
+	return err;
+}
+
+/*
+ * NFT_MSG_DELCHAIN handler.
+ *
+ * Looks up the chain by table and name, refuses inactive tables/chains
+ * (-ENOENT) and chains still in use (-EBUSY), queues a DELCHAIN
+ * transaction and unlinks the chain from its table.
+ */
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+			      const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_ctx ctx;
+	int ret;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+	if (chain->flags & NFT_CHAIN_INACTIVE)
+		return -ENOENT;
+	if (chain->use > 0)
+		return -EBUSY;
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+	ret = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN);
+	if (ret < 0)
+		return ret;
+
+	table->use--;
+	list_del_rcu(&chain->list);
+	return 0;
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ *	nft_register_expr - register nf_tables expr type
+ *	@type: expr type
+ *
+ *	Registers the expr type for use with nf_tables. Family specific
+ *	types are inserted at the head of the list and NFPROTO_UNSPEC types
+ *	at the tail, so a list walk meets family specific types first.
+ *	Always returns zero.
+ */
+int nft_register_expr(struct nft_expr_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	if (type->family != NFPROTO_UNSPEC)
+		list_add_rcu(&type->list, &nf_tables_expressions);
+	else
+		list_add_tail_rcu(&type->list, &nf_tables_expressions);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ *	nft_unregister_expr - unregister nf_tables expr type
+ *	@type: expr type
+ *
+ *	Unregisters the expr type for use with nf_tables by removing it
+ *	from the list of known expressions under the subsystem lock.
+ */
+void nft_unregister_expr(struct nft_expr_type *type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del_rcu(&type->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_expr);
+
+/*
+ * Return the first registered expression type whose name matches @nla and
+ * that is either family independent (family 0) or bound to @family.
+ * Returns NULL when there is no such type.
+ */
+static const struct nft_expr_type *__nft_expr_type_get(u8 family,
+						       struct nlattr *nla)
+{
+	const struct nft_expr_type *cur;
+
+	list_for_each_entry(cur, &nf_tables_expressions, list) {
+		if (nla_strcmp(nla, cur->name) != 0)
+			continue;
+		if (cur->family == 0 || cur->family == family)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Look up the expression type named by @nla for @family and take a
+ * reference on its owning module.
+ *
+ * With CONFIG_MODULES an unknown type triggers a module load attempt, first
+ * for the family specific alias and then for the generic one. The nfnl
+ * lock is dropped around request_module(); if the type shows up afterwards
+ * ERR_PTR(-EAGAIN) is returned rather than the type itself.
+ *
+ * Returns the type, ERR_PTR(-EINVAL) for a NULL @nla, ERR_PTR(-EAGAIN) as
+ * described above, or ERR_PTR(-ENOENT).
+ */
+static const struct nft_expr_type *nft_expr_type_get(u8 family,
+						     struct nlattr *nla)
+{
+	const struct nft_expr_type *type;
+
+	if (!nla)
+		return ERR_PTR(-EINVAL);
+
+	type = __nft_expr_type_get(family, nla);
+	if (type != NULL) {
+		/* known type: succeed only if its module can be pinned */
+		if (try_module_get(type->owner))
+			return type;
+		return ERR_PTR(-ENOENT);
+	}
+
+#ifdef CONFIG_MODULES
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	request_module("nft-expr-%u-%.*s", family,
+		       nla_len(nla), (char *)nla_data(nla));
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	if (__nft_expr_type_get(family, nla))
+		return ERR_PTR(-EAGAIN);
+
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	request_module("nft-expr-%.*s",
+		       nla_len(nla), (char *)nla_data(nla));
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	if (__nft_expr_type_get(family, nla))
+		return ERR_PTR(-EAGAIN);
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+/* Netlink attribute policy for the NFTA_EXPR_* attributes. */
+static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
+	[NFTA_EXPR_NAME]	= { .type = NLA_STRING },
+	[NFTA_EXPR_DATA]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Emit the description of @expr into @skb: its type name and, when the
+ * expression provides a dump operation, a nested NFTA_EXPR_DATA attribute
+ * filled in by that operation.
+ *
+ * Returns skb->len on success or -1 when @skb ran out of room or the dump
+ * operation failed.
+ */
+static int nf_tables_fill_expr_info(struct sk_buff *skb,
+				    const struct nft_expr *expr)
+{
+	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
+		goto nla_put_failure;
+
+	if (expr->ops->dump) {
+		struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+
+		if (data == NULL)
+			goto nla_put_failure;
+		if (expr->ops->dump(skb, expr) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, data);
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+/*
+ * Result of parsing one expression attribute: the operations selected for
+ * the expression and its parsed, type specific attributes.
+ */
+struct nft_expr_info {
+	const struct nft_expr_ops	*ops;	/* selected operations */
+	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1]; /* parsed attrs */
+};
+
+/*
+ * Parse the expression attribute @nla into @info.
+ *
+ * Resolves the expression type (taking a module reference), parses the
+ * optional NFTA_EXPR_DATA payload with the type's policy into info->tb
+ * (zeroed when absent) and selects the operations, through the type's
+ * select_ops() callback when there is one.
+ *
+ * Returns 0 on success; on failure the module reference is dropped again
+ * and a negative errno is returned.
+ */
+static int nf_tables_expr_parse(const struct nft_ctx *ctx,
+				const struct nlattr *nla,
+				struct nft_expr_info *info)
+{
+	struct nlattr *tb[NFTA_EXPR_MAX + 1];
+	const struct nft_expr_type *type;
+	const struct nft_expr_ops *ops;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+	if (err < 0)
+		return err;
+
+	type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
+	if (IS_ERR(type))
+		return PTR_ERR(type);
+
+	if (tb[NFTA_EXPR_DATA] == NULL) {
+		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
+	} else {
+		err = nla_parse_nested(info->tb, type->maxattr,
+				       tb[NFTA_EXPR_DATA], type->policy);
+		if (err < 0)
+			goto put_type;
+	}
+
+	if (type->select_ops == NULL) {
+		ops = type->ops;
+	} else {
+		ops = type->select_ops(ctx,
+				       (const struct nlattr * const *)info->tb);
+		if (IS_ERR(ops)) {
+			err = PTR_ERR(ops);
+			goto put_type;
+		}
+	}
+
+	info->ops = ops;
+	return 0;
+
+put_type:
+	module_put(type->owner);
+	return err;
+}
+
+/*
+ * Initialize @expr from the parsed @info.
+ *
+ * expr->ops is set to the selected operations and their init callback, if
+ * any, is run. When init fails expr->ops is reset to NULL so the
+ * expression reads as uninitialized, and the error is returned.
+ */
+static int nf_tables_newexpr(const struct nft_ctx *ctx,
+			     const struct nft_expr_info *info,
+			     struct nft_expr *expr)
+{
+	const struct nft_expr_ops *ops = info->ops;
+	int err;
+
+	expr->ops = ops;
+	if (!ops->init)
+		return 0;
+
+	err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+	if (err < 0) {
+		expr->ops = NULL;
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down @expr: run its destroy callback, if it has one, and drop the
+ * reference on the module owning the expression type.
+ */
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
+				   struct nft_expr *expr)
+{
+	const struct nft_expr_ops *ops = expr->ops;
+
+	if (ops->destroy)
+		ops->destroy(ctx, expr);
+	module_put(expr->ops->type->owner);
+}
+
+/*
+ * Rules
+ */
+
+/*
+ * Find the rule with handle @handle in @chain.
+ *
+ * Returns the rule or ERR_PTR(-ENOENT).
+ */
+static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
+						u64 handle)
+{
+	struct nft_rule *cur;
+
+	/* FIXME: linear walk over every rule of the chain */
+	list_for_each_entry(cur, &chain->rules, list) {
+		if (cur->handle == handle)
+			return cur;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Find the rule in @chain whose handle is carried, as a big endian 64 bit
+ * value, in attribute @nla.
+ *
+ * Returns the rule, ERR_PTR(-EINVAL) for a NULL @nla, or ERR_PTR(-ENOENT).
+ */
+static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
+					      const struct nlattr *nla)
+{
+	u64 handle;
+
+	if (!nla)
+		return ERR_PTR(-EINVAL);
+
+	handle = be64_to_cpu(nla_get_be64(nla));
+	return __nf_tables_rule_lookup(chain, handle);
+}
+
+/* Netlink attribute policy for the NFTA_RULE_* attributes. */
+static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
+	[NFTA_RULE_TABLE]	= { .type = NLA_STRING },
+	[NFTA_RULE_CHAIN]	= { .type = NLA_STRING,
+				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
+	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
+	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
+	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED },
+	[NFTA_RULE_POSITION]	= { .type = NLA_U64 },
+	/* opaque user data, limited in size */
+	[NFTA_RULE_USERDATA]	= { .type = NLA_BINARY,
+				    .len = NFT_USERDATA_MAXLEN },
+};
+
+/*
+ * Build one rule netlink message in @skb.
+ *
+ * Emits table name, chain name and rule handle; unless this is a DELRULE
+ * event or the rule is the first of its chain, the handle of the preceding
+ * rule as NFTA_RULE_POSITION; the list of expressions; and the user data
+ * when the rule has any.
+ *
+ * Returns the result of nlmsg_end() on success. On failure the message is
+ * trimmed off @skb again and -1 is returned.
+ */
+static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
+				    int event, u32 flags, int family,
+				    const struct nft_table *table,
+				    const struct nft_chain *chain,
+				    const struct nft_rule *rule)
+{
+	int type = event | NFNL_SUBSYS_NFTABLES << 8;
+	const struct nft_expr *expr, *next;
+	struct nlattr *exprs;
+	struct nfgenmsg *hdr;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg),
+			flags);
+	if (!nlh)
+		goto nla_put_failure;
+
+	hdr = nlmsg_data(nlh);
+	hdr->nfgen_family = family;
+	hdr->version = NFNETLINK_V0;
+	hdr->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_RULE_TABLE, table->name) ||
+	    nla_put_string(skb, NFTA_RULE_CHAIN, chain->name) ||
+	    nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+		goto nla_put_failure;
+
+	if (event != NFT_MSG_DELRULE && rule->list.prev != &chain->rules) {
+		const struct nft_rule *prev_rule =
+			list_entry(rule->list.prev, struct nft_rule, list);
+
+		if (nla_put_be64(skb, NFTA_RULE_POSITION,
+				 cpu_to_be64(prev_rule->handle)))
+			goto nla_put_failure;
+	}
+
+	exprs = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+	if (!exprs)
+		goto nla_put_failure;
+	nft_rule_for_each_expr(expr, next, rule) {
+		struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM);
+
+		if (!elem)
+			goto nla_put_failure;
+		if (nf_tables_fill_expr_info(skb, expr) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, elem);
+	}
+	nla_nest_end(skb, exprs);
+
+	if (rule->ulen &&
+	    nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule)))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+/*
+ * Send a rule @event notification for @rule to the NFNLGRP_NFTABLES group.
+ *
+ * Nothing is sent (and 0 is returned) when the request did not ask for a
+ * report and the group has no listeners. On any failure the error is also
+ * recorded on the group through nfnetlink_set_err().
+ */
+static int nf_tables_rule_notify(const struct nft_ctx *ctx,
+				 const struct nft_rule *rule,
+				 int event)
+{
+	struct sk_buff *msg;
+	int err;
+
+	if (!ctx->report &&
+	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+		return 0;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = nf_tables_fill_rule_info(msg, ctx->portid, ctx->seq, event, 0,
+				       ctx->afi->family, ctx->table,
+				       ctx->chain, rule);
+	if (err < 0) {
+		kfree_skb(msg);
+		goto out;
+	}
+
+	err = nfnetlink_send(msg, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+			     ctx->report, GFP_KERNEL);
+out:
+	if (err < 0)
+		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+				  err);
+	return err;
+}
+
+/* A rule is active when the bit of the current generation is clear. */
+static inline bool
+nft_rule_is_active(struct net *net, const struct nft_rule *rule)
+{
+	return !(rule->genmask & (1 << net->nft.gencursor));
+}
+
+/* Index of the next generation: 1 when the cursor is at 0, otherwise 0. */
+static inline int gencursor_next(struct net *net)
+{
+	if (net->nft.gencursor + 1 == 1)
+		return 1;
+	return 0;
+}
+
+/* Non-zero when the bit of the next generation is clear in the rule. */
+static inline int
+nft_rule_is_active_next(struct net *net, const struct nft_rule *rule)
+{
+	return !(rule->genmask & (1 << gencursor_next(net)));
+}
+
+/*
+ * Mark @rule inactive in the current generation only, so that it is
+ * inactive now and becomes active in the future.
+ */
+static inline void
+nft_rule_activate_next(struct net *net, struct nft_rule *rule)
+{
+	rule->genmask = 1 << net->nft.gencursor;
+}
+
+/* Mark @rule inactive in the next generation only. */
+static inline void
+nft_rule_disactivate_next(struct net *net, struct nft_rule *rule)
+{
+	rule->genmask = 1 << gencursor_next(net);
+}
+
+/* Clear every generation bit of @rule; @net is not used. */
+static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
+{
+	rule->genmask = 0;
+}
+
+/*
+ * Netlink dump callback: emit one NFT_MSG_NEWRULE message per active rule
+ * of every chain of every table of the requested family (all families for
+ * NFPROTO_UNSPEC).
+ *
+ * cb->args[0] holds the index of the next rule to dump. Inactive rules are
+ * not dumped but still advance the index.
+ */
+static int nf_tables_dump_rules(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct net *net = sock_net(skb->sk);
+	unsigned int start = cb->args[0];
+	int family = nfmsg->nfgen_family;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	const struct nft_rule *rule;
+	unsigned int pos = 0;
+
+	rcu_read_lock();
+	cb->seq = net->nft.base_seq;
+
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(chain, &table->chains, list) {
+				list_for_each_entry_rcu(rule, &chain->rules, list) {
+					if (nft_rule_is_active(net, rule) &&
+					    pos >= start) {
+						if (pos > start)
+							memset(&cb->args[1], 0,
+							       sizeof(cb->args) - sizeof(cb->args[0]));
+						if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid,
+									     cb->nlh->nlmsg_seq,
+									     NFT_MSG_NEWRULE,
+									     NLM_F_MULTI | NLM_F_APPEND,
+									     afi->family, table, chain, rule) < 0)
+							goto done;
+
+						nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+					}
+					pos++;
+				}
+			}
+		}
+	}
+done:
+	rcu_read_unlock();
+
+	cb->args[0] = pos;
+	return skb->len;
+}
+
+/*
+ * NFT_MSG_GETRULE handler.
+ *
+ * With NLM_F_DUMP a dump over all rules is started. Otherwise the rule
+ * identified by table, chain and handle is looked up and sent back to the
+ * requester as a single NFT_MSG_NEWRULE message. Inactive tables and
+ * chains are reported as -ENOENT.
+ */
+static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+	const struct nft_rule *rule;
+	struct sk_buff *reply;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_rules,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+	if (chain->flags & NFT_CHAIN_INACTIVE)
+		return -ENOENT;
+
+	rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+	if (IS_ERR(rule))
+		return PTR_ERR(rule);
+
+	reply = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	err = nf_tables_fill_rule_info(reply, NETLINK_CB(skb).portid,
+				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+				       family, table, chain, rule);
+	if (err < 0) {
+		kfree_skb(reply);
+		return err;
+	}
+
+	return nlmsg_unicast(nlsk, reply, NETLINK_CB(skb).portid);
+}
+
+/*
+ * Destroy every initialized expression of @rule and free the rule.
+ *
+ * Careful: some expressions might not be initialized in case this is
+ * called on error from nf_tables_newrule(); the walk stops at the first
+ * expression without ops.
+ */
+static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
+				   struct nft_rule *rule)
+{
+	struct nft_expr *expr;
+
+	expr = nft_expr_first(rule);
+	/*
+	 * Test the end of the expression area first: nft_expr_last() points
+	 * past the last expression, so expr->ops must not be read there.
+	 */
+	while (expr != nft_expr_last(rule) && expr->ops) {
+		nf_tables_expr_destroy(ctx, expr);
+		expr = nft_expr_next(expr);
+	}
+	kfree(rule);
+}
+
+/*
+ * Queue a rule transaction of type @msg_type for @rule on the per-net
+ * commit list.
+ *
+ * Returns the transaction, or NULL when it cannot be allocated.
+ */
+static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
+					    struct nft_rule *rule)
+{
+	struct nft_trans *trans =
+		nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule));
+
+	if (!trans)
+		return NULL;
+
+	nft_trans_rule(trans) = rule;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+	return trans;
+}
+
+/* Upper bound on the number of expressions accepted in a single rule. */
+#define NFT_RULE_MAXEXPRS	128
+
+/*
+ * Scratch array of parsed expressions, indexed up to NFT_RULE_MAXEXPRS by
+ * nf_tables_newrule(). NOTE(review): it is allocated outside this part of
+ * the file - confirm its size against NFT_RULE_MAXEXPRS there.
+ */
+static struct nft_expr_info *info;
+
+/*
+ * NFT_MSG_NEWRULE handler: add a rule to a chain or replace an existing
+ * one.
+ *
+ * - With NFTA_RULE_HANDLE the rule must exist and NLM_F_REPLACE must be
+ *   set: the old rule is deactivated for the next generation and the new
+ *   rule is linked in front of it.
+ * - Without a handle a new handle is allocated; NLM_F_CREATE is required.
+ *   NLM_F_APPEND selects insertion at the tail of the chain (or after the
+ *   NFTA_RULE_POSITION rule), otherwise the rule goes to the head of the
+ *   chain (or before the position rule).
+ *
+ * Expressions are parsed into the shared info[] array first, then the rule
+ * is allocated with room for all of them plus the user data and each
+ * expression is initialized in place.
+ *
+ * Returns 0 on success or a negative errno; on failure the module
+ * references of all expressions that were not handed over to the rule are
+ * dropped.
+ */
+static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
+	struct nft_table *table;
+	struct nft_chain *chain;
+	struct nft_rule *rule, *old_rule = NULL;
+	struct nft_trans *trans = NULL;
+	struct nft_expr *expr;
+	struct nft_ctx ctx;
+	struct nlattr *tmp;
+	unsigned int size, i, n, ulen = 0;
+	int err, rem;
+	bool create;
+	u64 handle, pos_handle;
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+	if (IS_ERR(chain))
+		return PTR_ERR(chain);
+
+	if (nla[NFTA_RULE_HANDLE]) {
+		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
+		rule = __nf_tables_rule_lookup(chain, handle);
+		if (IS_ERR(rule))
+			return PTR_ERR(rule);
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			old_rule = rule;
+		else
+			return -EOPNOTSUPP;
+	} else {
+		if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EINVAL;
+		handle = nf_tables_alloc_handle(table);
+
+		if (chain->use == UINT_MAX)
+			return -EOVERFLOW;
+	}
+
+	if (nla[NFTA_RULE_POSITION]) {
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+			return -EOPNOTSUPP;
+
+		pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
+		old_rule = __nf_tables_rule_lookup(chain, pos_handle);
+		if (IS_ERR(old_rule))
+			return PTR_ERR(old_rule);
+	}
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+	/* pass 1: parse all expressions and add up their sizes */
+	n = 0;
+	size = 0;
+	if (nla[NFTA_RULE_EXPRESSIONS]) {
+		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
+			err = -EINVAL;
+			if (nla_type(tmp) != NFTA_LIST_ELEM)
+				goto err1;
+			if (n == NFT_RULE_MAXEXPRS)
+				goto err1;
+			err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
+			if (err < 0)
+				goto err1;
+			size += info[n].ops->size;
+			n++;
+		}
+	}
+
+	if (nla[NFTA_RULE_USERDATA])
+		ulen = nla_len(nla[NFTA_RULE_USERDATA]);
+
+	err = -ENOMEM;
+	rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL);
+	if (rule == NULL)
+		goto err1;
+
+	nft_rule_activate_next(net, rule);
+
+	rule->handle = handle;
+	rule->dlen   = size;
+	rule->ulen   = ulen;
+
+	if (ulen)
+		nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen);
+
+	/* pass 2: initialize the expressions inside the rule */
+	expr = nft_expr_first(rule);
+	for (i = 0; i < n; i++) {
+		err = nf_tables_newexpr(&ctx, &info[i], expr);
+		if (err < 0)
+			goto err2;
+		/* the rule owns the module reference from now on */
+		info[i].ops = NULL;
+		expr = nft_expr_next(expr);
+	}
+
+	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+		if (nft_rule_is_active_next(net, old_rule)) {
+			trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
+						   old_rule);
+			if (trans == NULL) {
+				err = -ENOMEM;
+				goto err2;
+			}
+			nft_rule_disactivate_next(net, old_rule);
+			chain->use--;
+			list_add_tail_rcu(&rule->list, &old_rule->list);
+		} else {
+			err = -ENOENT;
+			goto err2;
+		}
+	} else if (nlh->nlmsg_flags & NLM_F_APPEND) {
+		if (old_rule)
+			list_add_rcu(&rule->list, &old_rule->list);
+		else
+			list_add_tail_rcu(&rule->list, &chain->rules);
+	} else {
+		if (old_rule)
+			list_add_tail_rcu(&rule->list, &old_rule->list);
+		else
+			list_add_rcu(&rule->list, &chain->rules);
+	}
+
+	if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+		err = -ENOMEM;
+		goto err3;
+	}
+	chain->use++;
+	return 0;
+
+err3:
+	list_del_rcu(&rule->list);
+	if (trans) {
+		/*
+		 * Undo the pending replacement. The old rule has to stay
+		 * linked in the chain: only its deactivation and the queued
+		 * DELRULE transaction are reverted.
+		 */
+		nft_rule_clear(net, nft_trans_rule(trans));
+		nft_trans_destroy(trans);
+		chain->use++;
+	}
+err2:
+	nf_tables_rule_destroy(&ctx, rule);
+err1:
+	for (i = 0; i < n; i++) {
+		if (info[i].ops != NULL)
+			module_put(info[i].ops->type->owner);
+	}
+	return err;
+}
+
+/*
+ * Schedule @rule for deletion: queue a DELRULE transaction, deactivate the
+ * rule for the next generation and drop the chain's use count.
+ *
+ * Returns -ENOENT when the rule is already inactive in the next generation
+ * (you cannot delete the same rule twice) and -ENOMEM when the transaction
+ * cannot be allocated.
+ */
+static int
+nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule)
+{
+	if (!nft_rule_is_active_next(ctx->net, rule))
+		return -ENOENT;
+
+	if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL)
+		return -ENOMEM;
+
+	nft_rule_disactivate_next(ctx->net, rule);
+	ctx->chain->use--;
+	return 0;
+}
+
+/*
+ * Schedule every rule of ctx->chain for deletion, stopping at and
+ * returning the first error.
+ */
+static int nf_table_delrule_by_chain(struct nft_ctx *ctx)
+{
+	struct nft_rule *cur;
+
+	list_for_each_entry(cur, &ctx->chain->rules, list) {
+		int ret = nf_tables_delrule_one(ctx, cur);
+
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * NFT_MSG_DELRULE handler.
+ *
+ * Depending on the attributes given this deletes a single rule (chain and
+ * handle), all rules of one chain (chain only) or all rules of every chain
+ * of the table (no chain).
+ */
+static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
+			     const struct nlmsghdr *nlh,
+			     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+	struct nft_chain *chain = NULL;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_rule *rule;
+	struct nft_ctx ctx;
+	int err = 0;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+	if (nla[NFTA_RULE_CHAIN]) {
+		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+	}
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+	if (!chain) {
+		/* no chain given: flush the rules of every chain */
+		list_for_each_entry(chain, &table->chains, list) {
+			ctx.chain = chain;
+			err = nf_table_delrule_by_chain(&ctx);
+			if (err < 0)
+				break;
+		}
+		return err;
+	}
+
+	if (!nla[NFTA_RULE_HANDLE])
+		return nf_table_delrule_by_chain(&ctx);
+
+	rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+	if (IS_ERR(rule))
+		return PTR_ERR(rule);
+
+	return nf_tables_delrule_one(&ctx, rule);
+}
+
+/*
+ * Sets
+ */
+
+/* Registered set implementations (struct nft_set_ops), linked through ops->list. */
+static LIST_HEAD(nf_tables_set_ops);
+
+/*
+ * Register a set implementation by appending it to nf_tables_set_ops.
+ * The list is modified under the nfnetlink nftables subsystem lock.
+ * Always returns 0.
+ */
+int nft_register_set(struct nft_set_ops *ops)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_tail_rcu(&ops->list, &nf_tables_set_ops);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_set);
+
+/*
+ * Unregister a set implementation: unlink it from nf_tables_set_ops under
+ * the nfnetlink nftables subsystem lock.
+ */
+void nft_unregister_set(struct nft_set_ops *ops)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&ops->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_set);
+
+/*
+ * Select a set implementation based on the data characteristics and the
+ * given policy. The total memory use might not be known if no size is
+ * given, in that case the amount of memory per element is used.
+ *
+ * On success the returned ops has a module reference taken with
+ * try_module_get(); the caller must drop it with module_put() when done.
+ * Returns ERR_PTR(-EOPNOTSUPP) if no registered implementation qualifies,
+ * or ERR_PTR(-EAGAIN) if implementations only became available after a
+ * module load (during which the subsystem lock was dropped).
+ */
+static const struct nft_set_ops *
+nft_select_set_ops(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc,
+ enum nft_set_policies policy)
+{
+ const struct nft_set_ops *ops, *bops;
+ struct nft_set_estimate est, best;
+ u32 features;
+
+#ifdef CONFIG_MODULES
+ /* Nothing registered yet: try to load a set module without the lock held */
+ if (list_empty(&nf_tables_set_ops)) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-set");
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (!list_empty(&nf_tables_set_ops))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ /* Only the interval and map flags constrain which implementation fits */
+ features = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ features &= NFT_SET_INTERVAL | NFT_SET_MAP;
+ }
+
+ /* Start from the worst possible estimate so any candidate beats it */
+ bops = NULL;
+ best.size = ~0;
+ best.class = ~0;
+
+ list_for_each_entry(ops, &nf_tables_set_ops, list) {
+ if ((ops->features & features) != features)
+ continue;
+ if (!ops->estimate(desc, features, &est))
+ continue;
+
+ /*
+ * "break" leaves the switch and accepts the candidate as the new
+ * best; "continue" skips to the next registered implementation.
+ */
+ switch (policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ if (est.class < best.class)
+ break;
+ if (est.class == best.class && est.size < best.size)
+ break;
+ continue;
+ case NFT_SET_POL_MEMORY:
+ if (est.size < best.size)
+ break;
+ if (est.size == best.size && est.class < best.class)
+ break;
+ continue;
+ default:
+ break;
+ }
+
+ /* Pin the new best and release the reference on the previous one */
+ if (!try_module_get(ops->owner))
+ continue;
+ if (bops != NULL)
+ module_put(bops->owner);
+
+ bops = ops;
+ best = est;
+ }
+
+ if (bops != NULL)
+ return bops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Netlink attribute policy for the NFTA_SET_* attributes (used by the
+ * NEWSET/GETSET/DELSET callbacks and by the set dump parser).
+ */
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+ [NFTA_SET_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_NAME] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFTA_SET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_POLICY] = { .type = NLA_U32 },
+ [NFTA_SET_DESC] = { .type = NLA_NESTED },
+ [NFTA_SET_ID] = { .type = NLA_U32 },
+};
+
+/* Netlink attribute policy for the attributes nested inside NFTA_SET_DESC. */
+static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
+ [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+};
+
+/*
+ * Build an nft_ctx from the attributes of a set request.
+ *
+ * The address family is only resolved when the message does not use
+ * NFPROTO_UNSPEC, and the table only when NFTA_SET_TABLE is present, so
+ * ctx->afi and/or ctx->table may legitimately end up NULL.  Callers that
+ * need a table must check for the attribute themselves.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT if a table is named without a
+ * family, -ENOENT if the table is inactive, or a lookup error.
+ */
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+				     const struct sk_buff *skb,
+				     const struct nlmsghdr *nlh,
+				     const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct net *net = sock_net(skb->sk);
+	struct nft_table *table = NULL;
+	struct nft_af_info *afi = NULL;
+
+	if (nfmsg->nfgen_family != NFPROTO_UNSPEC) {
+		afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+		if (IS_ERR(afi))
+			return PTR_ERR(afi);
+	}
+
+	if (nla[NFTA_SET_TABLE] == NULL)
+		goto init;
+
+	/* A table can only be resolved within a concrete family */
+	if (!afi)
+		return -EAFNOSUPPORT;
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+	if (table->flags & NFT_TABLE_INACTIVE)
+		return -ENOENT;
+
+init:
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+	return 0;
+}
+
+/*
+ * Find a set by name in @table.
+ *
+ * Returns the set, ERR_PTR(-EINVAL) when no name attribute was supplied,
+ * or ERR_PTR(-ENOENT) when the table has no set with that name.
+ * @table must not be NULL.
+ */
+struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+				     const struct nlattr *nla)
+{
+	struct nft_set *s;
+
+	if (!nla)
+		return ERR_PTR(-EINVAL);
+
+	list_for_each_entry(s, &table->sets, list) {
+		if (nla_strcmp(nla, s->name) == 0)
+			return s;
+	}
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Find a set created earlier in the pending transaction by its set ID.
+ *
+ * Walks the per-netns commit list looking for an NFT_MSG_NEWSET entry whose
+ * recorded ID matches the one carried in @nla.  Returns the set or
+ * ERR_PTR(-ENOENT).
+ */
+struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
+					  const struct nlattr *nla)
+{
+	u32 wanted = ntohl(nla_get_be32(nla));
+	struct nft_trans *t;
+
+	list_for_each_entry(t, &net->nft.commit_list, list) {
+		if (t->msg_type != NFT_MSG_NEWSET)
+			continue;
+		if (wanted == nft_trans_set_id(t))
+			return nft_trans_set(t);
+	}
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Fill in set->name from the requested @name.
+ *
+ * If @name contains a '%', it must be a single "%d" template; the lowest
+ * number not used by an existing set of the table matching the template is
+ * substituted.  A zeroed page serves as a bitmap of used numbers, covering
+ * BITS_PER_BYTE * PAGE_SIZE candidates at a time starting at "min".
+ *
+ * Returns 0 on success, -EINVAL for a malformed template, -ENOMEM if the
+ * bitmap page cannot be allocated, or -ENFILE if the resulting name is
+ * already taken by another set of the table.
+ */
+static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+ const char *name)
+{
+ const struct nft_set *i;
+ const char *p;
+ unsigned long *inuse;
+ unsigned int n = 0, min = 0;
+
+ p = strnchr(name, IFNAMSIZ, '%');
+ if (p != NULL) {
+ /* Only one conversion, and it has to be %d */
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
+ inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (inuse == NULL)
+ return -ENOMEM;
+cont:
+ /* Mark every number in the current window that an existing set uses */
+ list_for_each_entry(i, &ctx->table->sets, list) {
+ int tmp;
+
+ if (!sscanf(i->name, name, &tmp))
+ continue;
+ if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
+ continue;
+
+ set_bit(tmp - min, inuse);
+ }
+
+ n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
+ if (n >= BITS_PER_BYTE * PAGE_SIZE) {
+ /* Window exhausted: slide it forward and rescan */
+ min += BITS_PER_BYTE * PAGE_SIZE;
+ memset(inuse, 0, PAGE_SIZE);
+ goto cont;
+ }
+ free_page((unsigned long)inuse);
+ }
+
+ /* Without a template n and min are 0 and name is copied as-is */
+ snprintf(set->name, sizeof(set->name), name, min + n);
+ list_for_each_entry(i, &ctx->table->sets, list) {
+ if (!strcmp(set->name, i->name))
+ return -ENFILE;
+ }
+ return 0;
+}
+
+/*
+ * Serialise the description of @set into @skb as one nfnetlink message.
+ *
+ * The message carries table name, set name, flags (if non-zero), key
+ * type/length, data type/length (maps only) and a nested NFTA_SET_DESC
+ * holding the size when one is configured.
+ *
+ * Returns the result of nlmsg_end() on success; on lack of room the
+ * partial message is trimmed from the skb and -1 is returned.
+ */
+static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
+			      const struct nft_set *set, u16 event, u16 flags)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *hdr;
+	struct nlattr *nest;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, ctx->portid, ctx->seq, event,
+			sizeof(struct nfgenmsg), flags);
+	if (!nlh)
+		goto nla_put_failure;
+
+	hdr = nlmsg_data(nlh);
+	hdr->nfgen_family = ctx->afi->family;
+	hdr->version = NFNETLINK_V0;
+	hdr->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name) ||
+	    nla_put_string(skb, NFTA_SET_NAME, set->name))
+		goto nla_put_failure;
+
+	if (set->flags != 0 &&
+	    nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype)) ||
+	    nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen)))
+		goto nla_put_failure;
+
+	/* Data type/length only make sense for maps */
+	if ((set->flags & NFT_SET_MAP) &&
+	    (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype)) ||
+	     nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))))
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, NFTA_SET_DESC);
+	if (!nest)
+		goto nla_put_failure;
+	if (set->size &&
+	    nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+/*
+ * Send a set event (@event) to NFNLGRP_NFTABLES listeners.
+ *
+ * Nothing is sent when the requester did not ask for a report and nobody
+ * listens on the group.  Any failure is also propagated to listeners with
+ * nfnetlink_set_err().  Returns 0 or a negative error.
+ */
+static int nf_tables_set_notify(const struct nft_ctx *ctx,
+				const struct nft_set *set,
+				int event, gfp_t gfp_flags)
+{
+	u32 portid = ctx->portid;
+	struct sk_buff *skb;
+	int ret;
+
+	if (!ctx->report &&
+	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+		return 0;
+
+	skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
+	if (!skb) {
+		ret = -ENOBUFS;
+		goto out;
+	}
+
+	ret = nf_tables_fill_set(skb, ctx, set, event, 0);
+	if (ret < 0) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	ret = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES,
+			     ctx->report, gfp_flags);
+out:
+	if (ret < 0)
+		nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, ret);
+	return ret;
+}
+
+/*
+ * Dump all sets of ctx->table into @skb.
+ *
+ * Resume state lives in the netlink callback: cb->args[0] is the index of
+ * the first set still to be dumped, cb->args[1] is set to 1 once the whole
+ * table has been dumped.  Always returns skb->len.
+ */
+static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ unsigned int idx = 0, s_idx = cb->args[0];
+
+ /* A previous invocation already finished the dump */
+ if (cb->args[1])
+ return skb->len;
+
+ rcu_read_lock();
+ cb->seq = ctx->net->nft.base_seq;
+
+ list_for_each_entry_rcu(set, &ctx->table->sets, list) {
+ /* Skip what earlier invocations already emitted */
+ if (idx < s_idx)
+ goto cont;
+ if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET,
+ NLM_F_MULTI) < 0) {
+ /* skb is full: remember where to resume */
+ cb->args[0] = idx;
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ cb->args[1] = 1;
+done:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+/*
+ * Dump the sets of every table of ctx->afi into @skb.
+ *
+ * Resume state lives in the netlink callback:
+ *   cb->args[0]  index of the first set still to dump in the resumed table
+ *   cb->args[1]  set to 1 once everything has been dumped
+ *   cb->args[2]  table in which the previous invocation ran out of room
+ *
+ * ctx->table is updated to the table being dumped.  Always returns
+ * skb->len.
+ */
+static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb,
+				      struct netlink_callback *cb)
+{
+	const struct nft_set *set;
+	unsigned int idx, s_idx = cb->args[0];
+	struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
+
+	/* A previous invocation already finished the dump */
+	if (cb->args[1])
+		return skb->len;
+
+	rcu_read_lock();
+	cb->seq = ctx->net->nft.base_seq;
+
+	list_for_each_entry_rcu(table, &ctx->afi->tables, list) {
+		/* Fast-forward to the table we stopped in last time */
+		if (cur_table) {
+			if (cur_table != table)
+				continue;
+
+			cur_table = NULL;
+		}
+		ctx->table = table;
+		idx = 0;
+		list_for_each_entry_rcu(set, &ctx->table->sets, list) {
+			if (idx < s_idx)
+				goto cont;
+			if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET,
+					       NLM_F_MULTI) < 0) {
+				/* skb is full: remember where to resume */
+				cb->args[0] = idx;
+				cb->args[2] = (unsigned long) table;
+				goto done;
+			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+			idx++;
+		}
+		/*
+		 * The resume index only applies to the table we stopped in;
+		 * every following table has to be dumped from its first set,
+		 * otherwise its leading sets would be silently skipped.
+		 */
+		if (s_idx)
+			s_idx = 0;
+	}
+	cb->args[1] = 1;
+done:
+	rcu_read_unlock();
+	return skb->len;
+}
+
+/*
+ * Dump the sets of every table of every address family of the netns.
+ *
+ * Resume state lives in the netlink callback:
+ *   cb->args[0]  index of the first set still to dump in the resumed table
+ *   cb->args[1]  set to 1 once everything has been dumped
+ *   cb->args[2]  table in which the previous invocation ran out of room
+ *   cb->args[3]  family of that table
+ *
+ * ctx->afi and ctx->table are updated to the objects being dumped.
+ * Always returns skb->len.
+ */
+static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ unsigned int idx, s_idx = cb->args[0];
+ struct nft_af_info *afi;
+ struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
+ struct net *net = sock_net(skb->sk);
+ int cur_family = cb->args[3];
+
+ /* A previous invocation already finished the dump */
+ if (cb->args[1])
+ return skb->len;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ /* Fast-forward to the family we stopped in last time */
+ if (cur_family) {
+ if (afi->family != cur_family)
+ continue;
+
+ cur_family = 0;
+ }
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ /* Fast-forward to the table we stopped in last time */
+ if (cur_table) {
+ if (cur_table != table)
+ continue;
+
+ cur_table = NULL;
+ }
+
+ ctx->table = table;
+ ctx->afi = afi;
+ idx = 0;
+ list_for_each_entry_rcu(set, &ctx->table->sets, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (nf_tables_fill_set(skb, ctx, set,
+ NFT_MSG_NEWSET,
+ NLM_F_MULTI) < 0) {
+ /* skb is full: remember where to resume */
+ cb->args[0] = idx;
+ cb->args[2] = (unsigned long) table;
+ cb->args[3] = afi->family;
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ /* The resume index only applies to the table we stopped in */
+ if (s_idx)
+ s_idx = 0;
+ }
+ }
+ cb->args[1] = 1;
+done:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+/*
+ * Netlink dump callback for NFT_MSG_GETSET.
+ *
+ * Re-parses the original request and dispatches on how specific it was:
+ * a named table dumps that table, a family without table dumps the whole
+ * family, and neither dumps every family.
+ */
+static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct nlattr *nla[NFTA_SET_MAX + 1];
+	struct nft_ctx ctx;
+	int err;
+
+	err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX,
+			  nft_set_policy);
+	if (err < 0)
+		return err;
+
+	err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla);
+	if (err < 0)
+		return err;
+
+	if (ctx.table != NULL)
+		return nf_tables_dump_sets_table(&ctx, skb, cb);
+	if (ctx.afi != NULL)
+		return nf_tables_dump_sets_family(&ctx, skb, cb);
+
+	return nf_tables_dump_sets_all(&ctx, skb, cb);
+}
+
+/*
+ * Internal set flag: set by nft_trans_set_add() on a set created with a
+ * set ID in a pending transaction; get/delete/dump requests treat a set
+ * carrying it as non-existent (-ENOENT).
+ */
+#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */
+
+/*
+ * NFT_MSG_GETSET handler.
+ *
+ * With NLM_F_DUMP a netlink dump is started (family and table are both
+ * optional there).  Otherwise a single set, identified by family, table
+ * and name, is looked up and unicast back to the requester.
+ *
+ * Returns 0 / the dump or unicast result on success, -EAFNOSUPPORT for a
+ * non-dump request with NFPROTO_UNSPEC, -EINVAL when the table attribute is
+ * missing from a non-dump request, -ENOENT for unknown or inactive sets,
+ * -ENOMEM on allocation failure.
+ */
+static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nft_set *set;
+	struct nft_ctx ctx;
+	struct sk_buff *skb2;
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	int err;
+
+	/* Verify existence before starting dump */
+	err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+	if (err < 0)
+		return err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_sets,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	/* Only accept unspec with dump */
+	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+		return -EAFNOSUPPORT;
+
+	/*
+	 * nft_ctx_init_from_setattr() leaves ctx.table NULL when no table
+	 * attribute was sent; looking up a set in a NULL table would
+	 * dereference it, so a single-set request must name its table.
+	 */
+	if (nla[NFTA_SET_TABLE] == NULL)
+		return -EINVAL;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (set->flags & NFT_SET_INACTIVE)
+		return -ENOENT;
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+	kfree_skb(skb2);
+	return err;
+}
+
+/*
+ * Parse the nested NFTA_SET_DESC attribute into @desc.
+ *
+ * Only NFTA_SET_DESC_SIZE is understood; desc->size is left untouched when
+ * it is absent.  @ctx is not used.  Returns 0 or the nested-parse error.
+ */
+static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
+				    struct nft_set_desc *desc,
+				    const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_SET_DESC_MAX + 1];
+	int ret;
+
+	ret = nla_parse_nested(tb, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
+	if (ret < 0)
+		return ret;
+
+	if (tb[NFTA_SET_DESC_SIZE])
+		desc->size = ntohl(nla_get_be32(tb[NFTA_SET_DESC_SIZE]));
+
+	return 0;
+}
+
+/*
+ * Record a set operation (@msg_type) in the pending transaction.
+ *
+ * For NFT_MSG_NEWSET requests carrying NFTA_SET_ID, the ID is stored in the
+ * transaction entry and the set is flagged NFT_SET_INACTIVE.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
+ */
+static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+			     struct nft_set *set)
+{
+	const struct nlattr *id_attr = ctx->nla[NFTA_SET_ID];
+	struct nft_trans *entry;
+
+	entry = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
+	if (!entry)
+		return -ENOMEM;
+
+	if (msg_type == NFT_MSG_NEWSET && id_attr != NULL) {
+		nft_trans_set_id(entry) = ntohl(nla_get_be32(id_attr));
+		set->flags |= NFT_SET_INACTIVE;
+	}
+	nft_trans_set(entry) = set;
+	list_add_tail(&entry->list, &ctx->net->nft.commit_list);
+
+	return 0;
+}
+
+/*
+ * NFT_MSG_NEWSET handler: validate the request, pick a set implementation
+ * and queue the new set in the pending transaction.
+ *
+ * Required attributes: NFTA_SET_TABLE, NFTA_SET_NAME, NFTA_SET_KEY_LEN and
+ * NFTA_SET_ID.  If a set with the requested name already exists, the
+ * request succeeds without change unless NLM_F_EXCL (-EEXIST) or
+ * NLM_F_REPLACE (-EOPNOTSUPP) is set.
+ *
+ * On any failure after an implementation was selected, the module
+ * reference taken by nft_select_set_ops() is dropped again, and a set
+ * whose ->init() already succeeded is torn down with ->destroy() before
+ * being freed.
+ */
+static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_set_ops *ops;
+	struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
+	struct nft_table *table;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	char name[IFNAMSIZ];
+	unsigned int size;
+	bool create;
+	u32 ktype, dtype, flags, policy;
+	struct nft_set_desc desc;
+	int err;
+
+	if (nla[NFTA_SET_TABLE] == NULL ||
+	    nla[NFTA_SET_NAME] == NULL ||
+	    nla[NFTA_SET_KEY_LEN] == NULL ||
+	    nla[NFTA_SET_ID] == NULL)
+		return -EINVAL;
+
+	memset(&desc, 0, sizeof(desc));
+
+	ktype = NFT_DATA_VALUE;
+	if (nla[NFTA_SET_KEY_TYPE] != NULL) {
+		ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+		if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+			return -EINVAL;
+	}
+
+	desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+	if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+
+	flags = 0;
+	if (nla[NFTA_SET_FLAGS] != NULL) {
+		flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
+			      NFT_SET_INTERVAL | NFT_SET_MAP))
+			return -EINVAL;
+	}
+
+	/* A data type is mandatory for maps and forbidden for plain sets */
+	dtype = 0;
+	if (nla[NFTA_SET_DATA_TYPE] != NULL) {
+		if (!(flags & NFT_SET_MAP))
+			return -EINVAL;
+
+		dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+		if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+		    dtype != NFT_DATA_VERDICT)
+			return -EINVAL;
+
+		if (dtype != NFT_DATA_VERDICT) {
+			if (nla[NFTA_SET_DATA_LEN] == NULL)
+				return -EINVAL;
+			desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
+			if (desc.dlen == 0 ||
+			    desc.dlen > FIELD_SIZEOF(struct nft_data, data))
+				return -EINVAL;
+		} else
+			desc.dlen = sizeof(struct nft_data);
+	} else if (flags & NFT_SET_MAP)
+		return -EINVAL;
+
+	policy = NFT_SET_POL_PERFORMANCE;
+	if (nla[NFTA_SET_POLICY] != NULL)
+		policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+
+	if (nla[NFTA_SET_DESC] != NULL) {
+		/* ctx is not initialised yet; the parser does not read it */
+		err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
+		if (err < 0)
+			return err;
+	}
+
+	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+
+	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set)) {
+		if (PTR_ERR(set) != -ENOENT)
+			return PTR_ERR(set);
+		set = NULL;
+	}
+
+	if (set != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+			return -EOPNOTSUPP;
+		return 0;
+	}
+
+	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+		return -ENOENT;
+
+	/* From here on a module reference is held on ops->owner */
+	ops = nft_select_set_ops(nla, &desc, policy);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	size = 0;
+	if (ops->privsize != NULL)
+		size = ops->privsize(nla);
+
+	err = -ENOMEM;
+	set = kzalloc(sizeof(*set) + size, GFP_KERNEL);
+	if (set == NULL)
+		goto err1;
+
+	/* Bound the copy by the buffer actually written to */
+	nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(name));
+	err = nf_tables_set_alloc_name(&ctx, set, name);
+	if (err < 0)
+		goto err2;
+
+	INIT_LIST_HEAD(&set->bindings);
+	set->ops = ops;
+	set->ktype = ktype;
+	set->klen = desc.klen;
+	set->dtype = dtype;
+	set->dlen = desc.dlen;
+	set->flags = flags;
+	set->size = desc.size;
+
+	err = ops->init(set, &desc, nla);
+	if (err < 0)
+		goto err2;
+
+	/* ->init() succeeded, so failures below must undo it via ->destroy() */
+	err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
+	if (err < 0)
+		goto err3;
+
+	list_add_tail_rcu(&set->list, &table->sets);
+	table->use++;
+	return 0;
+
+err3:
+	ops->destroy(set);
+err2:
+	kfree(set);
+err1:
+	module_put(ops->owner);
+	return err;
+}
+
+/*
+ * Release a set: let the implementation tear down its private state, drop
+ * the module reference held on the implementation and free the set.
+ * The set must already be unlinked from its table by the caller.
+ */
+static void nft_set_destroy(struct nft_set *set)
+{
+ set->ops->destroy(set);
+ module_put(set->ops->owner);
+ kfree(set);
+}
+
+/*
+ * Unlink a set from its table, emit an NFT_MSG_DELSET notification
+ * (allocated with GFP_ATOMIC) and release the set.
+ */
+static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ list_del_rcu(&set->list);
+ nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+ nft_set_destroy(set);
+}
+
+/*
+ * NFT_MSG_DELSET handler: queue the deletion of a named set.
+ *
+ * The set is unlinked from the table right away and the table use counter
+ * is decremented; the deletion itself is recorded in the transaction.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT for NFPROTO_UNSPEC, -EINVAL without a
+ * table attribute, -ENOENT for unknown/inactive sets, -EBUSY if the set is
+ * still bound, or -ENOMEM.
+ */
+static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
+			    const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_ctx ctx;
+	struct nft_set *set;
+	int ret;
+
+	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+		return -EAFNOSUPPORT;
+	if (!nla[NFTA_SET_TABLE])
+		return -EINVAL;
+
+	ret = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+	if (ret < 0)
+		return ret;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (set->flags & NFT_SET_INACTIVE)
+		return -ENOENT;
+	/* A set still referenced by a binding cannot go away */
+	if (!list_empty(&set->bindings))
+		return -EBUSY;
+
+	ret = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set);
+	if (ret < 0)
+		return ret;
+
+	list_del_rcu(&set->list);
+	ctx.table->use--;
+	return 0;
+}
+
+/*
+ * Set walk callback used while binding a map: validate that the data of
+ * @elem may be loaded into the register matching the map's data type.
+ * Returns the result of nft_validate_data_load().
+ */
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
+					const struct nft_set *set,
+					const struct nft_set_iter *iter,
+					const struct nft_set_elem *elem)
+{
+	enum nft_registers dreg = nft_type_to_reg(set->dtype);
+
+	if (set->dtype == NFT_DATA_VERDICT)
+		return nft_validate_data_load(ctx, dreg, &elem->data,
+					      NFT_DATA_VERDICT);
+
+	return nft_validate_data_load(ctx, dreg, &elem->data, NFT_DATA_VALUE);
+}
+
+/*
+ * Bind @set to the chain of @ctx through @binding.
+ *
+ * Anonymous sets can only be bound once (-EBUSY otherwise).  For maps, all
+ * elements are validated against the chain with the set's ->walk() unless
+ * an existing binding already covers the same chain.  If validation fails
+ * the error is returned, and an anonymous set is destroyed on the spot.
+ *
+ * Returns 0 on success.
+ */
+int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ struct nft_set_binding *i;
+ struct nft_set_iter iter;
+
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+ return -EBUSY;
+
+ if (set->flags & NFT_SET_MAP) {
+ /* If the set is already bound to the same chain all
+ * jumps are already validated for that chain.
+ */
+ /*
+ * NOTE(review): binding->chain is only assigned at the "bind"
+ * label below, so this compares against whatever the caller left
+ * in it - confirm whether ctx->chain was intended here.
+ */
+ list_for_each_entry(i, &set->bindings, list) {
+ if (i->chain == binding->chain)
+ goto bind;
+ }
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_bind_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0) {
+ /* Destroy anonymous sets if binding fails */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ nf_tables_set_destroy(ctx, set);
+
+ return iter.err;
+ }
+ }
+bind:
+ binding->chain = ctx->chain;
+ list_add_tail_rcu(&binding->list, &set->bindings);
+ return 0;
+}
+
+/*
+ * Remove @binding from @set.  An anonymous set that loses its last binding
+ * is destroyed, unless it is still flagged NFT_SET_INACTIVE.
+ */
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+			  struct nft_set_binding *binding)
+{
+	list_del_rcu(&binding->list);
+
+	if (!list_empty(&set->bindings))
+		return;
+	if (!(set->flags & NFT_SET_ANONYMOUS))
+		return;
+	if (set->flags & NFT_SET_INACTIVE)
+		return;
+
+	nf_tables_set_destroy(ctx, set);
+}
+
+/*
+ * Set elements
+ */
+
+/* Netlink attribute policy for a single set element (NFTA_SET_ELEM_*). */
+static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
+ [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
+};
+
+/*
+ * Netlink attribute policy for set element list messages
+ * (NEWSETELEM/GETSETELEM/DELSETELEM).
+ */
+static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
+ [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
+};
+
+/*
+ * Build an nft_ctx from the attributes of a set element request.
+ *
+ * Family and table are both mandatory here.  When @trans is false an
+ * inactive table is reported as -ENOENT; with @trans true it is accepted.
+ *
+ * Returns 0 on success or a negative lookup error.
+ */
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+				      const struct sk_buff *skb,
+				      const struct nlmsghdr *nlh,
+				      const struct nlattr * const nla[],
+				      bool trans)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct net *net = sock_net(skb->sk);
+	struct nft_af_info *afi;
+	struct nft_table *tbl;
+
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	tbl = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
+	if (IS_ERR(tbl))
+		return PTR_ERR(tbl);
+
+	if ((tbl->flags & NFT_TABLE_INACTIVE) && !trans)
+		return -ENOENT;
+
+	nft_ctx_init(ctx, skb, nlh, afi, tbl, NULL, nla);
+	return 0;
+}
+
+/*
+ * Append one set element to @skb as a nested NFTA_LIST_ELEM attribute.
+ *
+ * The key is always dumped; the data only for maps and only when the
+ * element is not an interval end; the flags only when non-zero.
+ *
+ * Returns 0 on success.  On lack of room everything added by this call is
+ * trimmed from the skb again and -EMSGSIZE is returned.
+ */
+static int nf_tables_fill_setelem(struct sk_buff *skb,
+				  const struct nft_set *set,
+				  const struct nft_set_elem *elem)
+{
+	unsigned char *start = skb_tail_pointer(skb);
+	struct nlattr *nest;
+	bool has_data;
+	int rc;
+
+	nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE,
+			  set->klen) < 0)
+		goto nla_put_failure;
+
+	has_data = (set->flags & NFT_SET_MAP) &&
+		   !(elem->flags & NFT_SET_ELEM_INTERVAL_END);
+	if (has_data) {
+		if (set->dtype == NFT_DATA_VERDICT)
+			rc = nft_data_dump(skb, NFTA_SET_ELEM_DATA,
+					   &elem->data, NFT_DATA_VERDICT,
+					   set->dlen);
+		else
+			rc = nft_data_dump(skb, NFTA_SET_ELEM_DATA,
+					   &elem->data, NFT_DATA_VALUE,
+					   set->dlen);
+		if (rc < 0)
+			goto nla_put_failure;
+	}
+
+	if (elem->flags != 0 &&
+	    nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags)))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, start);
+	return -EMSGSIZE;
+}
+
+/*
+ * State handed to the set walk during an element dump.  The iterator is
+ * embedded so the walk callback can recover this structure from the
+ * iterator pointer with container_of().
+ */
+struct nft_set_dump_args {
+ const struct netlink_callback *cb;
+ struct nft_set_iter iter;
+ struct sk_buff *skb;
+};
+
+/*
+ * Set walk callback for element dumps: append @elem to the skb stored in
+ * the enclosing nft_set_dump_args.  Returns the result of
+ * nf_tables_fill_setelem().
+ */
+static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ struct nft_set_dump_args *args;
+
+ args = container_of(iter, struct nft_set_dump_args, iter);
+ return nf_tables_fill_setelem(args->skb, set, elem);
+}
+
+/*
+ * Netlink dump callback for NFT_MSG_GETSETELEM.
+ *
+ * Re-parses the original request, looks the set up again and emits one
+ * NFT_MSG_NEWSETELEM message holding as many elements as the set walk
+ * manages to add.  cb->args[0] carries the number of elements handled so
+ * far and is passed to the walk as the skip count.
+ *
+ * Returns skb->len while progress is being made, 0 when the walk visited
+ * no new element, a negative error from parsing/lookup/walk, or -ENOSPC if
+ * even the message header does not fit.
+ */
+static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ struct nft_set_dump_args args;
+ struct nft_ctx ctx;
+ struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ u32 portid, seq;
+ int event, err;
+
+ err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla,
+ NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy);
+ if (err < 0)
+ return err;
+
+ err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
+ false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ event = NFT_MSG_NEWSETELEM;
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ NLM_F_MULTI);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx.afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ /* Let the set implementation feed elements to nf_tables_dump_setelem() */
+ args.cb = cb;
+ args.skb = skb;
+ args.iter.skip = cb->args[0];
+ args.iter.count = 0;
+ args.iter.err = 0;
+ args.iter.fn = nf_tables_dump_setelem;
+ set->ops->walk(&ctx, set, &args.iter);
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+
+ /* -EMSGSIZE only means the skb is full; the dump continues later */
+ if (args.iter.err && args.iter.err != -EMSGSIZE)
+ return args.iter.err;
+ /* No element beyond the ones already dumped: we are done */
+ if (args.iter.count == cb->args[0])
+ return 0;
+
+ cb->args[0] = args.iter.count;
+ return skb->len;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+/*
+ * NFT_MSG_GETSETELEM handler.  Only dump requests are supported: after
+ * checking that the set exists and is active, a netlink dump is started.
+ * Non-dump requests get -EOPNOTSUPP.
+ */
+static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				const struct nlattr * const nla[])
+{
+	struct netlink_dump_control c = {
+		.dump = nf_tables_dump_set,
+	};
+	const struct nft_set *set;
+	struct nft_ctx ctx;
+	int ret;
+
+	ret = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+	if (ret < 0)
+		return ret;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (set->flags & NFT_SET_INACTIVE)
+		return -ENOENT;
+
+	if (!(nlh->nlmsg_flags & NLM_F_DUMP))
+		return -EOPNOTSUPP;
+
+	return netlink_dump_start(nlsk, skb, nlh, &c);
+}
+
+/*
+ * Build a complete set element notification in @skb: nfnetlink header,
+ * table name, set name and a nested element list holding @elem.
+ *
+ * Returns the result of nlmsg_end() on success; on lack of room the
+ * partial message is trimmed from the skb and -1 is returned.
+ */
+static int nf_tables_fill_setelem_info(struct sk_buff *skb,
+				       const struct nft_ctx *ctx, u32 seq,
+				       u32 portid, int event, u16 flags,
+				       const struct nft_set *set,
+				       const struct nft_set_elem *elem)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *hdr;
+	struct nlattr *list;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+			flags);
+	if (!nlh)
+		goto nla_put_failure;
+
+	hdr = nlmsg_data(nlh);
+	hdr->nfgen_family = ctx->afi->family;
+	hdr->version = NFNETLINK_V0;
+	hdr->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name) ||
+	    nla_put_string(skb, NFTA_SET_NAME, set->name))
+		goto nla_put_failure;
+
+	list = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+	if (!list)
+		goto nla_put_failure;
+
+	if (nf_tables_fill_setelem(skb, set, elem) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, list);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+/*
+ * Send a set element event (@event) to NFNLGRP_NFTABLES listeners.
+ *
+ * Nothing is sent when the requester did not ask for a report and nobody
+ * listens on the group.  Any failure is also propagated to listeners with
+ * nfnetlink_set_err().  Allocations use GFP_KERNEL.  Returns 0 or a
+ * negative error.
+ */
+static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
+				    const struct nft_set *set,
+				    const struct nft_set_elem *elem,
+				    int event, u16 flags)
+{
+	struct net *net = ctx->net;
+	u32 portid = ctx->portid;
+	struct sk_buff *skb;
+	int ret;
+
+	if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+		return 0;
+
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOBUFS;
+		goto out;
+	}
+
+	ret = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
+					  set, elem);
+	if (ret < 0) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	ret = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
+			     GFP_KERNEL);
+out:
+	if (ret < 0)
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, ret);
+	return ret;
+}
+
+/*
+ * Allocate a transaction entry for a set element operation and record the
+ * set it applies to.  The entry is not yet linked into the commit list.
+ * Returns NULL if the allocation fails.
+ */
+static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+					      int msg_type,
+					      struct nft_set *set)
+{
+	struct nft_trans *entry;
+
+	entry = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+	if (entry != NULL)
+		nft_trans_elem_set(entry) = set;
+
+	return entry;
+}
+
+/*
+ * Parse one element attribute and add the element to @set as part of the
+ * pending transaction.
+ *
+ * Checks performed, in order: set capacity, attribute syntax, presence of
+ * the key, element flags, presence/absence of data depending on whether the
+ * set is a map, key type/length, uniqueness of the key, and for maps the
+ * data length plus validation of the data against every chain the set is
+ * bound to.
+ *
+ * Returns 0 on success, -ENFILE when the set is full, -EINVAL for malformed
+ * elements, -EEXIST for duplicate keys, -ENOMEM when the transaction entry
+ * cannot be allocated, or an error from the helpers / the set's ->insert().
+ * Key and data are released again on every failure path.
+ */
+static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
+			    const struct nlattr *attr)
+{
+	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+	struct nft_data_desc d1, d2;
+	struct nft_set_elem elem;
+	struct nft_set_binding *binding;
+	enum nft_registers dreg;
+	struct nft_trans *trans;
+	int err;
+
+	if (set->size && set->nelems == set->size)
+		return -ENFILE;
+
+	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+			       nft_set_elem_policy);
+	if (err < 0)
+		return err;
+
+	if (nla[NFTA_SET_ELEM_KEY] == NULL)
+		return -EINVAL;
+
+	elem.flags = 0;
+	if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
+		elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
+		if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END)
+			return -EINVAL;
+	}
+
+	/*
+	 * Maps need data on every element except interval ends, which must
+	 * not carry any; plain sets never carry data.
+	 */
+	if (set->flags & NFT_SET_MAP) {
+		if (nla[NFTA_SET_ELEM_DATA] == NULL &&
+		    !(elem.flags & NFT_SET_ELEM_INTERVAL_END))
+			return -EINVAL;
+		if (nla[NFTA_SET_ELEM_DATA] != NULL &&
+		    elem.flags & NFT_SET_ELEM_INTERVAL_END)
+			return -EINVAL;
+	} else {
+		if (nla[NFTA_SET_ELEM_DATA] != NULL)
+			return -EINVAL;
+	}
+
+	err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]);
+	if (err < 0)
+		goto err1;
+	err = -EINVAL;
+	if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
+		goto err2;
+
+	err = -EEXIST;
+	if (set->ops->get(set, &elem) == 0)
+		goto err2;
+
+	if (nla[NFTA_SET_ELEM_DATA] != NULL) {
+		err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]);
+		if (err < 0)
+			goto err2;
+
+		err = -EINVAL;
+		if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
+			goto err3;
+
+		/* The data must be loadable in every chain using this set */
+		dreg = nft_type_to_reg(set->dtype);
+		list_for_each_entry(binding, &set->bindings, list) {
+			struct nft_ctx bind_ctx = {
+				.afi	= ctx->afi,
+				.table	= ctx->table,
+				.chain	= (struct nft_chain *)binding->chain,
+			};
+
+			err = nft_validate_data_load(&bind_ctx, dreg,
+						     &elem.data, d2.type);
+			if (err < 0)
+				goto err3;
+		}
+	}
+
+	/*
+	 * err still holds a stale value here (0, -EINVAL or -EEXIST depending
+	 * on the path taken above), so set the real reason before bailing
+	 * out; otherwise an allocation failure could be reported as success.
+	 */
+	err = -ENOMEM;
+	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
+	if (trans == NULL)
+		goto err3;
+
+	err = set->ops->insert(set, &elem);
+	if (err < 0)
+		goto err4;
+
+	nft_trans_elem(trans) = elem;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+	return 0;
+
+err4:
+	kfree(trans);
+err3:
+	if (nla[NFTA_SET_ELEM_DATA] != NULL)
+		nft_data_uninit(&elem.data, d2.type);
+err2:
+	nft_data_uninit(&elem.key, d1.type);
+err1:
+	return err;
+}
+
+/*
+ * NFT_MSG_NEWSETELEM handler: add every element of the nested
+ * NFTA_SET_ELEM_LIST_ELEMENTS attribute to a set.
+ *
+ * The set is looked up by name first; if that fails and a set ID was
+ * supplied, it is looked up among the sets created earlier in the pending
+ * transaction.  Constant sets that are already bound cannot be modified
+ * (-EBUSY).  Processing stops at the first element that fails; set->nelems
+ * is incremented for each element added.
+ *
+ * Returns 0 on success, -EINVAL if the element list attribute is missing,
+ * or a negative error from the lookups / element insertion.
+ */
+static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				const struct nlattr * const nla[])
+{
+	struct net *net = sock_net(skb->sk);
+	const struct nlattr *attr;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	int rem, err = 0;
+
+	/* nla_for_each_nested() below dereferences the attribute */
+	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
+		return -EINVAL;
+
+	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set)) {
+		/* Fall back to a set created earlier in this transaction */
+		if (nla[NFTA_SET_ELEM_LIST_SET_ID]) {
+			set = nf_tables_set_lookup_byid(net,
+					nla[NFTA_SET_ELEM_LIST_SET_ID]);
+		}
+		if (IS_ERR(set))
+			return PTR_ERR(set);
+	}
+
+	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+		return -EBUSY;
+
+	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+		err = nft_add_set_elem(&ctx, set, attr);
+		if (err < 0)
+			break;
+
+		set->nelems++;
+	}
+	return err;
+}
+
+/*
+ * Parse one element attribute and queue the deletion of the matching
+ * element of @set in the pending transaction.
+ *
+ * Returns the (non-negative) result of the set's ->get() on success,
+ * -EINVAL for a missing or malformed key, -ENOMEM when the transaction
+ * entry cannot be allocated, or an error from the helpers / ->get().
+ */
+static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
+			   const struct nlattr *attr)
+{
+	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+	struct nft_data_desc desc;
+	struct nft_set_elem elem;
+	struct nft_trans *trans;
+	int err;
+
+	err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+			       nft_set_elem_policy);
+	if (err < 0)
+		goto err1;
+
+	err = -EINVAL;
+	if (nla[NFTA_SET_ELEM_KEY] == NULL)
+		goto err1;
+
+	err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]);
+	if (err < 0)
+		goto err1;
+
+	err = -EINVAL;
+	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
+		goto err2;
+
+	err = set->ops->get(set, &elem);
+	if (err < 0)
+		goto err2;
+
+	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
+	if (trans == NULL) {
+		/*
+		 * err holds the successful ->get() result at this point;
+		 * report the allocation failure instead of success.
+		 */
+		err = -ENOMEM;
+		goto err2;
+	}
+
+	nft_trans_elem(trans) = elem;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+	nft_data_uninit(&elem.key, NFT_DATA_VALUE);
+	if (set->flags & NFT_SET_MAP)
+		nft_data_uninit(&elem.data, set->dtype);
+
+	/* Key already released above: do not fall into the error path */
+	return err;
+
+err2:
+	nft_data_uninit(&elem.key, desc.type);
+err1:
+	return err;
+}
+
+/*
+ * NFT_MSG_DELSETELEM handler: remove every element of the nested
+ * NFTA_SET_ELEM_LIST_ELEMENTS attribute from a set.
+ *
+ * Constant sets that are already bound cannot be modified (-EBUSY).
+ * Processing stops at the first element that fails; set->nelems is
+ * decremented for each element queued for removal.
+ *
+ * Returns 0 on success, -EINVAL if the element list attribute is missing,
+ * or a negative error from the lookups / element removal.
+ */
+static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				const struct nlattr * const nla[])
+{
+	const struct nlattr *attr;
+	struct nft_set *set;
+	struct nft_ctx ctx;
+	int rem, err = 0;
+
+	/* nla_for_each_nested() below dereferences the attribute */
+	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
+		return -EINVAL;
+
+	err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+	if (err < 0)
+		return err;
+
+	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+		return -EBUSY;
+
+	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+		err = nft_del_setelem(&ctx, set, attr);
+		if (err < 0)
+			break;
+
+		set->nelems--;
+	}
+	return err;
+}
+
+/*
+ * nfnetlink message dispatch table, indexed by NFT_MSG_* type.
+ *
+ * GET messages are wired to .call, while NEW/DEL messages are wired to
+ * .call_batch.  Each entry also names the attribute policy and the highest
+ * attribute number used to parse the message before the handler runs.
+ */
+static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
+ [NFT_MSG_NEWTABLE] = {
+ .call_batch = nf_tables_newtable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_GETTABLE] = {
+ .call = nf_tables_gettable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_DELTABLE] = {
+ .call_batch = nf_tables_deltable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_NEWCHAIN] = {
+ .call_batch = nf_tables_newchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_GETCHAIN] = {
+ .call = nf_tables_getchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_DELCHAIN] = {
+ .call_batch = nf_tables_delchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_NEWRULE] = {
+ .call_batch = nf_tables_newrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_GETRULE] = {
+ .call = nf_tables_getrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_DELRULE] = {
+ .call_batch = nf_tables_delrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_NEWSET] = {
+ .call_batch = nf_tables_newset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_GETSET] = {
+ .call = nf_tables_getset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_DELSET] = {
+ .call_batch = nf_tables_delset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_NEWSETELEM] = {
+ .call_batch = nf_tables_newsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_GETSETELEM] = {
+ .call = nf_tables_getsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_DELSETELEM] = {
+ .call_batch = nf_tables_delsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+};
+
+/*
+ * Apply a pending chain update to trans->ctx.chain: copy the new name when
+ * one was supplied and, for base chains only, replace the stats and take over
+ * the policy when it is NF_DROP or NF_ACCEPT.
+ */
+static void nft_chain_commit_update(struct nft_trans *trans)
+{
+ struct nft_base_chain *basechain;
+
+ /* An empty name in the transaction means "keep the current name". */
+ if (nft_trans_chain_name(trans)[0])
+ strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+
+ /* Stats and policy only exist on base chains. */
+ if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN))
+ return;
+
+ basechain = nft_base_chain(trans->ctx.chain);
+ nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+
+ /* Any other policy value leaves the current policy untouched. */
+ switch (nft_trans_chain_policy(trans)) {
+ case NF_DROP:
+ case NF_ACCEPT:
+ basechain->policy = nft_trans_chain_policy(trans);
+ break;
+ }
+}
+
+/* Schedule objects for release via rcu to make sure no packets are accessing
+ * removed rules.
+ *
+ * RCU callback for committed transactions: destroys the object referenced by
+ * a DELTABLE, DELCHAIN, DELRULE or DELSET transaction and then frees the
+ * transaction itself.
+ */
+static void nf_tables_commit_release_rcu(struct rcu_head *rt)
+{
+ struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+ switch (trans->msg_type) {
+ case NFT_MSG_DELTABLE:
+ nf_tables_table_destroy(&trans->ctx);
+ break;
+ case NFT_MSG_DELCHAIN:
+ nf_tables_chain_destroy(trans->ctx.chain);
+ break;
+ case NFT_MSG_DELRULE:
+ nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ break;
+ case NFT_MSG_DELSET:
+ nft_set_destroy(nft_trans_set(trans));
+ break;
+ }
+ /* Other message types carry nothing to destroy; only the trans is freed. */
+ kfree(trans);
+}
+
+/*
+ * Commit callback of the nf_tables nfnetlink subsystem: make the transactions
+ * queued on net->nft.commit_list effective and send the matching
+ * notifications.
+ *
+ * Returns 0 on every path visible here.
+ */
+static int nf_tables_commit(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nft_trans *trans, *next;
+ struct nft_set *set;
+
+ /* Bump generation counter, invalidate any dump in progress */
+ /* The empty loop body skips the value 0 when the counter wraps. */
+ while (++net->nft.base_seq == 0);
+
+ /* A new generation has just started */
+ net->nft.gencursor = gencursor_next(net);
+
+ /* Make sure all packets have left the previous generation before
+ * purging old rules.
+ */
+ synchronize_rcu();
+
+ /*
+ * First pass: apply each transaction. The NEW* and DELSETELEM cases
+ * finish with nft_trans_destroy(); the remaining DEL* cases leave the
+ * transaction queued for the second pass below.
+ */
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ if (nft_trans_table_update(trans)) {
+ if (!nft_trans_table_enable(trans)) {
+ nf_tables_table_disable(trans->ctx.afi,
+ trans->ctx.table);
+ trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ }
+ } else {
+ trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+ }
+ nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELTABLE:
+ nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain_update(trans))
+ nft_chain_commit_update(trans);
+ else
+ trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELCHAIN:
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
+ /* Hooks are only unregistered for base chains of non-dormant tables. */
+ if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+ trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+ nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+ trans->ctx.afi->nops);
+ }
+ break;
+ case NFT_MSG_NEWRULE:
+ nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nf_tables_rule_notify(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_MSG_NEWRULE);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELRULE:
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ nf_tables_rule_notify(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_MSG_DELRULE);
+ break;
+ case NFT_MSG_NEWSET:
+ nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+ /* This avoids hitting -EBUSY when deleting the table
+ * from the transaction.
+ */
+ if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS &&
+ !list_empty(&nft_trans_set(trans)->bindings))
+ trans->ctx.table->use--;
+
+ nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ NFT_MSG_NEWSET, GFP_KERNEL);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSET:
+ nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ NFT_MSG_DELSET, GFP_KERNEL);
+ break;
+ case NFT_MSG_NEWSETELEM:
+ nf_tables_setelem_notify(&trans->ctx,
+ nft_trans_elem_set(trans),
+ &nft_trans_elem(trans),
+ NFT_MSG_NEWSETELEM, 0);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSETELEM:
+ nf_tables_setelem_notify(&trans->ctx,
+ nft_trans_elem_set(trans),
+ &nft_trans_elem(trans),
+ NFT_MSG_DELSETELEM, 0);
+ set = nft_trans_elem_set(trans);
+ /* NOTE(review): the result of get() is not checked before remove(). */
+ set->ops->get(set, &nft_trans_elem(trans));
+ set->ops->remove(set, &nft_trans_elem(trans));
+ nft_trans_destroy(trans);
+ break;
+ }
+ }
+
+ /*
+ * Second pass: unlink what is still queued and release it from an RCU
+ * callback.
+ * NOTE(review): assumes nft_trans_destroy() already unlinked the
+ * transactions handled completely in the first pass - confirm.
+ */
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_del(&trans->list);
+ trans->ctx.nla = NULL;
+ call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu);
+ }
+
+ return 0;
+}
+
+/* Schedule objects for release via rcu to make sure no packets are accessing
+ * aborted rules.
+ *
+ * RCU callback for aborted transactions: destroys the object created by a
+ * NEWTABLE, NEWCHAIN, NEWRULE or NEWSET transaction and then frees the
+ * transaction itself.
+ */
+static void nf_tables_abort_release_rcu(struct rcu_head *rt)
+{
+ struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ nf_tables_table_destroy(&trans->ctx);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ nf_tables_chain_destroy(trans->ctx.chain);
+ break;
+ case NFT_MSG_NEWRULE:
+ nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ break;
+ case NFT_MSG_NEWSET:
+ nft_set_destroy(nft_trans_set(trans));
+ break;
+ }
+ /* Other message types carry nothing to destroy; only the trans is freed. */
+ kfree(trans);
+}
+
+/*
+ * Abort callback of the nf_tables nfnetlink subsystem: undo the transactions
+ * queued on net->nft.commit_list.
+ *
+ * Returns 0 on every path visible here.
+ */
+static int nf_tables_abort(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nft_trans *trans, *next;
+ struct nft_set *set;
+
+ /*
+ * First pass: revert each transaction. Updates and DEL* requests are
+ * finished here with nft_trans_destroy(); newly created tables, chains,
+ * rules and sets are only unlinked and stay queued for the second pass.
+ */
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ if (nft_trans_table_update(trans)) {
+ /* Undo an "enable" request by disabling the table again. */
+ if (nft_trans_table_enable(trans)) {
+ nf_tables_table_disable(trans->ctx.afi,
+ trans->ctx.table);
+ trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ }
+ nft_trans_destroy(trans);
+ } else {
+ list_del_rcu(&trans->ctx.table->list);
+ }
+ break;
+ case NFT_MSG_DELTABLE:
+ list_add_tail_rcu(&trans->ctx.table->list,
+ &trans->ctx.afi->tables);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ if (nft_trans_chain_stats(trans))
+ free_percpu(nft_trans_chain_stats(trans));
+
+ nft_trans_destroy(trans);
+ } else {
+ trans->ctx.table->use--;
+ list_del_rcu(&trans->ctx.chain->list);
+ if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) &&
+ trans->ctx.chain->flags & NFT_BASE_CHAIN) {
+ nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops,
+ trans->ctx.afi->nops);
+ }
+ }
+ break;
+ case NFT_MSG_DELCHAIN:
+ trans->ctx.table->use++;
+ list_add_tail_rcu(&trans->ctx.chain->list,
+ &trans->ctx.table->chains);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWRULE:
+ trans->ctx.chain->use--;
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ break;
+ case NFT_MSG_DELRULE:
+ trans->ctx.chain->use++;
+ nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWSET:
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_set(trans)->list);
+ break;
+ case NFT_MSG_DELSET:
+ trans->ctx.table->use++;
+ list_add_tail_rcu(&nft_trans_set(trans)->list,
+ &trans->ctx.table->sets);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWSETELEM:
+ nft_trans_elem_set(trans)->nelems--;
+ set = nft_trans_elem_set(trans);
+ /* NOTE(review): the result of get() is not checked before remove(). */
+ set->ops->get(set, &nft_trans_elem(trans));
+ set->ops->remove(set, &nft_trans_elem(trans));
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSETELEM:
+ nft_trans_elem_set(trans)->nelems++;
+ nft_trans_destroy(trans);
+ break;
+ }
+ }
+
+ /*
+ * Second pass, walking the list in reverse: unlink what is still queued
+ * and release it from an RCU callback.
+ */
+ list_for_each_entry_safe_reverse(trans, next,
+ &net->nft.commit_list, list) {
+ list_del(&trans->list);
+ trans->ctx.nla = NULL;
+ call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu);
+ }
+
+ return 0;
+}
+
+/*
+ * nfnetlink subsystem descriptor for nf_tables: ties the message dispatch
+ * table to the commit and abort callbacks.
+ */
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+ .name = "nf_tables",
+ .subsys_id = NFNL_SUBSYS_NFTABLES,
+ .cb_count = NFT_MSG_MAX,
+ .cb = nf_tables_cb,
+ .commit = nf_tables_commit,
+ .abort = nf_tables_abort,
+};
+
+/*
+ * Loop detection - walk through the ruleset beginning at the destination chain
+ * of a new jump until either the source chain is reached (loop) or all
+ * reachable chains have been traversed.
+ *
+ * The loop check is performed whenever a new jump verdict is added to an
+ * expression or verdict map or a verdict map is bound to a new chain.
+ */
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+ const struct nft_chain *chain);
+
+/*
+ * Set iterator callback used by the loop check: follow the chain referenced
+ * by a jump/goto verdict stored in a set element. Interval end markers and
+ * all other verdicts are accepted without further checks.
+ */
+static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+					const struct nft_set *set,
+					const struct nft_set_iter *iter,
+					const struct nft_set_elem *elem)
+{
+	if (elem->flags & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	/* Only verdicts that transfer control to another chain matter. */
+	if (elem->data.verdict != NFT_JUMP && elem->data.verdict != NFT_GOTO)
+		return 0;
+
+	return nf_tables_check_loops(ctx, elem->data.chain);
+}
+
+/*
+ * Recursively walk @chain looking for a path that leads back to ctx->chain.
+ *
+ * Returns -ELOOP when @chain is ctx->chain itself, a negative error reported
+ * by an expression's validate() callback, by a nested check or by the set
+ * walk, and 0 otherwise.
+ */
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+ const struct nft_chain *chain)
+{
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ const struct nft_set *set;
+ struct nft_set_binding *binding;
+ struct nft_set_iter iter;
+
+ if (ctx->chain == chain)
+ return -ELOOP;
+
+ /* Follow jump/goto verdicts reported by the expressions of each rule. */
+ list_for_each_entry(rule, &chain->rules, list) {
+ nft_rule_for_each_expr(expr, last, rule) {
+ const struct nft_data *data = NULL;
+ int err;
+
+ if (!expr->ops->validate)
+ continue;
+
+ err = expr->ops->validate(ctx, expr, &data);
+ if (err < 0)
+ return err;
+
+ /* validate() left data unset: nothing to follow. */
+ if (data == NULL)
+ continue;
+
+ switch (data->verdict) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nf_tables_check_loops(ctx, data->chain);
+ if (err < 0)
+ return err;
+ /* fall through */
+ default:
+ break;
+ }
+ }
+ }
+
+ /* Verdict maps bound to this chain are walked element by element. */
+ list_for_each_entry(set, &ctx->table->sets, list) {
+ if (!(set->flags & NFT_SET_MAP) ||
+ set->dtype != NFT_DATA_VERDICT)
+ continue;
+
+ list_for_each_entry(binding, &set->bindings, list) {
+ if (binding->chain != chain)
+ continue;
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_loop_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0)
+ return iter.err;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * nft_validate_input_register - validate an expression's input register
+ *
+ * @reg: the register number
+ *
+ * An input register has to be one of the general purpose registers, i.e. it
+ * must lie above NFT_REG_VERDICT and must not exceed NFT_REG_MAX.
+ *
+ * Returns 0 for a valid register, -EINVAL for the verdict register (or
+ * anything below it) and -ERANGE for a number beyond NFT_REG_MAX.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+	int err = 0;
+
+	if (reg <= NFT_REG_VERDICT)
+		err = -EINVAL;
+	else if (reg > NFT_REG_MAX)
+		err = -ERANGE;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ * nft_validate_output_register - validate an expression's output register
+ *
+ * @reg: the register number
+ *
+ * An output register may be the verdict register or any general purpose
+ * register, i.e. it must lie in the range NFT_REG_VERDICT..NFT_REG_MAX.
+ *
+ * Returns 0 for a valid register, -EINVAL for a number below
+ * NFT_REG_VERDICT and -ERANGE for a number beyond NFT_REG_MAX.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+	int err = 0;
+
+	if (reg < NFT_REG_VERDICT)
+		err = -EINVAL;
+	else if (reg > NFT_REG_MAX)
+		err = -ERANGE;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+/**
+ * nft_validate_data_load - validate an expression's data load
+ *
+ * @ctx: context of the expression performing the load
+ * @reg: the destination register number
+ * @data: the data to load
+ * @type: the data type
+ *
+ * Validate that a data load uses the appropriate data type for
+ * the destination register. A value of NULL for the data means
+ * that it is runtime gathered data, which is always of type
+ * NFT_DATA_VALUE.
+ *
+ * Returns 0 on success, -EINVAL on a type mismatch, the error of the loop
+ * check for jump/goto verdicts, or -EMLINK when the jump would reach
+ * NFT_JUMP_STACK_SIZE levels.
+ */
+int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type)
+{
+ int err;
+
+ switch (reg) {
+ case NFT_REG_VERDICT:
+ if (data == NULL || type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) {
+ err = nf_tables_check_loops(ctx, data->chain);
+ if (err < 0)
+ return err;
+
+ /* Raise the target chain's level to one below the source chain. */
+ if (ctx->chain->level + 1 > data->chain->level) {
+ if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
+ return -EMLINK;
+ data->chain->level = ctx->chain->level + 1;
+ }
+ }
+
+ return 0;
+ default:
+ /* General purpose registers only take plain values. */
+ if (data != NULL && type != NFT_DATA_VALUE)
+ return -EINVAL;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_validate_data_load);
+
+/* Netlink policy for the attributes nested inside NFTA_DATA_VERDICT. */
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+ [NFTA_VERDICT_CODE] = { .type = NLA_U32 },
+ [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+};
+
+/*
+ * nft_verdict_init - parse a nested verdict attribute into a struct nft_data
+ *
+ * @ctx: context of the expression using the data
+ * @data: destination struct nft_data
+ * @desc: data description, filled in on success
+ * @nla: nested NFTA_VERDICT_* attribute
+ *
+ * For jump/goto verdicts the target chain is looked up in ctx->table and its
+ * use counter is incremented; nft_verdict_uninit() drops that reference.
+ *
+ * Returns 0 on success, the nla_parse_nested() or chain lookup error,
+ * -EINVAL for a missing or unknown verdict code or a missing chain name, and
+ * -EOPNOTSUPP when the target is a base chain.
+ */
+static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
+			    struct nft_data_desc *desc, const struct nlattr *nla)
+{
+	struct nlattr *tb[NFTA_VERDICT_MAX + 1];
+	struct nft_chain *chain;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_VERDICT_CODE])
+		return -EINVAL;
+	data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
+
+	switch (data->verdict) {
+	default:
+		/* Anything else must be a netfilter verdict we accept. */
+		switch (data->verdict & NF_VERDICT_MASK) {
+		case NF_ACCEPT:
+		case NF_DROP:
+		case NF_QUEUE:
+			break;
+		default:
+			return -EINVAL;
+		}
+		/* fall through */
+	case NFT_CONTINUE:
+	case NFT_BREAK:
+	case NFT_RETURN:
+		desc->len = sizeof(data->verdict);
+		break;
+	case NFT_JUMP:
+	case NFT_GOTO:
+		if (!tb[NFTA_VERDICT_CHAIN])
+			return -EINVAL;
+		chain = nf_tables_chain_lookup(ctx->table,
+					       tb[NFTA_VERDICT_CHAIN]);
+		if (IS_ERR(chain))
+			return PTR_ERR(chain);
+		if (chain->flags & NFT_BASE_CHAIN)
+			return -EOPNOTSUPP;
+
+		chain->use++;
+		data->chain = chain;
+		/*
+		 * The verdict code and the chain pointer are both in use, so
+		 * report the size of the data item rather than the size of
+		 * the pointer to it.
+		 */
+		desc->len = sizeof(*data);
+		break;
+	}
+
+	desc->type = NFT_DATA_VERDICT;
+	return 0;
+}
+
+/*
+ * Release verdict data: jump and goto verdicts hold a use count on their
+ * target chain which is dropped here; other verdicts need no cleanup.
+ */
+static void nft_verdict_uninit(const struct nft_data *data)
+{
+	if (data->verdict == NFT_JUMP || data->verdict == NFT_GOTO)
+		data->chain->use--;
+}
+
+/*
+ * Dump a verdict as a nested NFTA_DATA_VERDICT attribute: the verdict code
+ * and, for jump/goto, the name of the target chain.
+ *
+ * Returns 0 on success and -1 when an attribute could not be added to @skb.
+ */
+static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict)))
+ goto nla_put_failure;
+
+ /* Only jump and goto verdicts reference a chain. */
+ switch (data->verdict) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+/*
+ * Copy a plain value attribute into @data and describe it in @desc.
+ *
+ * Returns 0 on success, -EINVAL for an empty attribute and -EOVERFLOW when
+ * the payload is larger than data->data.
+ */
+static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ unsigned int len;
+
+ len = nla_len(nla);
+ if (len == 0)
+ return -EINVAL;
+ if (len > sizeof(data->data))
+ return -EOVERFLOW;
+
+ /*
+ * NOTE(review): presumably nla_memcpy() copies no more than the
+ * attribute payload, which would leave the bytes of data->data past
+ * len unchanged - confirm.
+ */
+ nla_memcpy(data->data, nla, sizeof(data->data));
+ desc->type = NFT_DATA_VALUE;
+ desc->len = len;
+ return 0;
+}
+
+/* Dump @len bytes of a plain value as an NFTA_DATA_VALUE attribute. */
+static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
+ unsigned int len)
+{
+ return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
+}
+
+/*
+ * Netlink policy for data attributes: a binary value no larger than the data
+ * member of struct nft_data, or a nested verdict.
+ */
+static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
+ [NFTA_DATA_VALUE] = { .type = NLA_BINARY,
+ .len = FIELD_SIZEOF(struct nft_data, data) },
+ [NFTA_DATA_VERDICT] = { .type = NLA_NESTED },
+};
+
+/**
+ * nft_data_init - parse nf_tables data netlink attributes
+ *
+ * @ctx: context of the expression using the data
+ * @data: destination struct nft_data
+ * @desc: data description
+ * @nla: netlink attribute containing data
+ *
+ * Parse the netlink data attributes and initialize a struct nft_data.
+ * The type and length of data are returned in the data description.
+ *
+ * The caller can indicate that it only wants to accept data of type
+ * NFT_DATA_VALUE by passing NULL for the ctx argument.
+ *
+ * Returns 0 on success, the nla_parse_nested() error, the error of the
+ * value/verdict parser, or -EINVAL when no acceptable attribute is present.
+ */
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ struct nlattr *tb[NFTA_DATA_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+ if (err < 0)
+ return err;
+
+ /* A value attribute takes precedence over a verdict attribute. */
+ if (tb[NFTA_DATA_VALUE])
+ return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
+ return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(nft_data_init);
+
+/**
+ * nft_data_uninit - release a nft_data item
+ *
+ * @data: struct nft_data to release
+ * @type: type of data
+ *
+ * Plain values (NFT_DATA_VALUE) hold no resources and are simply dropped.
+ * Verdicts are handed to nft_verdict_uninit(). Any other type triggers a
+ * warning.
+ */
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+{
+	if (type == NFT_DATA_VALUE)
+		return;
+
+	if (type == NFT_DATA_VERDICT) {
+		nft_verdict_uninit(data);
+		return;
+	}
+
+	WARN_ON(1);
+}
+EXPORT_SYMBOL_GPL(nft_data_uninit);
+
+/*
+ * nft_data_dump - dump a struct nft_data nested inside attribute @attr
+ *
+ * @skb: message being built
+ * @attr: type of the enclosing nested attribute
+ * @data: data to dump
+ * @type: type of @data
+ * @len: number of bytes to dump for NFT_DATA_VALUE
+ *
+ * Returns -1 when the nest cannot be started, otherwise the result of the
+ * value/verdict dump helper, or -EINVAL (with a warning) for an unknown type.
+ */
+int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start(skb, attr);
+ if (nest == NULL)
+ return -1;
+
+ switch (type) {
+ case NFT_DATA_VALUE:
+ err = nft_value_dump(skb, data, len);
+ break;
+ case NFT_DATA_VERDICT:
+ err = nft_verdict_dump(skb, data);
+ break;
+ default:
+ err = -EINVAL;
+ WARN_ON(1);
+ }
+
+ /* The nest is closed even when the helper failed; err tells the caller. */
+ nla_nest_end(skb, nest);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_data_dump);
+
+/*
+ * Per-namespace init: start with empty family and commit lists and a
+ * generation sequence of 1. Always returns 0.
+ */
+static int nf_tables_init_net(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nft.af_info);
+ INIT_LIST_HEAD(&net->nft.commit_list);
+ net->nft.base_seq = 1;
+ return 0;
+}
+
+/* Per-namespace operations; only an init hook is provided. */
+static struct pernet_operations nf_tables_net_ops = {
+ .init = nf_tables_init_net,
+};
+
+/*
+ * Module init: allocate the expression info scratch array, register the core
+ * expressions, the nfnetlink subsystem and the per-namespace operations.
+ *
+ * Every step is undone in reverse order when a later step fails, so a failed
+ * load leaves nothing registered and nothing allocated.
+ *
+ * Returns 0 on success or the negative errno of the step that failed.
+ */
+static int __init nf_tables_module_init(void)
+{
+	int err;
+
+	info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS,
+		       GFP_KERNEL);
+	if (info == NULL) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	err = nf_tables_core_module_init();
+	if (err < 0)
+		goto err2;
+
+	err = nfnetlink_subsys_register(&nf_tables_subsys);
+	if (err < 0)
+		goto err3;
+
+	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
+
+	/*
+	 * A failure here used to be returned directly, leaving the nfnetlink
+	 * subsystem and the core expressions registered and info allocated.
+	 */
+	err = register_pernet_subsys(&nf_tables_net_ops);
+	if (err < 0)
+		goto err4;
+
+	return 0;
+err4:
+	nfnetlink_subsys_unregister(&nf_tables_subsys);
+err3:
+	nf_tables_core_module_exit();
+err2:
+	kfree(info);
+err1:
+	return err;
+}
+
+/* Module exit: undo the registrations of module init in reverse order. */
+static void __exit nf_tables_module_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_net_ops);
+ nfnetlink_subsys_unregister(&nf_tables_subsys);
+ nf_tables_core_module_exit();
+ kfree(info);
+}
+
+module_init(nf_tables_module_init);
+module_exit(nf_tables_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
new file mode 100644
index 00000000000..3b90eb2b2c5
--- /dev/null
+++ b/net/netfilter/nf_tables_core.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+
+/*
+ * Fast path equality compare: mask the first word of the source register
+ * down to the configured length and compare it with the stored constant.
+ * A mismatch sets the verdict register to NFT_BREAK.
+ */
+static void nft_cmp_fast_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1])
+{
+	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+	u32 mask = nft_cmp_fast_mask(priv->len);
+
+	if ((data[priv->sreg].data[0] & mask) != priv->data)
+		data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+/*
+ * Fast path payload load: read 1, 2 or 4 bytes straight from the packet data
+ * into the destination register.
+ *
+ * Returns true when the value was loaded and false when the requested bytes
+ * are not below the skb tail pointer.
+ */
+static bool nft_payload_fast_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ struct nft_data *dest = &data[priv->dreg];
+ unsigned char *ptr;
+
+ /* Any base other than the network header is offset by pkt->xt.thoff. */
+ if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
+ ptr = skb_network_header(skb);
+ else
+ ptr = skb_network_header(skb) + pkt->xt.thoff;
+
+ ptr += priv->offset;
+
+ /*
+ * NOTE(review): ">=" also rejects a load that ends exactly at the tail
+ * pointer - confirm whether that is intended.
+ */
+ if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+ return false;
+
+ /* Any length other than 2 or 4 is loaded as a single byte. */
+ if (priv->len == 2)
+ *(u16 *)dest->data = *(u16 *)ptr;
+ else if (priv->len == 4)
+ *(u32 *)dest->data = *(u32 *)ptr;
+ else
+ *(u8 *)dest->data = *(u8 *)ptr;
+ return true;
+}
+
+/* One saved position on the jump stack used while evaluating chains. */
+struct nft_jumpstack {
+ const struct nft_chain *chain; /* chain the jump was taken from */
+ const struct nft_rule *rule; /* rule that issued the jump */
+ int rulenum; /* rule counter at the time of the jump */
+};
+
+/* Kind of event reported in a packet trace log line. */
+enum nft_trace {
+ NFT_TRACE_RULE,
+ NFT_TRACE_RETURN,
+ NFT_TRACE_POLICY,
+};
+
+/* Text printed in the trace log line for each enum nft_trace value. */
+static const char *const comments[] = {
+ [NFT_TRACE_RULE] = "rule",
+ [NFT_TRACE_RETURN] = "return",
+ [NFT_TRACE_POLICY] = "policy",
+};
+
+/* Log settings passed to nf_log_packet() for every trace message. */
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 4,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/*
+ * Log one trace line of the form "TRACE: table:chain:type:rulenum" for the
+ * packet, using the namespace of the input device or, when there is none, of
+ * the output device.
+ *
+ * NOTE(review): rulenum is an int but is printed with %u - confirm how a
+ * negative rule number is meant to appear in the log.
+ */
+static void nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_chain *chain,
+ int rulenum, enum nft_trace type)
+{
+ struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+ nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+ pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
+ chain->table->name, chain->name, comments[type],
+ rulenum);
+}
+
+/*
+ * nft_do_chain - evaluate the base chain attached to @ops for one packet
+ *
+ * Rules are evaluated in order, following jump/goto verdicts into other
+ * chains, until a rule yields NF_ACCEPT, NF_DROP or NF_QUEUE. If no rule
+ * does, the base chain's counters are updated and its policy is returned.
+ */
+unsigned int
+nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+{
+ const struct nft_chain *chain = ops->priv, *basechain = chain;
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ struct nft_data data[NFT_REG_MAX + 1];
+ unsigned int stackptr = 0;
+ struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
+ struct nft_stats *stats;
+ int rulenum;
+ /*
+ * Cache cursor to avoid problems in case that the cursor is updated
+ * while traversing the ruleset.
+ */
+ unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor);
+
+do_chain:
+ rulenum = 0;
+ /* Start at the list head so that the walk below begins with the first rule. */
+ rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+
+ /* This rule is not active, skip. */
+ if (unlikely(rule->genmask & (1 << gencursor)))
+ continue;
+
+ rulenum++;
+
+ nft_rule_for_each_expr(expr, last, rule) {
+ /* cmp and payload have inlined fast paths; payload falls
+ * back to its eval callback when the fast path declines.
+ */
+ if (expr->ops == &nft_cmp_fast_ops)
+ nft_cmp_fast_eval(expr, data);
+ else if (expr->ops != &nft_payload_fast_ops ||
+ !nft_payload_fast_eval(expr, data, pkt))
+ expr->ops->eval(expr, data, pkt);
+
+ if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
+ break;
+ }
+
+ /* BREAK and CONTINUE move on to the next rule; anything else
+ * ends the walk of this chain.
+ */
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_BREAK:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ continue;
+ case NFT_CONTINUE:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ continue;
+ }
+ break;
+ }
+
+ /* Final netfilter verdicts are returned to the caller unmodified. */
+ switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ return data[NFT_REG_VERDICT].verdict;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_JUMP:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ /* Remember where to resume once the target chain is done. */
+ BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+ jumpstack[stackptr].chain = chain;
+ jumpstack[stackptr].rule = rule;
+ jumpstack[stackptr].rulenum = rulenum;
+ stackptr++;
+ chain = data[NFT_REG_VERDICT].chain;
+ goto do_chain;
+ case NFT_GOTO:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ /* Unlike a jump, a goto does not save a return position. */
+ chain = data[NFT_REG_VERDICT].chain;
+ goto do_chain;
+ case NFT_RETURN:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
+ break;
+ case NFT_CONTINUE:
+ if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN)))
+ nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN);
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ /* Resume after the rule that issued the most recent jump, if any. */
+ if (stackptr > 0) {
+ stackptr--;
+ chain = jumpstack[stackptr].chain;
+ rule = jumpstack[stackptr].rule;
+ rulenum = jumpstack[stackptr].rulenum;
+ goto next_rule;
+ }
+
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY);
+
+ /* No rule decided: count the packet and apply the base chain policy. */
+ rcu_read_lock_bh();
+ stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats));
+ u64_stats_update_begin(&stats->syncp);
+ stats->pkts++;
+ stats->bytes += pkt->skb->len;
+ u64_stats_update_end(&stats->syncp);
+ rcu_read_unlock_bh();
+
+ return nft_base_chain(basechain)->policy;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+/*
+ * Register the built-in expression modules (immediate, cmp, lookup, bitwise,
+ * byteorder, payload). When one of them fails, the modules registered before
+ * it are unregistered again in reverse order.
+ *
+ * Returns 0 on success or the error of the module that failed.
+ */
+int __init nf_tables_core_module_init(void)
+{
+ int err;
+
+ err = nft_immediate_module_init();
+ if (err < 0)
+ goto err1;
+
+ err = nft_cmp_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nft_lookup_module_init();
+ if (err < 0)
+ goto err3;
+
+ err = nft_bitwise_module_init();
+ if (err < 0)
+ goto err4;
+
+ err = nft_byteorder_module_init();
+ if (err < 0)
+ goto err5;
+
+ err = nft_payload_module_init();
+ if (err < 0)
+ goto err6;
+
+ return 0;
+
+ /* Each label undoes the init step that precedes the failed one. */
+err6:
+ nft_byteorder_module_exit();
+err5:
+ nft_bitwise_module_exit();
+err4:
+ nft_lookup_module_exit();
+err3:
+ nft_cmp_module_exit();
+err2:
+ nft_immediate_module_exit();
+err1:
+ return err;
+}
+
+/* Unregister the built-in expression modules in reverse order of their init. */
+void nf_tables_core_module_exit(void)
+{
+ nft_payload_module_exit();
+ nft_byteorder_module_exit();
+ nft_bitwise_module_exit();
+ nft_lookup_module_exit();
+ nft_cmp_module_exit();
+ nft_immediate_module_exit();
+}
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
new file mode 100644
index 00000000000..9dd2d216cfc
--- /dev/null
+++ b/net/netfilter/nf_tables_inet.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/ip.h>
+
+/*
+ * Fill in the protocol family and hook function of one of the two hook ops
+ * of an inet chain: @n == 1 selects the IPv4 family info, any other value the
+ * IPv6 one. The hook function is only replaced when the selected family
+ * provides one for ops->hooknum.
+ */
+static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n)
+{
+	struct nft_af_info *family_info = (n == 1) ? &nft_af_ipv4 : &nft_af_ipv6;
+
+	ops->pf = family_info->family;
+
+	if (!family_info->hooks[ops->hooknum])
+		return;
+
+	ops->hook = family_info->hooks[ops->hooknum];
+}
+
+/*
+ * Template for the per-namespace inet family info. Each chain gets two hook
+ * ops (.nops = 2), which nft_inet_hook_ops_init() sets up per family.
+ */
+static struct nft_af_info nft_af_inet __read_mostly = {
+ .family = NFPROTO_INET,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 2,
+ .hook_ops_init = nft_inet_hook_ops_init,
+};
+
+/*
+ * Per-namespace init of the inet family: allocate a private copy of the
+ * family info template and register it for this namespace.
+ *
+ * Returns 0 on success, -ENOMEM when the copy cannot be allocated, or the
+ * error reported by nft_register_afinfo().
+ */
+static int __net_init nf_tables_inet_init_net(struct net *net)
+{
+	int err;
+
+	net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.inet == NULL)
+		return -ENOMEM;
+	memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet));
+
+	/* Hand the real reason back instead of a blanket -ENOMEM. */
+	err = nft_register_afinfo(net, net->nft.inet);
+	if (err < 0)
+		goto err;
+
+	return 0;
+
+err:
+	kfree(net->nft.inet);
+	/* Do not leave a dangling pointer behind in the namespace. */
+	net->nft.inet = NULL;
+	return err;
+}
+
+/* Per-namespace exit: unregister and free the namespace's inet family info. */
+static void __net_exit nf_tables_inet_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.inet);
+ kfree(net->nft.inet);
+}
+
+/* Per-namespace setup and teardown hooks of the inet family. */
+static struct pernet_operations nf_tables_inet_net_ops = {
+ .init = nf_tables_inet_init_net,
+ .exit = nf_tables_inet_exit_net,
+};
+
+/*
+ * "filter" chain type for the inet family, allowed on the local in, local
+ * out, forward, prerouting and postrouting hooks.
+ */
+static const struct nf_chain_type filter_inet = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_INET,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+/*
+ * Module init: register the inet filter chain type, then the per-namespace
+ * operations. The chain type is unregistered again when the latter fails.
+ *
+ * Returns the result of register_pernet_subsys().
+ */
+static int __init nf_tables_inet_init(void)
+{
+ int ret;
+
+ /* NOTE(review): no result of the chain type registration is checked here. */
+ nft_register_chain_type(&filter_inet);
+ ret = register_pernet_subsys(&nf_tables_inet_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_inet);
+
+ return ret;
+}
+
+/* Module exit: undo the registrations of module init in reverse order. */
+static void __exit nf_tables_inet_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_inet_net_ops);
+ nft_unregister_chain_type(&filter_inet);
+}
+
+module_init(nf_tables_inet_init);
+module_exit(nf_tables_inet_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(1);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index f6063e8f005..c138b8fbe28 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
*
* (C) 2001 by Jay Schulist <jschlst@samba.org>,
* (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
- * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
*
* Initial netfilter messages via netlink development funded and
* generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -14,27 +14,19 @@
* of the GNU General Public License, incorporated herein by reference.
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
-#include <linux/fcntl.h>
#include <linux/skbuff.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <net/sock.h>
#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/netfilter.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
@@ -43,330 +35,434 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
static char __initdata nfversion[] = "0.30";
-#if 0
-#define DEBUGP(format, args...) \
- printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \
- __LINE__, __FUNCTION__, ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
-static struct sock *nfnl = NULL;
-static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
-DECLARE_MUTEX(nfnl_sem);
-
-void nfnl_lock(void)
+static struct {
+ struct mutex mutex;
+ const struct nfnetlink_subsystem __rcu *subsys;
+} table[NFNL_SUBSYS_COUNT];
+
+static const int nfnl_group2type[NFNLGRP_MAX+1] = {
+ [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+};
+
+void nfnl_lock(__u8 subsys_id)
{
- nfnl_shlock();
+ mutex_lock(&table[subsys_id].mutex);
}
+EXPORT_SYMBOL_GPL(nfnl_lock);
-void nfnl_unlock(void)
+void nfnl_unlock(__u8 subsys_id)
{
- nfnl_shunlock();
+ mutex_unlock(&table[subsys_id].mutex);
}
+EXPORT_SYMBOL_GPL(nfnl_unlock);
-int nfnetlink_subsys_register(struct nfnetlink_subsystem *n)
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(u8 subsys_id)
{
- DEBUGP("registering subsystem ID %u\n", n->subsys_id);
+ return lockdep_is_held(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
+#endif
- nfnl_lock();
- if (subsys_table[n->subsys_id]) {
- nfnl_unlock();
+int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
+{
+ nfnl_lock(n->subsys_id);
+ if (table[n->subsys_id].subsys) {
+ nfnl_unlock(n->subsys_id);
return -EBUSY;
}
- subsys_table[n->subsys_id] = n;
- nfnl_unlock();
+ rcu_assign_pointer(table[n->subsys_id].subsys, n);
+ nfnl_unlock(n->subsys_id);
return 0;
}
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
-int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n)
+int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
{
- DEBUGP("unregistering subsystem ID %u\n", n->subsys_id);
-
- nfnl_lock();
- subsys_table[n->subsys_id] = NULL;
- nfnl_unlock();
-
+ nfnl_lock(n->subsys_id);
+ table[n->subsys_id].subsys = NULL;
+ nfnl_unlock(n->subsys_id);
+ synchronize_rcu();
return 0;
}
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
-static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
{
u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
- if (subsys_id >= NFNL_SUBSYS_COUNT
- || subsys_table[subsys_id] == NULL)
+ if (subsys_id >= NFNL_SUBSYS_COUNT)
return NULL;
- return subsys_table[subsys_id];
+ return rcu_dereference(table[subsys_id].subsys);
}
-static inline struct nfnl_callback *
-nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss)
+static inline const struct nfnl_callback *
+nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss)
{
u_int8_t cb_id = NFNL_MSG_TYPE(type);
-
- if (cb_id >= ss->cb_count) {
- DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count);
+
+ if (cb_id >= ss->cb_count)
return NULL;
- }
return &ss->cb[cb_id];
}
-void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
- const void *data)
+int nfnetlink_has_listeners(struct net *net, unsigned int group)
{
- struct nfattr *nfa;
- int size = NFA_LENGTH(attrlen);
-
- nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
- nfa->nfa_type = attrtype;
- nfa->nfa_len = size;
- memcpy(NFA_DATA(nfa), data, attrlen);
- memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size);
+ return netlink_has_listeners(net->nfnl, group);
}
+EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
-void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
+struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+ u32 dst_portid, gfp_t gfp_mask)
{
- memset(tb, 0, sizeof(struct nfattr *) * maxattr);
-
- while (NFA_OK(nfa, len)) {
- unsigned flavor = NFA_TYPE(nfa);
- if (flavor && flavor <= maxattr)
- tb[flavor-1] = nfa;
- nfa = NFA_NEXT(nfa, len);
- }
+ return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
}
+EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
-/**
- * nfnetlink_check_attributes - check and parse nfnetlink attributes
- *
- * subsys: nfnl subsystem for which this message is to be parsed
- * nlmsghdr: netlink message to be checked/parsed
- * cda: array of pointers, needs to be at least subsys->attr_count big
- *
- */
-static int
-nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
- struct nlmsghdr *nlh, struct nfattr *cda[])
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
+ unsigned int group, int echo, gfp_t flags)
{
- int min_len;
- u_int16_t attr_count;
- u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-
- if (unlikely(cb_id >= subsys->cb_count)) {
- DEBUGP("msgtype %u >= %u, returning\n",
- cb_id, subsys->cb_count);
- return -EINVAL;
- }
-
- min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
- if (unlikely(nlh->nlmsg_len < min_len))
- return -EINVAL;
-
- attr_count = subsys->cb[cb_id].attr_count;
- memset(cda, 0, sizeof(struct nfattr *) * attr_count);
-
- /* check attribute lengths. */
- if (likely(nlh->nlmsg_len > min_len)) {
- struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
- int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
-
- while (NFA_OK(attr, attrlen)) {
- unsigned flavor = NFA_TYPE(attr);
- if (flavor) {
- if (flavor > attr_count)
- return -EINVAL;
- cda[flavor - 1] = attr;
- }
- attr = NFA_NEXT(attr, attrlen);
- }
- }
-
- /* implicit: if nlmsg_len == min_len, we return 0, and an empty
- * (zeroed) cda[] array. The message is valid, but empty. */
-
- return 0;
+ return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
}
+EXPORT_SYMBOL_GPL(nfnetlink_send);
-int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
- gfp_t allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
- int err = 0;
-
- NETLINK_CB(skb).dst_group = group;
- if (echo)
- atomic_inc(&skb->users);
- netlink_broadcast(nfnl, skb, pid, group, allocation);
- if (echo)
- err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT);
-
- return err;
+ return netlink_set_err(net->nfnl, portid, group, error);
}
+EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
+ int flags)
{
- return netlink_unicast(nfnl, skb, pid, flags);
+ return netlink_unicast(net->nfnl, skb, portid, flags);
}
+EXPORT_SYMBOL_GPL(nfnetlink_unicast);
/* Process one complete nfnetlink message. */
-static int nfnetlink_rcv_msg(struct sk_buff *skb,
- struct nlmsghdr *nlh, int *errp)
+static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct nfnl_callback *nc;
- struct nfnetlink_subsystem *ss;
- int type, err = 0;
-
- DEBUGP("entered; subsys=%u, msgtype=%u\n",
- NFNL_SUBSYS_ID(nlh->nlmsg_type),
- NFNL_MSG_TYPE(nlh->nlmsg_type));
-
- if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
- DEBUGP("missing CAP_NET_ADMIN\n");
- *errp = -EPERM;
- return -1;
- }
-
- /* Only requests are handled by kernel now. */
- if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
- DEBUGP("received non-request message\n");
- return 0;
- }
+ struct net *net = sock_net(skb->sk);
+ const struct nfnl_callback *nc;
+ const struct nfnetlink_subsystem *ss;
+ int type, err;
/* All the messages must at least contain nfgenmsg */
- if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) {
- DEBUGP("received message was too short\n");
+ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
return 0;
- }
type = nlh->nlmsg_type;
+replay:
+ rcu_read_lock();
ss = nfnetlink_get_subsys(type);
if (!ss) {
-#ifdef CONFIG_KMOD
- /* don't call nfnl_shunlock, since it would reenter
- * with further packet processing */
- up(&nfnl_sem);
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
- nfnl_shlock();
+ rcu_read_lock();
ss = nfnetlink_get_subsys(type);
if (!ss)
#endif
- goto err_inval;
+ {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
}
nc = nfnetlink_find_client(type, ss);
if (!nc) {
- DEBUGP("unable to find client for type %d\n", type);
- goto err_inval;
+ rcu_read_unlock();
+ return -EINVAL;
}
{
- u_int16_t attr_count =
- ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
- struct nfattr *cda[attr_count];
-
- memset(cda, 0, sizeof(struct nfattr *) * attr_count);
-
- err = nfnetlink_check_attributes(ss, nlh, cda);
- if (err < 0)
- goto err_inval;
-
- DEBUGP("calling handler\n");
- err = nc->call(nfnl, skb, nlh, cda, errp);
- *errp = err;
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ int attrlen = nlh->nlmsg_len - min_len;
+ __u8 subsys_id = NFNL_SUBSYS_ID(type);
+
+ err = nla_parse(cda, ss->cb[cb_id].attr_count,
+ attr, attrlen, ss->cb[cb_id].policy);
+ if (err < 0) {
+ rcu_read_unlock();
+ return err;
+ }
+
+ if (nc->call_rcu) {
+ err = nc->call_rcu(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ nfnl_lock(subsys_id);
+ if (rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex)) != ss ||
+ nfnetlink_find_client(type, ss) != nc)
+ err = -EAGAIN;
+ else if (nc->call)
+ err = nc->call(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ else
+ err = -EINVAL;
+ nfnl_unlock(subsys_id);
+ }
+ if (err == -EAGAIN)
+ goto replay;
return err;
}
-
-err_inval:
- DEBUGP("returning -EINVAL\n");
- *errp = -EINVAL;
- return -1;
}
-/* Process one packet of messages. */
-static inline int nfnetlink_rcv_skb(struct sk_buff *skb)
+static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
+ u_int16_t subsys_id)
{
+ struct sk_buff *nskb, *oskb = skb;
+ struct net *net = sock_net(skb->sk);
+ const struct nfnetlink_subsystem *ss;
+ const struct nfnl_callback *nc;
+ bool success = true, done = false;
int err;
- struct nlmsghdr *nlh;
-
- while (skb->len >= NLMSG_SPACE(0)) {
- u32 rlen;
-
- nlh = (struct nlmsghdr *)skb->data;
- if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
- || skb->len < nlh->nlmsg_len)
- return 0;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- if (nfnetlink_rcv_msg(skb, nlh, &err)) {
- if (!err)
- return -1;
- netlink_ack(skb, nlh, err);
- } else
- if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
+
+ if (subsys_id >= NFNL_SUBSYS_COUNT)
+ return netlink_ack(skb, nlh, -EINVAL);
+replay:
+ nskb = netlink_skb_clone(oskb, GFP_KERNEL);
+ if (!nskb)
+ return netlink_ack(oskb, nlh, -ENOMEM);
+
+ nskb->sk = oskb->sk;
+ skb = nskb;
+
+ nfnl_lock(subsys_id);
+ ss = rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex));
+ if (!ss) {
+#ifdef CONFIG_MODULES
+ nfnl_unlock(subsys_id);
+ request_module("nfnetlink-subsys-%d", subsys_id);
+ nfnl_lock(subsys_id);
+ ss = rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex));
+ if (!ss)
+#endif
+ {
+ nfnl_unlock(subsys_id);
+ netlink_ack(skb, nlh, -EOPNOTSUPP);
+ return kfree_skb(nskb);
+ }
}
- return 0;
+ if (!ss->commit || !ss->abort) {
+ nfnl_unlock(subsys_id);
+ netlink_ack(skb, nlh, -EOPNOTSUPP);
+ return kfree_skb(skb);
+ }
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ int msglen, type;
+
+ nlh = nlmsg_hdr(skb);
+ err = 0;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ /* Only requests are handled by the kernel */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ type = nlh->nlmsg_type;
+ if (type == NFNL_MSG_BATCH_BEGIN) {
+ /* Malformed: Batch begin twice */
+ success = false;
+ goto done;
+ } else if (type == NFNL_MSG_BATCH_END) {
+ done = true;
+ goto done;
+ } else if (type < NLMSG_MIN_TYPE) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ /* We only accept a batch with messages for the same
+ * subsystem.
+ */
+ if (NFNL_SUBSYS_ID(type) != subsys_id) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ nc = nfnetlink_find_client(type, ss);
+ if (!nc) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ {
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ int attrlen = nlh->nlmsg_len - min_len;
+
+ err = nla_parse(cda, ss->cb[cb_id].attr_count,
+ attr, attrlen, ss->cb[cb_id].policy);
+ if (err < 0)
+ goto ack;
+
+ if (nc->call_batch) {
+ err = nc->call_batch(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ }
+
+ /* The lock was released to autoload some module, we
+ * have to abort and start from scratch using the
+ * original skb.
+ */
+ if (err == -EAGAIN) {
+ ss->abort(skb);
+ nfnl_unlock(subsys_id);
+ kfree_skb(nskb);
+ goto replay;
+ }
+ }
+ack:
+ if (nlh->nlmsg_flags & NLM_F_ACK || err) {
+ /* We don't stop processing the batch on errors, thus,
+ * userspace gets all the errors that the batch
+ * triggers.
+ */
+ netlink_ack(skb, nlh, err);
+ if (err)
+ success = false;
+ }
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+done:
+ if (success && done)
+ ss->commit(skb);
+ else
+ ss->abort(skb);
+
+ nfnl_unlock(subsys_id);
+ kfree_skb(nskb);
}
-static void nfnetlink_rcv(struct sock *sk, int len)
+static void nfnetlink_rcv(struct sk_buff *skb)
{
- do {
- struct sk_buff *skb;
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ int msglen;
- if (nfnl_shlock_nowait())
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < nlh->nlmsg_len)
+ return;
+
+ if (!netlink_net_capable(skb, CAP_NET_ADMIN)) {
+ netlink_ack(skb, nlh, -EPERM);
+ return;
+ }
+
+ if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
+ struct nfgenmsg *nfgenmsg;
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
return;
- while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- if (nfnetlink_rcv_skb(skb)) {
- if (skb->len)
- skb_queue_head(&sk->sk_receive_queue,
- skb);
- else
- kfree_skb(skb);
- break;
- }
- kfree_skb(skb);
- }
+ nfgenmsg = nlmsg_data(nlh);
+ skb_pull(skb, msglen);
+ nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+ } else {
+ netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
+ }
+}
- /* don't call nfnl_shunlock, since it would reenter
- * with further packet processing */
- up(&nfnl_sem);
- } while(nfnl && nfnl->sk_receive_queue.qlen);
+#ifdef CONFIG_MODULES
+/*
+ * Netlink bind hook: when a socket subscribes to a multicast group whose
+ * subsystem is not registered yet, try to autoload the module providing it.
+ *
+ * @group comes from the binding socket, so it is range-checked before it is
+ * used as an index into nfnl_group2type[]. Out-of-range groups are ignored.
+ *
+ * Always returns 0: a failed or skipped module load does not fail the bind.
+ */
+static int nfnetlink_bind(int group)
+{
+	const struct nfnetlink_subsystem *ss;
+	int type;
+
+	if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+		return 0;
+
+	type = nfnl_group2type[group];
+
+	rcu_read_lock();
+	/*
+	 * nfnetlink_get_subsys() takes a message type and extracts the
+	 * subsystem with NFNL_SUBSYS_ID(); the table stores bare subsystem
+	 * ids, so shift the id into the subsystem byte first.
+	 */
+	ss = nfnetlink_get_subsys(type << 8);
+	rcu_read_unlock();
+	if (!ss)
+		request_module("nfnetlink-subsys-%d", type);
+	return 0;
}
+#endif
-static void __exit nfnetlink_exit(void)
+static int __net_init nfnetlink_net_init(struct net *net)
{
- printk("Removing netfilter NETLINK layer.\n");
- sock_release(nfnl->sk_socket);
- return;
+ struct sock *nfnl;
+ struct netlink_kernel_cfg cfg = {
+ .groups = NFNLGRP_MAX,
+ .input = nfnetlink_rcv,
+#ifdef CONFIG_MODULES
+ .bind = nfnetlink_bind,
+#endif
+ };
+
+ nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
+ if (!nfnl)
+ return -ENOMEM;
+ net->nfnl_stash = nfnl;
+ rcu_assign_pointer(net->nfnl, nfnl);
+ return 0;
}
+/*
+ * Batched per-netns exit. The order matters: first clear net->nfnl in every
+ * exiting namespace, then wait in synchronize_net(), and only afterwards
+ * release each socket through the net->nfnl_stash copy that
+ * nfnetlink_net_init() saved.
+ */
+static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ RCU_INIT_POINTER(net->nfnl, NULL);
+ synchronize_net();
+ list_for_each_entry(net, net_exit_list, exit_list)
+ netlink_kernel_release(net->nfnl_stash);
+}
+
+/* Per-network-namespace hooks: create the nfnetlink socket, release it in batches. */
+static struct pernet_operations nfnetlink_net_ops = {
+ .init = nfnetlink_net_init,
+ .exit_batch = nfnetlink_net_exit_batch,
+};
+
static int __init nfnetlink_init(void)
{
- printk("Netfilter messages via NETLINK v%s.\n", nfversion);
+ int i;
- nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX,
- nfnetlink_rcv, THIS_MODULE);
- if (!nfnl) {
- printk(KERN_ERR "cannot initialize nfnetlink!\n");
- return -1;
- }
+ for (i=0; i<NFNL_SUBSYS_COUNT; i++)
+ mutex_init(&table[i].mutex);
- return 0;
+ pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);
+ return register_pernet_subsys(&nfnetlink_net_ops);
}
+/* Module exit: log the removal and unregister the per-netns operations. */
+static void __exit nfnetlink_exit(void)
+{
+ pr_info("Removing netfilter NETLINK layer.\n");
+ unregister_pernet_subsys(&nfnetlink_net_ops);
+}
module_init(nfnetlink_init);
module_exit(nfnetlink_exit);
-
-EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
-EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
-EXPORT_SYMBOL_GPL(nfnetlink_send);
-EXPORT_SYMBOL_GPL(nfnetlink_unicast);
-EXPORT_SYMBOL_GPL(nfattr_parse);
-EXPORT_SYMBOL_GPL(__nfa_fill);
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
new file mode 100644
index 00000000000..2baa125c2e8
--- /dev/null
+++ b/net/netfilter/nfnetlink_acct.c
@@ -0,0 +1,454 @@
+/*
+ * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/atomic.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_acct.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
+
+static LIST_HEAD(nfnl_acct_list);
+
+/* One named accounting object, linked into nfnl_acct_list. */
+struct nf_acct {
+ /* packet and byte counters, bumped by nfnl_acct_update() */
+ atomic64_t pkts;
+ atomic64_t bytes;
+ /* NFACCT_F_* quota mode bits plus the overquota marker */
+ unsigned long flags;
+ /* linkage in nfnl_acct_list */
+ struct list_head head;
+ /* set to 1 at creation; nfnl_acct_find_get() takes further references */
+ atomic_t refcnt;
+ char name[NFACCT_NAME_MAX];
+ /* used by kfree_rcu() when the object is released */
+ struct rcu_head rcu_head;
+ /* trailing storage; holds a u64 quota when a quota flag is set */
+ char data[0];
+};
+
+#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
+
+/*
+ * NFNL_MSG_ACCT_NEW handler: create an accounting object, or reset the
+ * counters of an existing one when NLM_F_REPLACE is set.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL      name missing or empty, both quota modes requested at once,
+ *                or a quota flag given without an NFACCT_QUOTA attribute
+ *   -EEXIST      the name already exists and NLM_F_EXCL was set
+ *   -EBUSY       the name already exists and NLM_F_REPLACE was not set
+ *   -EOPNOTSUPP  flag bits outside NFACCT_F_QUOTA were requested
+ *   -ENOMEM      allocation failed
+ */
+static int
+nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	struct nf_acct *nfacct, *matching = NULL;
+	char *acct_name;
+	unsigned int size = 0;
+	u32 flags = 0;
+
+	if (!tb[NFACCT_NAME])
+		return -EINVAL;
+
+	acct_name = nla_data(tb[NFACCT_NAME]);
+	if (strlen(acct_name) == 0)
+		return -EINVAL;
+
+	list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+		if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
+			continue;
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+
+		matching = nfacct;
+		break;
+	}
+
+	if (matching) {
+		if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+			/* reset counters if you request a replacement. */
+			atomic64_set(&matching->pkts, 0);
+			atomic64_set(&matching->bytes, 0);
+			smp_mb__before_atomic();
+			/* reset overquota flag if quota is enabled. */
+			if ((matching->flags & NFACCT_F_QUOTA))
+				clear_bit(NFACCT_F_OVERQUOTA, &matching->flags);
+			return 0;
+		}
+		return -EBUSY;
+	}
+
+	if (tb[NFACCT_FLAGS]) {
+		flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS]));
+		if (flags & ~NFACCT_F_QUOTA)
+			return -EOPNOTSUPP;
+		if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA)
+			return -EINVAL;
+		if (flags & NFACCT_F_OVERQUOTA)
+			return -EINVAL;
+		/*
+		 * The quota value is read from tb[NFACCT_QUOTA] below; reject
+		 * the request here instead of dereferencing a NULL attribute.
+		 */
+		if ((flags & NFACCT_F_QUOTA) && !tb[NFACCT_QUOTA])
+			return -EINVAL;
+
+		/* room for the u64 quota stored in nfacct->data */
+		size += sizeof(u64);
+	}
+
+	nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL);
+	if (nfacct == NULL)
+		return -ENOMEM;
+
+	if (flags & NFACCT_F_QUOTA) {
+		u64 *quota = (u64 *)nfacct->data;
+
+		*quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA]));
+		nfacct->flags = flags;
+	}
+
+	strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+
+	if (tb[NFACCT_BYTES]) {
+		atomic64_set(&nfacct->bytes,
+			     be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES])));
+	}
+	if (tb[NFACCT_PKTS]) {
+		atomic64_set(&nfacct->pkts,
+			     be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
+	}
+	/* the list itself holds the initial reference */
+	atomic_set(&nfacct->refcnt, 1);
+	list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+	return 0;
+}
+
+/*
+ * Append one netlink message describing @acct to @skb.
+ *
+ * @event is combined with NFNL_SUBSYS_ACCT to form the message type. When
+ * @type is NFNL_MSG_ACCT_GET_CTRZERO the counters are fetched and zeroed in
+ * one exchange, and the overquota bit is cleared if a quota is configured;
+ * otherwise the counters are only read.
+ *
+ * Returns skb->len on success, or -1 after cancelling the partial message.
+ */
+static int
+nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct nf_acct *acct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ /* a non-zero portid marks the message as part of a multipart reply */
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ u64 pkts, bytes;
+
+ /* the subsystem id goes into the upper byte of the message type */
+ event |= NFNL_SUBSYS_ACCT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFACCT_NAME, acct->name))
+ goto nla_put_failure;
+
+ if (type == NFNL_MSG_ACCT_GET_CTRZERO) {
+ /* read-and-reset: xchg returns the old value and stores 0 */
+ pkts = atomic64_xchg(&acct->pkts, 0);
+ bytes = atomic64_xchg(&acct->bytes, 0);
+ smp_mb__before_atomic();
+ if (acct->flags & NFACCT_F_QUOTA)
+ clear_bit(NFACCT_F_OVERQUOTA, &acct->flags);
+ } else {
+ pkts = atomic64_read(&acct->pkts);
+ bytes = atomic64_read(&acct->bytes);
+ }
+ /* counters and use count are sent in big-endian byte order */
+ if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) ||
+ nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) ||
+ nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))))
+ goto nla_put_failure;
+ /* flags and quota are only reported for objects with a quota */
+ if (acct->flags & NFACCT_F_QUOTA) {
+ u64 *quota = (u64 *)acct->data;
+
+ if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) ||
+ nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota)))
+ goto nla_put_failure;
+ }
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+/*
+ * Netlink dump callback: emit one message per accounting object.
+ *
+ * Resume state lives in cb->args: args[1] holds the entry that did not fit
+ * into the previous skb (0 if none), args[2] becomes 1 once the whole list
+ * has been emitted.
+ *
+ * NOTE(review): args[1] is a raw pointer saved across calls with no
+ * reference taken here; confirm the entry cannot be freed in between.
+ */
+static int
+nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_acct *cur, *last;
+
+ /* dump already completed on an earlier call */
+ if (cb->args[2])
+ return 0;
+
+ last = (struct nf_acct *)cb->args[1];
+ if (cb->args[1])
+ cb->args[1] = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ /* skip forward until the entry where the last call stopped */
+ if (last) {
+ if (cur != last)
+ continue;
+
+ last = NULL;
+ }
+ if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur) < 0) {
+ /* skb is full: remember this entry for the next call */
+ cb->args[1] = (unsigned long)cur;
+ break;
+ }
+ }
+ /* nothing left pending: mark the dump as finished */
+ if (!cb->args[1])
+ cb->args[2] = 1;
+ rcu_read_unlock();
+ return skb->len;
+}
+
+/*
+ * NFNL_MSG_ACCT_GET / NFNL_MSG_ACCT_GET_CTRZERO handler.
+ *
+ * With NLM_F_DUMP a dump over all objects is started. Otherwise the object
+ * named by NFACCT_NAME is looked up and sent back to the requester as a
+ * single unicast message.
+ *
+ * Returns 0 on success, -EINVAL without a name, -ENOENT if no object
+ * matches, -ENOMEM if the reply cannot be allocated, or the error from
+ * building or sending the reply (-EAGAIN is reported as -ENOBUFS).
+ */
+static int
+nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	struct nf_acct *entry, *found = NULL;
+	struct sk_buff *reply;
+	char *wanted;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nfnl_acct_dump,
+		};
+		return netlink_dump_start(nfnl, skb, nlh, &c);
+	}
+
+	if (!tb[NFACCT_NAME])
+		return -EINVAL;
+	wanted = nla_data(tb[NFACCT_NAME]);
+
+	/* the first entry with a matching name wins */
+	list_for_each_entry(entry, &nfnl_acct_list, head) {
+		if (strncmp(entry->name, wanted, NFACCT_NAME_MAX) == 0) {
+			found = entry;
+			break;
+		}
+	}
+	if (!found)
+		return -ENOENT;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	err = nfnl_acct_fill_info(reply, NETLINK_CB(skb).portid,
+				  nlh->nlmsg_seq,
+				  NFNL_MSG_TYPE(nlh->nlmsg_type),
+				  NFNL_MSG_ACCT_NEW, found);
+	if (err <= 0) {
+		kfree_skb(reply);
+		return err;
+	}
+
+	err = netlink_unicast(nfnl, reply, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+
+	/* this avoids a loop in nfnetlink. */
+	if (err == -EAGAIN)
+		return -ENOBUFS;
+	return err;
+}
+
+/*
+ * Try to delete an accounting object; fail if it is still in use.
+ *
+ * The object is unlinked only when refcnt is exactly 1, i.e. only the
+ * reference set at creation remains. The 1 -> 0 transition is done with a
+ * single compare-and-exchange, so the counter is never transiently
+ * decremented while other holders still reference the object.
+ *
+ * Returns 0 if the object was unlinked and queued for freeing, -EBUSY if it
+ * is still referenced.
+ */
+static int nfnl_acct_try_del(struct nf_acct *cur)
+{
+	/* still in use: leave the reference counter untouched */
+	if (atomic_cmpxchg(&cur->refcnt, 1, 0) != 1)
+		return -EBUSY;
+
+	/* We are protected by nfnl mutex. */
+	list_del_rcu(&cur->head);
+	kfree_rcu(cur, rcu_head);
+	return 0;
+}
+
+/*
+ * NFNL_MSG_ACCT_DEL handler.
+ *
+ * Without NFACCT_NAME every object that is not in use is deleted and 0 is
+ * returned. With a name, only the first matching object is deleted.
+ *
+ * Returns 0 on success, -ENOENT if the name is not found, or -EBUSY if the
+ * named object is still referenced.
+ */
+static int
+nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	char *acct_name;
+	struct nf_acct *cur, *tmp;
+	int ret = -ENOENT;
+
+	if (!tb[NFACCT_NAME]) {
+		/*
+		 * nfnl_acct_try_del() may unlink cur and hand it to
+		 * kfree_rcu(), so the next entry must be fetched before the
+		 * call: use the _safe iterator.
+		 */
+		list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head)
+			nfnl_acct_try_del(cur);
+
+		return 0;
+	}
+	acct_name = nla_data(tb[NFACCT_NAME]);
+
+	list_for_each_entry(cur, &nfnl_acct_list, head) {
+		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
+			continue;
+
+		ret = nfnl_acct_try_del(cur);
+		if (ret < 0)
+			return ret;
+
+		/* cur may be gone now; stop without touching it again */
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Attribute policy shared by all nfacct messages. The name is a
+ * NUL-terminated string of at most NFACCT_NAME_MAX-1 characters.
+ */
+static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
+ [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 },
+ [NFACCT_BYTES] = { .type = NLA_U64 },
+ [NFACCT_PKTS] = { .type = NLA_U64 },
+ [NFACCT_FLAGS] = { .type = NLA_U32 },
+ [NFACCT_QUOTA] = { .type = NLA_U64 },
+};
+
+/*
+ * Message type -> handler table. GET and GET_CTRZERO share nfnl_acct_get();
+ * every entry uses the same attribute policy.
+ */
+static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
+ [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+};
+
+/* Subsystem descriptor registered with nfnetlink by nfnl_acct_init(). */
+static const struct nfnetlink_subsystem nfnl_acct_subsys = {
+ .name = "acct",
+ .subsys_id = NFNL_SUBSYS_ACCT,
+ .cb_count = NFNL_MSG_ACCT_MAX,
+ .cb = nfnl_acct_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
+
+/*
+ * Look up an accounting object by name and take a reference on it.
+ *
+ * On success both a module reference and an object reference are held; the
+ * caller releases them with nfnl_acct_put(). Returns NULL if no object has
+ * that name or if either reference could not be taken.
+ */
+struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+{
+	struct nf_acct *entry, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &nfnl_acct_list, head) {
+		if (strncmp(entry->name, acct_name, NFACCT_NAME_MAX) != 0)
+			continue;
+
+		/* only the first name match is considered */
+		if (try_module_get(THIS_MODULE)) {
+			if (atomic_inc_not_zero(&entry->refcnt))
+				found = entry;
+			else
+				module_put(THIS_MODULE);
+		}
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
+
+/*
+ * Drop the references taken by nfnl_acct_find_get(): one on the object and
+ * one on this module. The object is not freed here.
+ */
+void nfnl_acct_put(struct nf_acct *acct)
+{
+ atomic_dec(&acct->refcnt);
+ module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_put);
+
+/* Account one packet: add 1 to the packet counter and skb->len to the byte counter. */
+void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+ atomic64_inc(&nfacct->pkts);
+ atomic64_add(skb->len, &nfacct->bytes);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_update);
+
+/*
+ * Broadcast a message describing @nfacct to the NFNLGRP_ACCT_QUOTA group of
+ * the initial network namespace. Allocation uses GFP_ATOMIC; the report is
+ * silently dropped if the message cannot be allocated or built.
+ */
+static void nfnl_overquota_report(struct nf_acct *nfacct)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+
+	if (msg == NULL)
+		return;
+
+	if (nfnl_acct_fill_info(msg, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0,
+				nfacct) <= 0) {
+		kfree_skb(msg);
+		return;
+	}
+
+	netlink_broadcast(init_net.nfnl, msg, 0, NFNLGRP_ACCT_QUOTA,
+			  GFP_ATOMIC);
+}
+
+/*
+ * Check an accounting object against its quota.
+ *
+ * Returns NFACCT_NO_QUOTA if the object has no quota configured; otherwise
+ * the result of "counter > quota", where the counter is packets or bytes
+ * depending on the quota mode. The first time the counter reaches the quota
+ * (counter >= quota) the overquota bit is set and a report is broadcast.
+ */
+int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+	u64 *limit;
+	u64 used;
+	int over;
+
+	/* no place here if we don't have a quota */
+	if (!(nfacct->flags & NFACCT_F_QUOTA))
+		return NFACCT_NO_QUOTA;
+
+	limit = (u64 *)nfacct->data;
+	if (nfacct->flags & NFACCT_F_QUOTA_PKTS)
+		used = atomic64_read(&nfacct->pkts);
+	else
+		used = atomic64_read(&nfacct->bytes);
+
+	over = used > *limit;
+
+	/* test_and_set_bit makes sure the report is sent only once */
+	if (used >= *limit &&
+	    !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags))
+		nfnl_overquota_report(nfacct);
+
+	return over;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
+
+/*
+ * Module init: register the acct subsystem with nfnetlink.
+ * Returns 0 on success or the negative error from the registration.
+ */
+static int __init nfnl_acct_init(void)
+{
+	int err;
+
+	pr_info("nfnl_acct: registering with nfnetlink.\n");
+
+	err = nfnetlink_subsys_register(&nfnl_acct_subsys);
+	if (err < 0) {
+		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Module exit: unregister from nfnetlink first, then unlink every remaining
+ * object and queue it for freeing with kfree_rcu().
+ */
+static void __exit nfnl_acct_exit(void)
+{
+ struct nf_acct *cur, *tmp;
+
+ pr_info("nfnl_acct: unregistering from nfnetlink.\n");
+ nfnetlink_subsys_unregister(&nfnl_acct_subsys);
+
+ /* _safe iteration: entries are removed while walking the list */
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
+ list_del_rcu(&cur->head);
+ /* We are sure that our objects have no clients at this point,
+ * it's safe to release them all without checking refcnt. */
+ kfree_rcu(cur, rcu_head);
+ }
+}
+
+module_init(nfnl_acct_init);
+module_exit(nfnl_acct_exit);
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
new file mode 100644
index 00000000000..9e287cb56a0
--- /dev/null
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -0,0 +1,680 @@
+/*
+ * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ *
+ * This software has been sponsored by Vyatta Inc. <http://www.vyatta.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/netfilter/nfnetlink_cthelper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
+
+/* help() callback installed on every user-space helper created by this
+ * module.  Returns a netfilter verdict: NF_DROP when the conntrack has no
+ * helper attached, NF_ACCEPT while the helper is not yet configured, and
+ * otherwise a queue verdict for the helper's queue number with the
+ * queue-bypass flag set.
+ */
+static int
+nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
+			struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	const unsigned int state_mask = NF_CT_HELPER_F_USERSPACE |
+					NF_CT_HELPER_F_CONFIGURED;
+	const struct nf_conn_help *ext = nfct_help(ct);
+	struct nf_conntrack_helper *h;
+
+	if (!ext)
+		return NF_DROP;
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	h = rcu_dereference(ext->helper);
+	if (!h)
+		return NF_DROP;
+
+	/* This is a user-space helper not yet configured, skip. */
+	if ((h->flags & state_mask) == NF_CT_HELPER_F_USERSPACE)
+		return NF_ACCEPT;
+
+	/* If the user-space helper is not available, don't block traffic. */
+	return NF_QUEUE_NR(h->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS;
+}
+
+/* Attributes accepted inside a nested NFCTH_TUPLE. */
+static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = {
+	[NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, },
+	[NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, },
+};
+
+/* Fill the layer 3 and layer 4 protocol numbers of @tuple from the nested
+ * attribute @attr; no other field of @tuple is written.
+ *
+ * Returns 0 on success, the nla_parse_nested() error, or -EINVAL when
+ * either protocol number is missing.
+ */
+static int
+nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
+			  const struct nlattr *attr)
+{
+	struct nlattr *nested[NFCTH_TUPLE_MAX+1];
+	const struct nlattr *l3, *l4;
+	int rc;
+
+	rc = nla_parse_nested(nested, NFCTH_TUPLE_MAX, attr,
+			      nfnl_cthelper_tuple_pol);
+	if (rc < 0)
+		return rc;
+
+	l3 = nested[NFCTH_TUPLE_L3PROTONUM];
+	l4 = nested[NFCTH_TUPLE_L4PROTONUM];
+	if (l3 == NULL || l4 == NULL)
+		return -EINVAL;
+
+	tuple->src.l3num = ntohs(nla_get_be16(l3));
+	tuple->dst.protonum = nla_get_u8(l4);
+	return 0;
+}
+
+/* from_nlattr() callback: copy the helper's private data from the netlink
+ * attribute @attr into the helper extension of @ct.
+ *
+ * Returns 0 on success, or -EINVAL if @attr is missing, if the helper has
+ * no private data area (data_len == 0), or if @attr carries fewer bytes
+ * than data_len (copying would read past the end of the attribute).
+ */
+static int
+nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
+{
+	/* Not const: the private data area is written below. */
+	struct nf_conn_help *help = nfct_help(ct);
+
+	if (attr == NULL)
+		return -EINVAL;
+
+	if (help->helper->data_len == 0)
+		return -EINVAL;
+
+	/* The payload length is chosen by the sender; never trust it. */
+	if (nla_len(attr) < help->helper->data_len)
+		return -EINVAL;
+
+	memcpy(&help->data, nla_data(attr), help->helper->data_len);
+	return 0;
+}
+
+/* to_nlattr() callback: dump the helper private data of @ct as a
+ * CTA_HELP_INFO attribute.  Nothing is emitted when the helper's data_len
+ * is zero.  Returns 0, or -ENOSPC if the attribute does not fit in @skb.
+ */
+static int
+nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	const struct nf_conn_help *help = nfct_help(ct);
+
+	if (help->helper->data_len == 0)
+		return 0;
+
+	if (nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+		return -ENOSPC;
+
+	return 0;
+}
+
+/* Attributes accepted inside one nested expectation policy entry. */
+static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = {
+	[NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING,
+				.len = NF_CT_HELPER_NAME_LEN-1 },
+	[NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, },
+	[NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, },
+};
+
+/* Parse one nested expectation policy from @attr into @expect_policy.
+ * The name, maximum and timeout attributes are all mandatory.
+ *
+ * Returns 0 on success, the nla_parse_nested() error, or -EINVAL when a
+ * mandatory attribute is missing.
+ */
+static int
+nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
+			    const struct nlattr *attr)
+{
+	struct nlattr *nested[NFCTH_POLICY_MAX+1];
+	const struct nlattr *name, *max, *timeout;
+	int rc;
+
+	rc = nla_parse_nested(nested, NFCTH_POLICY_MAX, attr,
+			      nfnl_cthelper_expect_pol);
+	if (rc < 0)
+		return rc;
+
+	name = nested[NFCTH_POLICY_NAME];
+	max = nested[NFCTH_POLICY_EXPECT_MAX];
+	timeout = nested[NFCTH_POLICY_EXPECT_TIMEOUT];
+	if (name == NULL || max == NULL || timeout == NULL)
+		return -EINVAL;
+
+	strncpy(expect_policy->name, nla_data(name), NF_CT_HELPER_NAME_LEN);
+	expect_policy->max_expected = ntohl(nla_get_be32(max));
+	expect_policy->timeout = ntohl(nla_get_be32(timeout));
+	return 0;
+}
+
+/* Attributes accepted directly inside a nested NFCTH_POLICY. */
+static const struct nla_policy
+nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = {
+	[NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, },
+};
+
+/* Parse the nested NFCTH_POLICY attribute @attr into a newly allocated
+ * expectation policy array and attach it to @helper.
+ *
+ * @helper is only modified on success: expect_class_max and expect_policy
+ * are committed together after every entry has been parsed, so a failed
+ * request never leaves a helper with a class count that disagrees with its
+ * policy array.  Any array previously stored in helper->expect_policy is
+ * not freed here; it stays the caller's responsibility.
+ *
+ * Returns 0 on success, -EINVAL if the set size or one of the announced
+ * entries is missing, -EOVERFLOW if more than NF_CT_MAX_EXPECT_CLASSES
+ * entries are announced, -ENOMEM on allocation failure, or the error
+ * reported while parsing a nested attribute.
+ *
+ * NOTE(review): a set size of zero is still accepted, as before -- confirm
+ * that the conntrack core copes with a helper that has no policy entries.
+ */
+static int
+nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
+				  const struct nlattr *attr)
+{
+	struct nf_conntrack_expect_policy *expect_policy;
+	struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+	unsigned int class_max, i;
+	int ret;
+
+	ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+			       nfnl_cthelper_expect_policy_set);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[NFCTH_POLICY_SET_NUM])
+		return -EINVAL;
+
+	/* Validate in a local first; @helper may be a live, registered one. */
+	class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+	if (class_max > NF_CT_MAX_EXPECT_CLASSES)
+		return -EOVERFLOW;
+
+	expect_policy = kcalloc(class_max, sizeof(*expect_policy), GFP_KERNEL);
+	if (expect_policy == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < class_max; i++) {
+		if (!tb[NFCTH_POLICY_SET+i]) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		ret = nfnl_cthelper_expect_policy(&expect_policy[i],
+						  tb[NFCTH_POLICY_SET+i]);
+		if (ret < 0)
+			goto err;
+	}
+
+	helper->expect_class_max = class_max;
+	helper->expect_policy = expect_policy;
+	return 0;
+err:
+	kfree(expect_policy);
+	return ret;
+}
+
+/* Allocate, fill in and register a new user-space conntrack helper.
+ *
+ * @tb:    top-level NFCTH_* attributes of the request; NFCTH_NAME,
+ *         NFCTH_TUPLE, NFCTH_POLICY and NFCTH_PRIV_DATA_LEN are mandatory
+ * @tuple: already parsed layer 3 / layer 4 protocol numbers
+ *
+ * Returns 0 on success, -EINVAL if a mandatory attribute is missing,
+ * -ENOMEM on allocation failure, or the error from policy parsing or
+ * helper registration.  On failure nothing stays allocated.
+ */
+static int
+nfnl_cthelper_create(const struct nlattr * const tb[],
+		     struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_helper *helper;
+	int ret;
+
+	/* NFCTH_NAME is dereferenced below, so check it here as well. */
+	if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] ||
+	    !tb[NFCTH_PRIV_DATA_LEN])
+		return -EINVAL;
+
+	helper = kzalloc(sizeof(*helper), GFP_KERNEL);
+	if (helper == NULL)
+		return -ENOMEM;
+
+	ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
+	if (ret < 0)
+		goto err;
+
+	strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+	helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+	helper->flags |= NF_CT_HELPER_F_USERSPACE;
+	memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple));
+
+	helper->me = THIS_MODULE;
+	helper->help = nfnl_userspace_cthelper;
+	helper->from_nlattr = nfnl_cthelper_from_nlattr;
+	helper->to_nlattr = nfnl_cthelper_to_nlattr;
+
+	/* Default to queue number zero, this can be updated at any time. */
+	if (tb[NFCTH_QUEUE_NUM])
+		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+	if (tb[NFCTH_STATUS]) {
+		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+		/* Unknown status values are ignored. */
+		switch(status) {
+		case NFCT_HELPER_STATUS_ENABLED:
+			helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+			break;
+		case NFCT_HELPER_STATUS_DISABLED:
+			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+			break;
+		}
+	}
+
+	ret = nf_conntrack_helper_register(helper);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	/* The helper was zero-allocated, so expect_policy is NULL unless the
+	 * policy was parsed successfully; kfree(NULL) is a no-op.
+	 */
+	kfree(helper->expect_policy);
+	kfree(helper);
+	return ret;
+}
+
+/* Apply the optional attributes in @tb to the existing helper @helper.
+ *
+ * Returns 0 on success, -EBUSY if the request carries NFCTH_PRIV_DATA_LEN
+ * (changing it on an existing helper is refused), or the error from
+ * parsing NFCTH_POLICY.
+ *
+ * NOTE(review): nfnl_cthelper_parse_expect_policy() stores a freshly
+ * allocated array in helper->expect_policy; the array it replaces does not
+ * appear to be freed on this path -- confirm whether that is a leak.
+ */
+static int
+nfnl_cthelper_update(const struct nlattr * const tb[],
+		     struct nf_conntrack_helper *helper)
+{
+	if (tb[NFCTH_PRIV_DATA_LEN])
+		return -EBUSY;
+
+	if (tb[NFCTH_POLICY]) {
+		int err = nfnl_cthelper_parse_expect_policy(helper,
+							    tb[NFCTH_POLICY]);
+
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFCTH_QUEUE_NUM])
+		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+	if (tb[NFCTH_STATUS]) {
+		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+		/* Any other status value leaves the flags untouched. */
+		if (status == NFCT_HELPER_STATUS_ENABLED)
+			helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+		else if (status == NFCT_HELPER_STATUS_DISABLED)
+			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+	}
+	return 0;
+}
+
+/* NFNL_MSG_CTHELPER_NEW handler: create a user-space helper, or update the
+ * existing one that has the same name and layer 3 / layer 4 protocol
+ * numbers.
+ *
+ * Returns -EINVAL if NFCTH_NAME or NFCTH_TUPLE is missing, the tuple
+ * parsing error, -EEXIST if a matching helper exists and NLM_F_EXCL was
+ * given, and otherwise the result of the create or update step.
+ */
+static int
+nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	struct nf_conntrack_helper *cur, *found = NULL;
+	struct nf_conntrack_tuple tuple;
+	const char *helper_name;
+	int ret, i;
+
+	if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
+		return -EINVAL;
+
+	helper_name = nla_data(tb[NFCTH_NAME]);
+
+	ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+	if (ret < 0)
+		return ret;
+
+	/* Look for the first user-space helper with this name and tuple. */
+	rcu_read_lock();
+	for (i = 0; i < nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE) ||
+			    strncmp(cur->name, helper_name,
+				    NF_CT_HELPER_NAME_LEN) != 0 ||
+			    tuple.src.l3num != cur->tuple.src.l3num ||
+			    tuple.dst.protonum != cur->tuple.dst.protonum)
+				continue;
+
+			found = cur;
+			break;
+		}
+		if (found)
+			break;
+	}
+	rcu_read_unlock();
+
+	if (found == NULL)
+		return nfnl_cthelper_create(tb, &tuple);
+
+	if (nlh->nlmsg_flags & NLM_F_EXCL)
+		return -EEXIST;
+
+	return nfnl_cthelper_update(tb, found);
+}
+
+/* Append a nested NFCTH_TUPLE attribute holding the layer 3 and layer 4
+ * protocol numbers of @helper to @skb.  Returns 0, or -1 if an attribute
+ * could not be added.
+ */
+static int
+nfnl_cthelper_dump_tuple(struct sk_buff *skb,
+			 struct nf_conntrack_helper *helper)
+{
+	struct nlattr *nest = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED);
+
+	if (!nest)
+		return -1;
+
+	if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM,
+			 htons(helper->tuple.src.l3num)) ||
+	    nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum))
+		return -1;
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* Append a nested NFCTH_POLICY attribute to @skb: the number of policy
+ * entries followed by one nested entry (name, maximum, timeout) per
+ * expectation class of @helper.  Returns 0, or -1 if an attribute could
+ * not be added.
+ */
+static int
+nfnl_cthelper_dump_policy(struct sk_buff *skb,
+			  struct nf_conntrack_helper *helper)
+{
+	struct nlattr *set, *entry;
+	int i;
+
+	set = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED);
+	if (!set)
+		return -1;
+
+	if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
+			 htonl(helper->expect_class_max)))
+		return -1;
+
+	for (i = 0; i < helper->expect_class_max; i++) {
+		const struct nf_conntrack_expect_policy *pol =
+						&helper->expect_policy[i];
+
+		entry = nla_nest_start(skb,
+				       (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
+		if (!entry)
+			return -1;
+
+		if (nla_put_string(skb, NFCTH_POLICY_NAME, pol->name) ||
+		    nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX,
+				 htonl(pol->max_expected)) ||
+		    nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT,
+				 htonl(pol->timeout)))
+			return -1;
+
+		nla_nest_end(skb, entry);
+	}
+	nla_nest_end(skb, set);
+	return 0;
+}
+
+/* Build one netlink message describing @helper in @skb: name, queue
+ * number, tuple, expectation policies, private data length and status.
+ *
+ * @portid, @seq: copied into the netlink header; a non-zero @portid also
+ *                sets NLM_F_MULTI
+ * @type:         not used
+ * @event:        message type, combined with the cthelper subsystem id
+ *
+ * Returns skb->len on success; on failure the partial message is cancelled
+ * and -1 is returned.
+ */
+static int
+nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+			int event, struct nf_conntrack_helper *helper)
+{
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+	int status;
+
+	event |= NFNL_SUBSYS_CTHELPER << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (!nlh)
+		goto cancel;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	status = (helper->flags & NF_CT_HELPER_F_CONFIGURED) ?
+			NFCT_HELPER_STATUS_ENABLED :
+			NFCT_HELPER_STATUS_DISABLED;
+
+	/* Attribute order is part of the emitted message; keep it. */
+	if (nla_put_string(skb, NFCTH_NAME, helper->name) ||
+	    nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num)) ||
+	    nfnl_cthelper_dump_tuple(skb, helper) < 0 ||
+	    nfnl_cthelper_dump_policy(skb, helper) < 0 ||
+	    nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len)) ||
+	    nla_put_be32(skb, NFCTH_STATUS, htonl(status)))
+		goto cancel;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+/* Netlink dump callback: emit one message per user-space helper.
+ *
+ * Resume state is kept in @cb: args[0] is the hash bucket to continue
+ * from, args[1] the helper whose message did not fit in the previous skb
+ * (0 if none).  While args[1] is set, entries are skipped until that
+ * helper is seen again.
+ *
+ * If the helper to resume from is no longer in its bucket, the bucket is
+ * dumped again from its start.  That retry must happen inside the bucket
+ * loop: once the loop has finished, args[0] equals nf_ct_helper_hsize and
+ * is no longer a valid index into nf_ct_helper_hash.
+ *
+ * Returns skb->len.
+ */
+static int
+nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nf_conntrack_helper *cur, *last;
+
+	rcu_read_lock();
+	last = (struct nf_conntrack_helper *)cb->args[1];
+	for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) {
+restart:
+		hlist_for_each_entry_rcu(cur,
+				&nf_ct_helper_hash[cb->args[0]], hnode) {
+
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			if (cb->args[1]) {
+				if (cur != last)
+					continue;
+				cb->args[1] = 0;
+			}
+			if (nfnl_cthelper_fill_info(skb,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+					    NFNL_MSG_CTHELPER_NEW, cur) < 0) {
+				/* skb is full: remember where to resume. */
+				cb->args[1] = (unsigned long)cur;
+				goto out;
+			}
+		}
+		/* Resume entry not found in this bucket: dump it from the
+		 * start instead of skipping everything.
+		 */
+		if (cb->args[1]) {
+			cb->args[1] = 0;
+			goto restart;
+		}
+	}
+out:
+	rcu_read_unlock();
+	return skb->len;
+}
+
+/* NFNL_MSG_CTHELPER_GET handler.
+ *
+ * With NLM_F_DUMP a dump of all user-space helpers is started.  Otherwise
+ * the first user-space helper matching the optional NFCTH_NAME and
+ * NFCTH_TUPLE filters is sent back to the requester in a unicast message.
+ *
+ * Returns 0 on success, -ENOENT if no helper matches (also when a tuple
+ * filter was given), -ENOMEM if the reply cannot be allocated or built,
+ * the tuple parsing error, or the netlink_unicast() error with -EAGAIN
+ * mapped to -ENOBUFS.
+ */
+static int
+nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret, i;
+	struct nf_conntrack_helper *cur;
+	struct sk_buff *skb2;
+	char *helper_name = NULL;
+	struct nf_conntrack_tuple tuple;
+	bool tuple_set = false;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nfnl_cthelper_dump_table,
+		};
+		return netlink_dump_start(nfnl, skb, nlh, &c);
+	}
+
+	if (tb[NFCTH_NAME])
+		helper_name = nla_data(tb[NFCTH_NAME]);
+
+	if (tb[NFCTH_TUPLE]) {
+		ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+		if (ret < 0)
+			return ret;
+
+		tuple_set = true;
+	}
+
+	for (i = 0; i < nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			if (helper_name && strncmp(cur->name, helper_name,
+						NF_CT_HELPER_NAME_LEN) != 0) {
+				continue;
+			}
+			if (tuple_set &&
+			    (tuple.src.l3num != cur->tuple.src.l3num ||
+			     tuple.dst.protonum != cur->tuple.dst.protonum))
+				continue;
+
+			/* First match: reply with it, or fail right away.
+			 * Returning here (rather than breaking out of the
+			 * inner loop only) stops the scan of later buckets.
+			 */
+			skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+			if (skb2 == NULL)
+				return -ENOMEM;
+
+			ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
+						nlh->nlmsg_seq,
+						NFNL_MSG_TYPE(nlh->nlmsg_type),
+						NFNL_MSG_CTHELPER_NEW, cur);
+			if (ret <= 0) {
+				kfree_skb(skb2);
+				return -ENOMEM;
+			}
+
+			ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+						MSG_DONTWAIT);
+			if (ret > 0)
+				ret = 0;
+
+			/* this avoids a loop in nfnetlink. */
+			return ret == -EAGAIN ? -ENOBUFS : ret;
+		}
+	}
+	/* Do not reuse 'ret' here: a successful tuple parse left it at 0. */
+	return -ENOENT;
+}
+
+/* NFNL_MSG_CTHELPER_DEL handler: unregister and free every user-space
+ * helper matching the optional NFCTH_NAME and NFCTH_TUPLE filters; with no
+ * filter, all user-space helpers are removed.
+ *
+ * Returns 0 if at least one helper was removed or if there are no
+ * user-space helpers at all, -ENOENT if helpers exist but none matched, or
+ * the tuple parsing error.
+ */
+static int
+nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	char *helper_name = NULL;
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *tmp;
+	struct nf_conntrack_tuple tuple;
+	bool tuple_set = false, found = false;
+	int i, j = 0, ret;
+
+	if (tb[NFCTH_NAME])
+		helper_name = nla_data(tb[NFCTH_NAME]);
+
+	if (tb[NFCTH_TUPLE]) {
+		ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+		if (ret < 0)
+			return ret;
+
+		tuple_set = true;
+	}
+
+	for (i = 0; i < nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
+								hnode) {
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			/* j counts user-space helpers, matching or not. */
+			j++;
+
+			if (helper_name && strncmp(cur->name, helper_name,
+						NF_CT_HELPER_NAME_LEN) != 0) {
+				continue;
+			}
+			if (tuple_set &&
+			    (tuple.src.l3num != cur->tuple.src.l3num ||
+			     tuple.dst.protonum != cur->tuple.dst.protonum))
+				continue;
+
+			found = true;
+			nf_conntrack_helper_unregister(cur);
+			/* User-space helpers and their policy arrays are
+			 * allocated by nfnl_cthelper_create(); release them
+			 * once the helper is unregistered.
+			 * NOTE(review): relies on the unregister call not
+			 * returning while the helper is still referenced --
+			 * confirm.
+			 */
+			kfree(cur->expect_policy);
+			kfree(cur);
+		}
+	}
+	/* Make sure we return success if we flush and there is no helpers */
+	return (found || j == 0) ? 0 : -ENOENT;
+}
+
+/* Validation policy for the top-level NFCTH_* attributes.  Every attribute
+ * the handlers read with a typed accessor or parse as a nest is listed, so
+ * malformed or truncated attributes are rejected before a handler runs.
+ */
+static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
+	[NFCTH_NAME] = { .type = NLA_NUL_STRING,
+			 .len = NF_CT_HELPER_NAME_LEN-1 },
+	[NFCTH_TUPLE] = { .type = NLA_NESTED, },
+	[NFCTH_QUEUE_NUM] = { .type = NLA_U32, },
+	[NFCTH_POLICY] = { .type = NLA_NESTED, },
+	[NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, },
+	[NFCTH_STATUS] = { .type = NLA_U32, },
+};
+
+/* Message type -> handler; all three handlers share the policy above. */
+static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
+	[NFNL_MSG_CTHELPER_NEW]		= { .call = nfnl_cthelper_new,
+					    .attr_count = NFCTH_MAX,
+					    .policy = nfnl_cthelper_policy },
+	[NFNL_MSG_CTHELPER_GET]		= { .call = nfnl_cthelper_get,
+					    .attr_count = NFCTH_MAX,
+					    .policy = nfnl_cthelper_policy },
+	[NFNL_MSG_CTHELPER_DEL]		= { .call = nfnl_cthelper_del,
+					    .attr_count = NFCTH_MAX,
+					    .policy = nfnl_cthelper_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
+	.name				= "cthelper",
+	.subsys_id			= NFNL_SUBSYS_CTHELPER,
+	.cb_count			= NFNL_MSG_CTHELPER_MAX,
+	.cb				= nfnl_cthelper_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER);
+
+/* Module init: register the cthelper subsystem with nfnetlink.
+ * Returns 0 on success or the negative error from the registration.
+ */
+static int __init nfnl_cthelper_init(void)
+{
+	int err = nfnetlink_subsys_register(&nfnl_cthelper_subsys);
+
+	if (err >= 0)
+		return 0;
+
+	pr_err("nfnl_cthelper: cannot register with nfnetlink.\n");
+	return err;
+}
+
+/* Module exit: unregister from nfnetlink, then unregister and free every
+ * user-space helper still present in the helper hash.
+ */
+static void __exit nfnl_cthelper_exit(void)
+{
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *tmp;
+	int i;
+
+	nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
+
+	for (i=0; i<nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
+									hnode) {
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			nf_conntrack_helper_unregister(cur);
+			/* User-space helpers and their policy arrays are
+			 * allocated by nfnl_cthelper_create(); nobody else
+			 * frees them once the module is gone.
+			 * NOTE(review): relies on the unregister call not
+			 * returning while the helper is still referenced --
+			 * confirm.
+			 */
+			kfree(cur->expect_policy);
+			kfree(cur);
+		}
+	}
+}
+
+module_init(nfnl_cthelper_init);
+module_exit(nfnl_cthelper_exit);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
new file mode 100644
index 00000000000..476accd1714
--- /dev/null
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -0,0 +1,585 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
+
+/* All named timeout policies known to this module. */
+static LIST_HEAD(cttimeout_list);
+
+/* Validation policy for the top-level CTA_TIMEOUT_* attributes. */
+static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
+	[CTA_TIMEOUT_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = CTNL_TIMEOUT_NAME_MAX - 1},
+	[CTA_TIMEOUT_L3PROTO]	= { .type = NLA_U16 },
+	[CTA_TIMEOUT_L4PROTO]	= { .type = NLA_U8 },
+	[CTA_TIMEOUT_DATA]	= { .type = NLA_NESTED },
+};
+
+/* Parse the nested, protocol specific timeout attributes in @attr into
+ * @timeouts using the callbacks of @l4proto.
+ *
+ * Returns 0 if @l4proto provides no nlattr_to_obj callback, the
+ * nla_parse_nested() error, or whatever the protocol callback returns.
+ */
+static int
+ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto,
+			  struct net *net, const struct nlattr *attr)
+{
+	if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) {
+		/* Table size depends on the protocol. */
+		struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1];
+		int err;
+
+		err = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
+				       attr, l4proto->ctnl_timeout.nla_policy);
+		if (err < 0)
+			return err;
+
+		return l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+	}
+	return 0;
+}
+
+/* IPCTNL_MSG_TIMEOUT_NEW handler: create a named timeout policy, or with
+ * NLM_F_REPLACE overwrite the timeouts of an existing policy of the same
+ * name and protocol.
+ *
+ * Returns 0 on success, -EINVAL if a mandatory attribute is missing or a
+ * replace targets a policy of a different protocol, -EEXIST if the name is
+ * taken and NLM_F_EXCL was given, -EBUSY if the name is taken and
+ * NLM_F_REPLACE was not given, -EOPNOTSUPP if the layer 4 protocol is not
+ * supported, -ENOMEM on allocation failure, or the policy parsing error.
+ *
+ * The l4proto reference taken here is kept only by a newly created policy;
+ * every other path, including a successful replace, drops it.
+ */
+static int
+cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
+		      const struct nlmsghdr *nlh,
+		      const struct nlattr * const cda[])
+{
+	__u16 l3num;
+	__u8 l4num;
+	struct nf_conntrack_l4proto *l4proto;
+	struct ctnl_timeout *timeout, *matching = NULL;
+	struct net *net = sock_net(skb->sk);
+	char *name;
+	int ret;
+
+	if (!cda[CTA_TIMEOUT_NAME] ||
+	    !cda[CTA_TIMEOUT_L3PROTO] ||
+	    !cda[CTA_TIMEOUT_L4PROTO] ||
+	    !cda[CTA_TIMEOUT_DATA])
+		return -EINVAL;
+
+	name = nla_data(cda[CTA_TIMEOUT_NAME]);
+	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+
+	list_for_each_entry(timeout, &cttimeout_list, head) {
+		if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+			continue;
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+
+		matching = timeout;
+		break;
+	}
+
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	/* This protocol is not supported, skip. */
+	if (l4proto->l4proto != l4num) {
+		ret = -EOPNOTSUPP;
+		goto err_proto_put;
+	}
+
+	if (matching) {
+		if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+			/* You cannot replace one timeout policy by another of
+			 * different kind, sorry.
+			 */
+			if (matching->l3num != l3num ||
+			    matching->l4proto->l4proto != l4num) {
+				ret = -EINVAL;
+				goto err_proto_put;
+			}
+
+			ret = ctnl_timeout_parse_policy(&matching->data,
+							l4proto, net,
+							cda[CTA_TIMEOUT_DATA]);
+			/* The existing policy holds its own l4proto
+			 * reference; drop the one taken above.
+			 */
+			goto err_proto_put;
+		}
+		ret = -EBUSY;
+		goto err_proto_put;
+	}
+
+	timeout = kzalloc(sizeof(struct ctnl_timeout) +
+			  l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
+	if (timeout == NULL) {
+		ret = -ENOMEM;
+		goto err_proto_put;
+	}
+
+	ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net,
+					cda[CTA_TIMEOUT_DATA]);
+	if (ret < 0)
+		goto err;
+
+	strcpy(timeout->name, name);
+	timeout->l3num = l3num;
+	timeout->l4proto = l4proto;
+	atomic_set(&timeout->refcnt, 1);
+	list_add_tail_rcu(&timeout->head, &cttimeout_list);
+
+	return 0;
+err:
+	kfree(timeout);
+err_proto_put:
+	nf_ct_l4proto_put(l4proto);
+	return ret;
+}
+
+/* Build one netlink message describing the timeout policy @timeout in
+ * @skb: name, layer 3 / layer 4 protocol, use count and, if the protocol
+ * provides an obj_to_nlattr callback, the nested timeout values.
+ *
+ * @portid, @seq: copied into the netlink header; a non-zero @portid also
+ *                sets NLM_F_MULTI
+ * @type:         not used
+ * @event:        message type, combined with the cttimeout subsystem id
+ *
+ * Returns skb->len on success; on failure the partial message is cancelled
+ * and -1 is returned.
+ */
+static int
+ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+		       int event, struct ctnl_timeout *timeout)
+{
+	struct nf_conntrack_l4proto *l4proto = timeout->l4proto;
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+
+	event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (!nlh)
+		goto cancel;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name))
+		goto cancel;
+	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)))
+		goto cancel;
+	if (nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto))
+		goto cancel;
+	if (nla_put_be32(skb, CTA_TIMEOUT_USE,
+			 htonl(atomic_read(&timeout->refcnt))))
+		goto cancel;
+
+	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+		struct nlattr *nest;
+
+		nest = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
+		if (!nest)
+			goto cancel;
+
+		if (l4proto->ctnl_timeout.obj_to_nlattr(skb,
+							&timeout->data) < 0)
+			goto cancel;
+
+		nla_nest_end(skb, nest);
+	}
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+/* Netlink dump callback: emit one message per timeout policy.
+ *
+ * Resume state is kept in @cb: args[1] is the policy whose message did not
+ * fit in the previous skb (0 if none) and args[2] is set once the whole
+ * list has been dumped.  Returns 0 when the dump is already complete,
+ * skb->len otherwise.
+ */
+static int
+ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ctnl_timeout *resume_at = (struct ctnl_timeout *)cb->args[1];
+	struct ctnl_timeout *cur;
+
+	if (cb->args[2])
+		return 0;
+
+	cb->args[1] = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, &cttimeout_list, head) {
+		/* Skip everything in front of the entry to resume from. */
+		if (resume_at && cur != resume_at)
+			continue;
+		resume_at = NULL;
+
+		if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid,
+					   cb->nlh->nlmsg_seq,
+					   NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+					   IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) {
+			/* skb is full: continue from this entry next time. */
+			cb->args[1] = (unsigned long)cur;
+			break;
+		}
+	}
+	if (cb->args[1] == 0)
+		cb->args[2] = 1;
+	rcu_read_unlock();
+	return skb->len;
+}
+
+/* IPCTNL_MSG_TIMEOUT_GET handler.
+ *
+ * With NLM_F_DUMP a dump of all timeout policies is started.  Otherwise
+ * the policy named by CTA_TIMEOUT_NAME is sent back to the requester in a
+ * unicast message.
+ *
+ * Returns 0 on success, -EINVAL if the name is missing, -ENOENT if no
+ * policy has that name, -ENOMEM if the reply cannot be allocated, the
+ * (non-positive) result of a failed fill, or the netlink_unicast() error
+ * with -EAGAIN mapped to -ENOBUFS.
+ */
+static int
+cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
+		      const struct nlmsghdr *nlh,
+		      const struct nlattr * const cda[])
+{
+	struct ctnl_timeout *cur, *match = NULL;
+	struct sk_buff *skb2;
+	char *name;
+	int ret;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = ctnl_timeout_dump,
+		};
+		return netlink_dump_start(ctnl, skb, nlh, &c);
+	}
+
+	if (!cda[CTA_TIMEOUT_NAME])
+		return -EINVAL;
+	name = nla_data(cda[CTA_TIMEOUT_NAME]);
+
+	list_for_each_entry(cur, &cttimeout_list, head) {
+		if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) == 0) {
+			match = cur;
+			break;
+		}
+	}
+	if (match == NULL)
+		return -ENOENT;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
+				     nlh->nlmsg_seq,
+				     NFNL_MSG_TYPE(nlh->nlmsg_type),
+				     IPCTNL_MSG_TIMEOUT_NEW, match);
+	if (ret <= 0) {
+		kfree_skb(skb2);
+		return ret;
+	}
+
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
+	if (ret > 0)
+		ret = 0;
+
+	/* this avoids a loop in nfnetlink. */
+	return ret == -EAGAIN ? -ENOBUFS : ret;
+}
+
+/* Try to delete @timeout.  The object is unlinked and released only if the
+ * reference dropped here was the last one; otherwise the reference is
+ * restored and -EBUSY is returned.  Returns 0 on deletion.
+ */
+static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
+{
+	/* we want to avoid races with nf_ct_timeout_find_get. */
+	if (!atomic_dec_and_test(&timeout->refcnt)) {
+		/* still in use, restore reference counter. */
+		atomic_inc(&timeout->refcnt);
+		return -EBUSY;
+	}
+
+	/* We are protected by nfnl mutex. */
+	list_del_rcu(&timeout->head);
+	nf_ct_l4proto_put(timeout->l4proto);
+	kfree_rcu(timeout, rcu_head);
+	return 0;
+}
+
+/* IPCTNL_MSG_TIMEOUT_DELETE handler.
+ *
+ * Without CTA_TIMEOUT_NAME every policy that is not in use is deleted and
+ * 0 is returned.  With a name, only that policy is deleted: returns 0 on
+ * success, -EBUSY if it is still in use, -ENOENT if no policy has that
+ * name.
+ */
+static int
+cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,
+		      const struct nlmsghdr *nlh,
+		      const struct nlattr * const cda[])
+{
+	char *name;
+	struct ctnl_timeout *cur, *tmp;
+
+	if (!cda[CTA_TIMEOUT_NAME]) {
+		/* ctnl_timeout_try_del() may unlink and release 'cur', so
+		 * the next entry has to be fetched before calling it.
+		 */
+		list_for_each_entry_safe(cur, tmp, &cttimeout_list, head)
+			ctnl_timeout_try_del(cur);
+
+		return 0;
+	}
+	name = nla_data(cda[CTA_TIMEOUT_NAME]);
+
+	list_for_each_entry(cur, &cttimeout_list, head) {
+		if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+			continue;
+
+		/* Only the first policy with this name is considered. */
+		return ctnl_timeout_try_del(cur);
+	}
+	return -ENOENT;
+}
+
+/* IPCTNL_MSG_TIMEOUT_DEFAULT_SET handler: overwrite the default timeouts
+ * of the given layer 3 / layer 4 protocol with the values in
+ * CTA_TIMEOUT_DATA.
+ *
+ * Returns 0 on success, -EINVAL if an attribute is missing, -EOPNOTSUPP if
+ * the protocol is not supported, or the policy parsing error.
+ */
+static int
+cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb,
+		      const struct nlmsghdr *nlh,
+		      const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(skb->sk);
+	struct nf_conntrack_l4proto *l4proto;
+	unsigned int *timeouts;
+	__u16 l3num;
+	__u8 l4num;
+	int ret;
+
+	if (!cda[CTA_TIMEOUT_L3PROTO] ||
+	    !cda[CTA_TIMEOUT_L4PROTO] ||
+	    !cda[CTA_TIMEOUT_DATA])
+		return -EINVAL;
+
+	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	/* This protocol is not supported, skip. */
+	if (l4proto->l4proto != l4num) {
+		ret = -EOPNOTSUPP;
+		goto out_put;
+	}
+
+	timeouts = l4proto->get_timeouts(net);
+
+	ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+					cda[CTA_TIMEOUT_DATA]);
+	/* Any non-negative result is reported as plain success. */
+	if (ret >= 0)
+		ret = 0;
+
+out_put:
+	nf_ct_l4proto_put(l4proto);
+	return ret;
+}
+
+/* Build one netlink message in @skb describing the default timeouts of
+ * @l4proto in @net: layer 3 / layer 4 protocol and, if the protocol
+ * provides an obj_to_nlattr callback, the nested timeout values.
+ *
+ * @portid, @seq: copied into the netlink header; a non-zero @portid also
+ *                sets NLM_F_MULTI
+ * @type:         not used
+ * @event:        message type, combined with the cttimeout subsystem id
+ *
+ * Returns skb->len on success; on failure the partial message is cancelled
+ * and -1 is returned.
+ */
+static int
+cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
+			    u32 seq, u32 type, int event,
+			    struct nf_conntrack_l4proto *l4proto)
+{
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+
+	event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (!nlh)
+		goto cancel;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)))
+		goto cancel;
+	if (nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
+		goto cancel;
+
+	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+		unsigned int *timeouts = l4proto->get_timeouts(net);
+		struct nlattr *nest;
+
+		nest = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
+		if (!nest)
+			goto cancel;
+
+		if (l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts) < 0)
+			goto cancel;
+
+		nla_nest_end(skb, nest);
+	}
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+/* IPCTNL_MSG_TIMEOUT_DEFAULT_GET handler: send the default timeouts of the
+ * given layer 3 / layer 4 protocol back to the requester in a unicast
+ * message.
+ *
+ * Returns 0 on success, -EINVAL if an attribute is missing, -EOPNOTSUPP if
+ * the protocol is not supported, -ENOMEM if the reply cannot be allocated
+ * or built, or the netlink_unicast() error with -EAGAIN mapped to
+ * -ENOBUFS.
+ *
+ * The l4proto reference taken for the lookup is dropped on every path,
+ * including success.
+ */
+static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb,
+				 const struct nlmsghdr *nlh,
+				 const struct nlattr * const cda[])
+{
+	__u16 l3num;
+	__u8 l4num;
+	struct nf_conntrack_l4proto *l4proto;
+	struct net *net = sock_net(skb->sk);
+	struct sk_buff *skb2;
+	int ret, err;
+
+	if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
+		return -EINVAL;
+
+	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	/* This protocol is not supported, skip. */
+	if (l4proto->l4proto != l4num) {
+		err = -EOPNOTSUPP;
+		goto out_put;
+	}
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL) {
+		err = -ENOMEM;
+		goto out_put;
+	}
+
+	ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
+					  nlh->nlmsg_seq,
+					  NFNL_MSG_TYPE(nlh->nlmsg_type),
+					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+					  l4proto);
+	if (ret <= 0) {
+		kfree_skb(skb2);
+		err = -ENOMEM;
+		goto out_put;
+	}
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	if (ret > 0)
+		ret = 0;
+
+	/* this avoids a loop in nfnetlink. */
+	err = (ret == -EAGAIN) ? -ENOBUFS : ret;
+out_put:
+	nf_ct_l4proto_put(l4proto);
+	return err;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+/* Look up the timeout policy called @name and take a reference on both
+ * the policy and this module.  Only the first policy with a matching name
+ * is considered.  Returns the policy, or NULL if there is none or a
+ * reference could not be taken.
+ */
+static struct ctnl_timeout *ctnl_timeout_find_get(const char *name)
+{
+	struct ctnl_timeout *cur, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, &cttimeout_list, head) {
+		if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+			continue;
+
+		/* Both references are taken, or none at all. */
+		if (try_module_get(THIS_MODULE)) {
+			if (atomic_inc_not_zero(&cur->refcnt))
+				found = cur;
+			else
+				module_put(THIS_MODULE);
+		}
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+/* Drop the policy and module references taken by ctnl_timeout_find_get(). */
+static void ctnl_timeout_put(struct ctnl_timeout *timeout)
+{
+	atomic_dec(&timeout->refcnt);
+	module_put(THIS_MODULE);
+}
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+
+static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { /* message type -> handler; every entry uses cttimeout_nla_policy */
+ [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+};
+
+static const struct nfnetlink_subsystem cttimeout_subsys = { /* registered in cttimeout_init(), unregistered in cttimeout_exit() */
+ .name = "conntrack_timeout",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT,
+ .cb_count = IPCTNL_MSG_TIMEOUT_MAX,
+ .cb = cttimeout_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
+
+/* Module init: register the cttimeout subsystem with nfnetlink and, when
+ * CONFIG_NF_CONNTRACK_TIMEOUT is enabled, publish the find/put hooks.
+ * Returns 0 on success or the negative error from the registration.
+ */
+static int __init cttimeout_init(void)
+{
+	int err = nfnetlink_subsys_register(&cttimeout_subsys);
+
+	if (err < 0) {
+		pr_err("cttimeout_init: cannot register cttimeout with "
+			"nfnetlink.\n");
+		return err;
+	}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+	RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
+	RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+	return 0;
+}
+
+/* Module exit: unregister from nfnetlink, release every timeout policy
+ * together with its l4proto reference and, when
+ * CONFIG_NF_CONNTRACK_TIMEOUT is enabled, clear the find/put hooks.
+ */
+static void __exit cttimeout_exit(void)
+{
+	struct ctnl_timeout *policy, *next;
+
+	pr_info("cttimeout: unregistering from nfnetlink.\n");
+
+	nfnetlink_subsys_unregister(&cttimeout_subsys);
+
+	/* We are sure that our objects have no clients at this point,
+	 * it's safe to release them all without checking refcnt.
+	 */
+	list_for_each_entry_safe(policy, next, &cttimeout_list, head) {
+		list_del_rcu(&policy->head);
+		nf_ct_l4proto_put(policy->l4proto);
+		kfree_rcu(policy, rcu_head);
+	}
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+	RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
+	RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+}
+
+module_init(cttimeout_init);
+module_exit(cttimeout_exit);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3b3c781b40c..d292c8d286e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -3,6 +3,7 @@
* nfetlink.
*
* (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the old ipv4-only ipt_ULOG.c:
* (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
@@ -10,16 +11,16 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
- *
*/
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
#include <linux/spinlock.h>
@@ -27,11 +28,13 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/list.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
+#include <linux/slab.h>
#include <net/sock.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nfnetlink_log.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#ifdef CONFIG_BRIDGE_NETFILTER
#include "../bridge/br_private.h"
@@ -40,18 +43,11 @@
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
+#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */
#define PRINTR(x, args...) do { if (net_ratelimit()) \
printk(x, ## args); } while (0);
-#if 0
-#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
- __FILE__, __LINE__, __FUNCTION__, \
- ## args)
-#else
-#define UDEBUG(x, ...)
-#endif
-
struct nfulnl_instance {
struct hlist_node hlist; /* global list of instances */
spinlock_t lock;
@@ -59,24 +55,37 @@ struct nfulnl_instance {
unsigned int qlen; /* number of nlmsgs in skb */
struct sk_buff *skb; /* pre-allocatd skb */
- struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
struct timer_list timer;
- int peer_pid; /* PID of the peer process */
+ struct net *net;
+ struct user_namespace *peer_user_ns; /* User namespace of the peer process */
+ int peer_portid; /* PORTID of the peer process */
/* configurable parameters */
unsigned int flushtimeout; /* timeout until queue flush */
unsigned int nlbufsiz; /* netlink buffer allocation size */
unsigned int qthreshold; /* threshold of the queue */
u_int32_t copy_range;
+ u_int32_t seq; /* instance-local sequential counter */
u_int16_t group_num; /* number of this queue */
- u_int8_t copy_mode;
+ u_int16_t flags;
+ u_int8_t copy_mode;
+ struct rcu_head rcu;
};
-static DEFINE_RWLOCK(instances_lock);
-
#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS];
-static unsigned int hash_init;
+
+static int nfnl_log_net_id __read_mostly;
+
+struct nfnl_log_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+ atomic_t global_seq;
+};
+
+static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_log_net_id);
+}
static inline u_int8_t instance_hashfn(u_int16_t group_num)
{
@@ -84,16 +93,13 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num)
}
static struct nfulnl_instance *
-__instance_lookup(u_int16_t group_num)
+__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
{
struct hlist_head *head;
- struct hlist_node *pos;
struct nfulnl_instance *inst;
- UDEBUG("entering (group_num=%u)\n", group_num);
-
- head = &instance_table[instance_hashfn(group_num)];
- hlist_for_each_entry(inst, pos, head, hlist) {
+ head = &log->instance_table[instance_hashfn(group_num)];
+ hlist_for_each_entry_rcu(inst, head, hlist) {
if (inst->group_num == group_num)
return inst;
}
@@ -107,134 +113,126 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(u_int16_t group_num)
+instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
{
struct nfulnl_instance *inst;
- read_lock_bh(&instances_lock);
- inst = __instance_lookup(group_num);
- if (inst)
- instance_get(inst);
- read_unlock_bh(&instances_lock);
+ rcu_read_lock_bh();
+ inst = __instance_lookup(log, group_num);
+ if (inst && !atomic_inc_not_zero(&inst->use))
+ inst = NULL;
+ rcu_read_unlock_bh();
return inst;
}
+static void nfulnl_instance_free_rcu(struct rcu_head *head)
+{
+ struct nfulnl_instance *inst =
+ container_of(head, struct nfulnl_instance, rcu);
+
+ put_net(inst->net);
+ kfree(inst);
+ module_put(THIS_MODULE);
+}
+
static void
instance_put(struct nfulnl_instance *inst)
{
- if (inst && atomic_dec_and_test(&inst->use)) {
- UDEBUG("kfree(inst=%p)\n", inst);
- kfree(inst);
- }
+ if (inst && atomic_dec_and_test(&inst->use))
+ call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
}
static void nfulnl_timer(unsigned long data);
static struct nfulnl_instance *
-instance_create(u_int16_t group_num, int pid)
+instance_create(struct net *net, u_int16_t group_num,
+ int portid, struct user_namespace *user_ns)
{
struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+ int err;
- UDEBUG("entering (group_num=%u, pid=%d)\n", group_num,
- pid);
-
- write_lock_bh(&instances_lock);
- if (__instance_lookup(group_num)) {
- inst = NULL;
- UDEBUG("aborting, instance already exists\n");
+ spin_lock_bh(&log->instances_lock);
+ if (__instance_lookup(log, group_num)) {
+ err = -EEXIST;
goto out_unlock;
}
inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
- if (!inst)
+ if (!inst) {
+ err = -ENOMEM;
goto out_unlock;
+ }
+
+ if (!try_module_get(THIS_MODULE)) {
+ kfree(inst);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
INIT_HLIST_NODE(&inst->hlist);
spin_lock_init(&inst->lock);
/* needs to be two, since we _put() after creation */
atomic_set(&inst->use, 2);
- init_timer(&inst->timer);
- inst->timer.function = nfulnl_timer;
- inst->timer.data = (unsigned long)inst;
- /* don't start timer yet. (re)start it with every packet */
+ setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
- inst->peer_pid = pid;
+ inst->net = get_net(net);
+ inst->peer_user_ns = user_ns;
+ inst->peer_portid = portid;
inst->group_num = group_num;
inst->qthreshold = NFULNL_QTHRESH_DEFAULT;
inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT;
inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT;
inst->copy_mode = NFULNL_COPY_PACKET;
- inst->copy_range = 0xffff;
-
- if (!try_module_get(THIS_MODULE))
- goto out_free;
+ inst->copy_range = NFULNL_COPY_RANGE_MAX;
- hlist_add_head(&inst->hlist,
- &instance_table[instance_hashfn(group_num)]);
+ hlist_add_head_rcu(&inst->hlist,
+ &log->instance_table[instance_hashfn(group_num)]);
- UDEBUG("newly added node: %p, next=%p\n", &inst->hlist,
- inst->hlist.next);
- write_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
return inst;
-out_free:
- instance_put(inst);
out_unlock:
- write_unlock_bh(&instances_lock);
- return NULL;
+ spin_unlock_bh(&log->instances_lock);
+ return ERR_PTR(err);
}
-static int __nfulnl_send(struct nfulnl_instance *inst);
+static void __nfulnl_flush(struct nfulnl_instance *inst);
+/* called with BH disabled */
static void
-_instance_destroy2(struct nfulnl_instance *inst, int lock)
+__instance_destroy(struct nfulnl_instance *inst)
{
/* first pull it out of the global list */
- if (lock)
- write_lock_bh(&instances_lock);
+ hlist_del_rcu(&inst->hlist);
- UDEBUG("removing instance %p (queuenum=%u) from hash\n",
- inst, inst->group_num);
+ /* then flush all pending packets from skb */
- hlist_del(&inst->hlist);
+ spin_lock(&inst->lock);
- if (lock)
- write_unlock_bh(&instances_lock);
+ /* lockless readers wont be able to use us */
+ inst->copy_mode = NFULNL_COPY_DISABLED;
- /* then flush all pending packets from skb */
-
- spin_lock_bh(&inst->lock);
- if (inst->skb) {
- if (inst->qlen)
- __nfulnl_send(inst);
- if (inst->skb) {
- kfree_skb(inst->skb);
- inst->skb = NULL;
- }
- }
- spin_unlock_bh(&inst->lock);
+ if (inst->skb)
+ __nfulnl_flush(inst);
+ spin_unlock(&inst->lock);
/* and finally put the refcount */
instance_put(inst);
-
- module_put(THIS_MODULE);
-}
-
-static inline void
-__instance_destroy(struct nfulnl_instance *inst)
-{
- _instance_destroy2(inst, 0);
}
static inline void
-instance_destroy(struct nfulnl_instance *inst)
+instance_destroy(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst)
{
- _instance_destroy2(inst, 1);
+ spin_lock_bh(&log->instances_lock);
+ __instance_destroy(inst);
+ spin_unlock_bh(&log->instances_lock);
}
static int
@@ -244,23 +242,20 @@ nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
int status = 0;
spin_lock_bh(&inst->lock);
-
+
switch (mode) {
case NFULNL_COPY_NONE:
case NFULNL_COPY_META:
inst->copy_mode = mode;
inst->copy_range = 0;
break;
-
+
case NFULNL_COPY_PACKET:
inst->copy_mode = mode;
- /* we're using struct nfattr which has 16bit nfa_len */
- if (range > 0xffff)
- inst->copy_range = 0xffff;
- else
- inst->copy_range = range;
+ inst->copy_range = min_t(unsigned int,
+ range, NFULNL_COPY_RANGE_MAX);
break;
-
+
default:
status = -EINVAL;
break;
@@ -310,31 +305,38 @@ nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
return 0;
}
-static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size,
- unsigned int pkt_size)
+static int
+nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
+{
+ spin_lock_bh(&inst->lock);
+ inst->flags = flags;
+ spin_unlock_bh(&inst->lock);
+
+ return 0;
+}
+
+static struct sk_buff *
+nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
+ unsigned int pkt_size)
{
struct sk_buff *skb;
unsigned int n;
- UDEBUG("entered (%u, %u)\n", inst_size, pkt_size);
-
/* alloc skb which should be big enough for a whole multipart
* message. WARNING: has to be <= 128k due to slab restrictions */
n = max(inst_size, pkt_size);
- skb = alloc_skb(n, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);
if (!skb) {
- PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
- inst_size);
-
if (n > pkt_size) {
/* try to allocate only as much as we need for current
* packet */
- skb = alloc_skb(pkt_size, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(net, pkt_size,
+ peer_portid, GFP_ATOMIC);
if (!skb)
- PRINTR("nfnetlink_log: can't even alloc %u "
- "bytes\n", pkt_size);
+ pr_err("nfnetlink_log: can't even alloc %u bytes\n",
+ pkt_size);
}
}
@@ -344,200 +346,245 @@ static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size,
static int
__nfulnl_send(struct nfulnl_instance *inst)
{
- int status;
-
- if (timer_pending(&inst->timer))
- del_timer(&inst->timer);
-
- if (inst->qlen > 1)
- inst->lastnlh->nlmsg_type = NLMSG_DONE;
-
- status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- UDEBUG("netlink_unicast() failed\n");
- /* FIXME: statistics */
+ int status = -1;
+
+ if (inst->qlen > 1) {
+ struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0,
+ NLMSG_DONE,
+ sizeof(struct nfgenmsg),
+ 0);
+ if (!nlh)
+ goto out;
}
+ status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
+ MSG_DONTWAIT);
inst->qlen = 0;
inst->skb = NULL;
- inst->lastnlh = NULL;
-
+out:
return status;
}
-static void nfulnl_timer(unsigned long data)
+static void
+__nfulnl_flush(struct nfulnl_instance *inst)
{
- struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
+ /* timer holds a reference */
+ if (del_timer(&inst->timer))
+ instance_put(inst);
+ if (inst->skb)
+ __nfulnl_send(inst);
+}
- UDEBUG("timer function called, flushing buffer\n");
+static void
+nfulnl_timer(unsigned long data)
+{
+ struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
spin_lock_bh(&inst->lock);
- __nfulnl_send(inst);
- instance_put(inst);
+ if (inst->skb)
+ __nfulnl_send(inst);
spin_unlock_bh(&inst->lock);
+ instance_put(inst);
}
-static inline int
-__build_packet_message(struct nfulnl_instance *inst,
- const struct sk_buff *skb,
+/* This is an inline function, we don't really care about a long
+ * list of arguments */
+static inline int
+__build_packet_message(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst,
+ const struct sk_buff *skb,
unsigned int data_len,
- unsigned int pf,
+ u_int8_t pf,
unsigned int hooknum,
const struct net_device *indev,
const struct net_device *outdev,
- const struct nf_loginfo *li,
- const char *prefix)
+ const char *prefix, unsigned int plen)
{
- unsigned char *old_tail;
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
- u_int32_t tmp_uint;
+ sk_buff_data_t old_tail = inst->skb->tail;
+ struct sock *sk;
+ const unsigned char *hwhdrp;
- UDEBUG("entered\n");
-
- old_tail = inst->skb->tail;
- nlh = NLMSG_PUT(inst->skb, 0, 0,
+ nlh = nlmsg_put(inst->skb, 0, 0,
NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
- sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
+ sizeof(struct nfgenmsg), 0);
+ if (!nlh)
+ return -1;
+ nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = pf;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(inst->group_num);
- pmsg.hw_protocol = htons(skb->protocol);
+ memset(&pmsg, 0, sizeof(pmsg));
+ pmsg.hw_protocol = skb->protocol;
pmsg.hook = hooknum;
- NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
+ if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg))
+ goto nla_put_failure;
- if (prefix) {
- int slen = strlen(prefix);
- if (slen > NFULNL_PREFIXLEN)
- slen = NFULNL_PREFIXLEN;
- NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix);
- }
+ if (prefix &&
+ nla_put(inst->skb, NFULA_PREFIX, plen, prefix))
+ goto nla_put_failure;
if (indev) {
- tmp_uint = htonl(indev->ifindex);
#ifndef CONFIG_BRIDGE_NETFILTER
- NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint),
- &tmp_uint);
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
#else
if (pf == PF_BRIDGE) {
/* Case 1: outdev is physical input device, we need to
* look for bridge group (when called from
* netfilter_bridge) */
- NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- sizeof(tmp_uint), &tmp_uint);
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ htonl(indev->ifindex)) ||
/* this is the bridge group "brX" */
- tmp_uint = htonl(indev->br_port->br->dev->ifindex);
- NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
- sizeof(tmp_uint), &tmp_uint);
+ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
+ goto nla_put_failure;
} else {
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
- NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
- sizeof(tmp_uint), &tmp_uint);
- if (skb->nf_bridge && skb->nf_bridge->physindev) {
- tmp_uint =
- htonl(skb->nf_bridge->physindev->ifindex);
- NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- sizeof(tmp_uint), &tmp_uint);
- }
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+ if (skb->nf_bridge && skb->nf_bridge->physindev &&
+ nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ htonl(skb->nf_bridge->physindev->ifindex)))
+ goto nla_put_failure;
}
#endif
}
if (outdev) {
- tmp_uint = htonl(outdev->ifindex);
#ifndef CONFIG_BRIDGE_NETFILTER
- NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint),
- &tmp_uint);
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
#else
if (pf == PF_BRIDGE) {
/* Case 1: outdev is physical output device, we need to
* look for bridge group (when called from
* netfilter_bridge) */
- NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
- sizeof(tmp_uint), &tmp_uint);
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ htonl(outdev->ifindex)) ||
/* this is the bridge group "brX" */
- tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
- NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
- sizeof(tmp_uint), &tmp_uint);
+ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
+ goto nla_put_failure;
} else {
/* Case 2: indev is a bridge group, we need to look
* for physical device (when called from ipv4) */
- NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
- sizeof(tmp_uint), &tmp_uint);
- if (skb->nf_bridge) {
- tmp_uint =
- htonl(skb->nf_bridge->physoutdev->ifindex);
- NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
- sizeof(tmp_uint), &tmp_uint);
- }
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+ if (skb->nf_bridge && skb->nf_bridge->physoutdev &&
+ nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ htonl(skb->nf_bridge->physoutdev->ifindex)))
+ goto nla_put_failure;
}
#endif
}
- if (skb->nfmark) {
- tmp_uint = htonl(skb->nfmark);
- NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint);
- }
+ if (skb->mark &&
+ nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark)))
+ goto nla_put_failure;
- if (indev && skb->dev && skb->dev->hard_header_parse) {
+ if (indev && skb->dev &&
+ skb->mac_header != skb->network_header) {
struct nfulnl_msg_packet_hw phw;
+ int len;
+
+ memset(&phw, 0, sizeof(phw));
+ len = dev_parse_header(skb, phw.hw_addr);
+ if (len > 0) {
+ phw.hw_addrlen = htons(len);
+ if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw))
+ goto nla_put_failure;
+ }
+ }
+
+ if (indev && skb_mac_header_was_set(skb)) {
+ if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) ||
+ nla_put_be16(inst->skb, NFULA_HWLEN,
+ htons(skb->dev->hard_header_len)))
+ goto nla_put_failure;
- phw.hw_addrlen =
- skb->dev->hard_header_parse((struct sk_buff *)skb,
- phw.hw_addr);
- phw.hw_addrlen = htons(phw.hw_addrlen);
- NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
+ hwhdrp = skb_mac_header(skb);
+
+ if (skb->dev->type == ARPHRD_SIT)
+ hwhdrp -= ETH_HLEN;
+
+ if (hwhdrp >= skb->head &&
+ nla_put(inst->skb, NFULA_HWHEADER,
+ skb->dev->hard_header_len, hwhdrp))
+ goto nla_put_failure;
}
- if (skb->tstamp.off_sec) {
+ if (skb->tstamp.tv64) {
struct nfulnl_msg_packet_timestamp ts;
+ struct timeval tv = ktime_to_timeval(skb->tstamp);
+ ts.sec = cpu_to_be64(tv.tv_sec);
+ ts.usec = cpu_to_be64(tv.tv_usec);
- ts.sec = cpu_to_be64(skb->tstamp.off_sec);
- ts.usec = cpu_to_be64(skb->tstamp.off_usec);
-
- NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
+ if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts))
+ goto nla_put_failure;
}
/* UID */
- if (skb->sk) {
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
- u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid);
- /* need to unlock here since NFA_PUT may goto */
- read_unlock_bh(&skb->sk->sk_callback_lock);
- NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid);
+ sk = skb->sk;
+ if (sk && sk->sk_state != TCP_TIME_WAIT) {
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ struct file *file = sk->sk_socket->file;
+ const struct cred *cred = file->f_cred;
+ struct user_namespace *user_ns = inst->peer_user_ns;
+ __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid));
+ __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid));
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (nla_put_be32(inst->skb, NFULA_UID, uid) ||
+ nla_put_be32(inst->skb, NFULA_GID, gid))
+ goto nla_put_failure;
} else
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
+ /* local sequence number */
+ if ((inst->flags & NFULNL_CFG_F_SEQ) &&
+ nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++)))
+ goto nla_put_failure;
+
+ /* global sequence number */
+ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
+ nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
+ htonl(atomic_inc_return(&log->global_seq))))
+ goto nla_put_failure;
+
if (data_len) {
- struct nfattr *nfa;
- int size = NFA_LENGTH(data_len);
+ struct nlattr *nla;
+ int size = nla_attr_size(data_len);
- if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) {
+ if (skb_tailroom(inst->skb) < nla_total_size(data_len)) {
printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
- goto nlmsg_failure;
+ return -1;
}
- nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size));
- nfa->nfa_type = NFULA_PAYLOAD;
- nfa->nfa_len = size;
+ nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len));
+ nla->nla_type = NFULA_PAYLOAD;
+ nla->nla_len = size;
- if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len))
+ if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
BUG();
}
-
+
nlh->nlmsg_len = inst->skb->tail - old_tail;
return 0;
-nlmsg_failure:
- UDEBUG("nlmsg_failure\n");
-nfattr_failure:
+nla_put_failure:
PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
return -1;
}
@@ -556,8 +603,9 @@ static struct nf_loginfo default_loginfo = {
};
/* log handler for internal netfilter logging api */
-static void
-nfulnl_log_packet(unsigned int pf,
+void
+nfulnl_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -569,146 +617,144 @@ nfulnl_log_packet(unsigned int pf,
struct nfulnl_instance *inst;
const struct nf_loginfo *li;
unsigned int qthreshold;
- unsigned int nlbufsiz;
+ unsigned int plen;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
- if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
+ if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(li->u.ulog.group);
+ inst = instance_lookup_get(log, li->u.ulog.group);
if (!inst)
- inst = instance_lookup_get(0);
- if (!inst) {
- PRINTR("nfnetlink_log: trying to log packet, "
- "but no instance for group %u\n", li->u.ulog.group);
return;
- }
- /* all macros expand to constant values at compile time */
+ plen = 0;
+ if (prefix)
+ plen = strlen(prefix) + 1;
+
/* FIXME: do we want to make the size calculation conditional based on
* what is actually present? way more branches and checks, but more
* memory efficient... */
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
- + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr))
- + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
- + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
#ifdef CONFIG_BRIDGE_NETFILTER
- + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
- + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
#endif
- + NFA_SPACE(sizeof(u_int32_t)) /* mark */
- + NFA_SPACE(sizeof(u_int32_t)) /* uid */
- + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */
- + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw))
- + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp));
-
- UDEBUG("initial size=%u\n", size);
+ + nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* uid */
+ + nla_total_size(sizeof(u_int32_t)) /* gid */
+ + nla_total_size(plen) /* prefix */
+ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw))
+ + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp));
+
+ if (in && skb_mac_header_was_set(skb)) {
+ size += nla_total_size(skb->dev->hard_header_len)
+ + nla_total_size(sizeof(u_int16_t)) /* hwtype */
+ + nla_total_size(sizeof(u_int16_t)); /* hwlen */
+ }
spin_lock_bh(&inst->lock);
+ if (inst->flags & NFULNL_CFG_F_SEQ)
+ size += nla_total_size(sizeof(u_int32_t));
+ if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
+ size += nla_total_size(sizeof(u_int32_t));
+
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
- if (qthreshold > li->u.ulog.qthreshold)
- qthreshold = li->u.ulog.qthreshold;
-
+ if (li->u.ulog.qthreshold)
+ if (qthreshold > li->u.ulog.qthreshold)
+ qthreshold = li->u.ulog.qthreshold;
+
+
switch (inst->copy_mode) {
case NFULNL_COPY_META:
case NFULNL_COPY_NONE:
data_len = 0;
break;
-
+
case NFULNL_COPY_PACKET:
- if (inst->copy_range == 0
+ if (inst->copy_range == 0
|| inst->copy_range > skb->len)
data_len = skb->len;
else
data_len = inst->copy_range;
-
- size += NFA_SPACE(data_len);
- UDEBUG("copy_packet, therefore size now %u\n", size);
+
+ size += nla_total_size(data_len);
break;
-
+
+ case NFULNL_COPY_DISABLED:
default:
- spin_unlock_bh(&inst->lock);
- instance_put(inst);
- return;
+ goto unlock_and_release;
}
- if (size > inst->nlbufsiz)
- nlbufsiz = size;
- else
- nlbufsiz = inst->nlbufsiz;
-
- if (!inst->skb) {
- if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
- UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
- inst->nlbufsiz, size);
- goto alloc_failure;
- }
- } else if (inst->qlen >= qthreshold ||
- size > skb_tailroom(inst->skb)) {
+ if (inst->skb &&
+ size > skb_tailroom(inst->skb) - sizeof(struct nfgenmsg)) {
/* either the queue len is too high or we don't have
* enough room in the skb left. flush to userspace. */
- UDEBUG("flushing old skb\n");
-
- __nfulnl_send(inst);
+ __nfulnl_flush(inst);
+ }
- if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
- UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
- inst->nlbufsiz, size);
+ if (!inst->skb) {
+ inst->skb = nfulnl_alloc_skb(net, inst->peer_portid,
+ inst->nlbufsiz, size);
+ if (!inst->skb)
goto alloc_failure;
- }
}
- UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold);
inst->qlen++;
- __build_packet_message(inst, skb, data_len, pf,
- hooknum, in, out, li, prefix);
+ __build_packet_message(log, inst, skb, data_len, pf,
+ hooknum, in, out, prefix, plen);
+ if (inst->qlen >= qthreshold)
+ __nfulnl_flush(inst);
/* timer_pending always called within inst->lock, so there
* is no chance of a race here */
- if (!timer_pending(&inst->timer)) {
+ else if (!timer_pending(&inst->timer)) {
instance_get(inst);
inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
add_timer(&inst->timer);
}
- spin_unlock_bh(&inst->lock);
+unlock_and_release:
+ spin_unlock_bh(&inst->lock);
+ instance_put(inst);
return;
alloc_failure:
- spin_unlock_bh(&inst->lock);
- instance_put(inst);
- UDEBUG("error allocating skb\n");
/* FIXME: statistics */
+ goto unlock_and_release;
}
+EXPORT_SYMBOL_GPL(nfulnl_log_packet);
static int
nfulnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct netlink_notify *n = ptr;
+ struct nfnl_log_net *log = nfnl_log_pernet(n->net);
- if (event == NETLINK_URELEASE &&
- n->protocol == NETLINK_NETFILTER && n->pid) {
+ if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
int i;
- /* destroy all instances for this pid */
- write_lock_bh(&instances_lock);
+ /* destroy all instances for this portid */
+ spin_lock_bh(&log->instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
- struct hlist_node *tmp, *t2;
+ struct hlist_node *t2;
struct nfulnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &log->instance_table[i];
- hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
- UDEBUG("node = %p\n", inst);
- if (n->pid == inst->peer_pid)
+ hlist_for_each_entry_safe(inst, t2, head, hlist) {
+ if (n->portid == inst->peer_portid)
__instance_destroy(inst);
}
}
- write_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
return NOTIFY_DONE;
}
@@ -719,60 +765,61 @@ static struct notifier_block nfulnl_rtnl_notifier = {
static int
nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
{
return -ENOTSUPP;
}
-static struct nf_logger nfulnl_logger = {
+static struct nf_logger nfulnl_logger __read_mostly = {
.name = "nfnetlink_log",
.logfn = &nfulnl_log_packet,
.me = THIS_MODULE,
};
-static const int nfula_min[NFULA_MAX] = {
- [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr),
- [NFULA_MARK-1] = sizeof(u_int32_t),
- [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp),
- [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t),
- [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
- [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw),
- [NFULA_PAYLOAD-1] = 0,
- [NFULA_PREFIX-1] = 0,
- [NFULA_UID-1] = sizeof(u_int32_t),
-};
-
-static const int nfula_cfg_min[NFULA_CFG_MAX] = {
- [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd),
- [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode),
- [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t),
- [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t),
- [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t),
+static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
+ [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) },
+ [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) },
+ [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 },
+ [NFULA_CFG_QTHRESH] = { .type = NLA_U32 },
+ [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 },
+ [NFULA_CFG_FLAGS] = { .type = NLA_U16 },
};
static int
nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp)
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfula[])
{
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int16_t group_num = ntohs(nfmsg->res_id);
struct nfulnl_instance *inst;
+ struct nfulnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
- UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+ if (nfula[NFULA_CFG_CMD]) {
+ u_int8_t pf = nfmsg->nfgen_family;
+ cmd = nla_data(nfula[NFULA_CFG_CMD]);
- if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) {
- UDEBUG("bad attribute size\n");
- return -EINVAL;
+ /* Commands without queue context */
+ switch (cmd->command) {
+ case NFULNL_CFG_CMD_PF_BIND:
+ return nf_log_bind_pf(net, pf, &nfulnl_logger);
+ case NFULNL_CFG_CMD_PF_UNBIND:
+ nf_log_unbind_pf(net, pf);
+ return 0;
+ }
}
- inst = instance_lookup_get(group_num);
- if (nfula[NFULA_CFG_CMD-1]) {
- u_int8_t pf = nfmsg->nfgen_family;
- struct nfulnl_msg_config_cmd *cmd;
- cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]);
- UDEBUG("found CFG_CMD for\n");
+ inst = instance_lookup_get(log, group_num);
+ if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
+ ret = -EPERM;
+ goto out_put;
+ }
+ if (cmd != NULL) {
switch (cmd->command) {
case NFULNL_CFG_CMD_BIND:
if (inst) {
@@ -780,98 +827,95 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
}
- inst = instance_create(group_num,
- NETLINK_CB(skb).pid);
- if (!inst) {
- ret = -EINVAL;
- goto out_put;
+ inst = instance_create(net, group_num,
+ NETLINK_CB(skb).portid,
+ sk_user_ns(NETLINK_CB(skb).sk));
+ if (IS_ERR(inst)) {
+ ret = PTR_ERR(inst);
+ goto out;
}
break;
case NFULNL_CFG_CMD_UNBIND:
if (!inst) {
ret = -ENODEV;
- goto out_put;
- }
-
- if (inst->peer_pid != NETLINK_CB(skb).pid) {
- ret = -EPERM;
- goto out_put;
+ goto out;
}
- instance_destroy(inst);
- break;
- case NFULNL_CFG_CMD_PF_BIND:
- UDEBUG("registering log handler for pf=%u\n", pf);
- ret = nf_log_register(pf, &nfulnl_logger);
- break;
- case NFULNL_CFG_CMD_PF_UNBIND:
- UDEBUG("unregistering log handler for pf=%u\n", pf);
- /* This is a bug and a feature. We cannot unregister
- * other handlers, like nfnetlink_inst can */
- nf_log_unregister_pf(pf);
- break;
+ instance_destroy(log, inst);
+ goto out_put;
default:
- ret = -EINVAL;
+ ret = -ENOTSUPP;
break;
}
- } else {
- if (!inst) {
- UDEBUG("no config command, and no instance for "
- "group=%u pid=%u =>ENOENT\n",
- group_num, NETLINK_CB(skb).pid);
- ret = -ENOENT;
- goto out_put;
- }
-
- if (inst->peer_pid != NETLINK_CB(skb).pid) {
- UDEBUG("no config command, and wrong pid\n");
- ret = -EPERM;
- goto out_put;
- }
}
- if (nfula[NFULA_CFG_MODE-1]) {
+ if (nfula[NFULA_CFG_MODE]) {
struct nfulnl_msg_config_mode *params;
- params = NFA_DATA(nfula[NFULA_CFG_MODE-1]);
+ params = nla_data(nfula[NFULA_CFG_MODE]);
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
nfulnl_set_mode(inst, params->copy_mode,
- ntohs(params->copy_range));
+ ntohl(params->copy_range));
}
- if (nfula[NFULA_CFG_TIMEOUT-1]) {
- u_int32_t timeout =
- *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]);
+ if (nfula[NFULA_CFG_TIMEOUT]) {
+ __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]);
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
nfulnl_set_timeout(inst, ntohl(timeout));
}
- if (nfula[NFULA_CFG_NLBUFSIZ-1]) {
- u_int32_t nlbufsiz =
- *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]);
+ if (nfula[NFULA_CFG_NLBUFSIZ]) {
+ __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]);
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
}
- if (nfula[NFULA_CFG_QTHRESH-1]) {
- u_int32_t qthresh =
- *(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]);
+ if (nfula[NFULA_CFG_QTHRESH]) {
+ __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]);
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
nfulnl_set_qthresh(inst, ntohl(qthresh));
}
+ if (nfula[NFULA_CFG_FLAGS]) {
+ __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]);
+
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+ nfulnl_set_flags(inst, ntohs(flags));
+ }
+
out_put:
instance_put(inst);
+out:
return ret;
}
-static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
+static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
[NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
.attr_count = NFULA_MAX, },
[NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
- .attr_count = NFULA_CFG_MAX, },
+ .attr_count = NFULA_CFG_MAX,
+ .policy = nfula_cfg_policy },
};
-static struct nfnetlink_subsystem nfulnl_subsys = {
+static const struct nfnetlink_subsystem nfulnl_subsys = {
.name = "log",
.subsys_id = NFNL_SUBSYS_ULOG,
.cb_count = NFULNL_MSG_MAX,
@@ -880,77 +924,88 @@ static struct nfnetlink_subsystem nfulnl_subsys = {
#ifdef CONFIG_PROC_FS
struct iter_state {
+ struct seq_net_private p;
unsigned int bucket;
};
-static struct hlist_node *get_first(struct seq_file *seq)
+static struct hlist_node *get_first(struct net *net, struct iter_state *st)
{
- struct iter_state *st = seq->private;
-
+ struct nfnl_log_net *log;
if (!st)
return NULL;
+ log = nfnl_log_pernet(net);
+
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return instance_table[st->bucket].first;
+ struct hlist_head *head = &log->instance_table[st->bucket];
+
+ if (!hlist_empty(head))
+ return rcu_dereference_bh(hlist_first_rcu(head));
}
return NULL;
}
-static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+static struct hlist_node *get_next(struct net *net, struct iter_state *st,
+ struct hlist_node *h)
{
- struct iter_state *st = seq->private;
-
- h = h->next;
+ h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
+ struct nfnl_log_net *log;
+ struct hlist_head *head;
+
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = instance_table[st->bucket].first;
+ log = nfnl_log_pernet(net);
+ head = &log->instance_table[st->bucket];
+ h = rcu_dereference_bh(hlist_first_rcu(head));
}
return h;
}
-static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
+ loff_t pos)
{
struct hlist_node *head;
- head = get_first(seq);
+ head = get_first(net, st);
if (head)
- while (pos && (head = get_next(seq, head)))
+ while (pos && (head = get_next(net, st, head)))
pos--;
return pos ? NULL : head;
}
-static void *seq_start(struct seq_file *seq, loff_t *pos)
+static void *seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(rcu_bh)
{
- read_lock_bh(&instances_lock);
- return get_idx(seq, *pos);
+ rcu_read_lock_bh();
+ return get_idx(seq_file_net(s), s->private, *pos);
}
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
(*pos)++;
- return get_next(s, v);
+ return get_next(seq_file_net(s), s->private, v);
}
static void seq_stop(struct seq_file *s, void *v)
+ __releases(rcu_bh)
{
- read_unlock_bh(&instances_lock);
+ rcu_read_unlock_bh();
}
static int seq_show(struct seq_file *s, void *v)
{
const struct nfulnl_instance *inst = v;
- return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
+ return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
inst->group_num,
- inst->peer_pid, inst->qlen,
+ inst->peer_portid, inst->qlen,
inst->copy_mode, inst->copy_range,
inst->flushtimeout, atomic_read(&inst->use));
}
-static struct seq_operations nful_seq_ops = {
+static const struct seq_operations nful_seq_ops = {
.start = seq_start,
.next = seq_next,
.stop = seq_stop,
@@ -959,91 +1014,91 @@ static struct seq_operations nful_seq_ops = {
static int nful_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- struct iter_state *is;
- int ret;
-
- is = kzalloc(sizeof(*is), GFP_KERNEL);
- if (!is)
- return -ENOMEM;
- ret = seq_open(file, &nful_seq_ops);
- if (ret < 0)
- goto out_free;
- seq = file->private_data;
- seq->private = is;
- return ret;
-out_free:
- kfree(is);
- return ret;
+ return seq_open_net(inode, file, &nful_seq_ops,
+ sizeof(struct iter_state));
}
-static struct file_operations nful_file_ops = {
+static const struct file_operations nful_file_ops = {
.owner = THIS_MODULE,
.open = nful_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
-static int
-init_or_cleanup(int init)
+static int __net_init nfnl_log_net_init(struct net *net)
{
- int i, status = -ENOMEM;
+ unsigned int i;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ INIT_HLIST_HEAD(&log->instance_table[i]);
+ spin_lock_init(&log->instances_lock);
+
#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *proc_nful;
+ if (!proc_create("nfnetlink_log", 0440,
+ net->nf.proc_netfilter, &nful_file_ops))
+ return -ENOMEM;
#endif
-
- if (!init)
- goto cleanup;
+ return 0;
+}
- for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
-
- /* it's not really all that important to have a random value, so
- * we can do this from the init function, even if there hasn't
- * been that much entropy yet */
- get_random_bytes(&hash_init, sizeof(hash_init));
+static void __net_exit nfnl_log_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
+#endif
+ nf_log_unset(net, &nfulnl_logger);
+}
+
+static struct pernet_operations nfnl_log_net_ops = {
+ .init = nfnl_log_net_init,
+ .exit = nfnl_log_net_exit,
+ .id = &nfnl_log_net_id,
+ .size = sizeof(struct nfnl_log_net),
+};
+
+static int __init nfnetlink_log_init(void)
+{
+ int status = -ENOMEM;
netlink_register_notifier(&nfulnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfulnl_subsys);
if (status < 0) {
- printk(KERN_ERR "log: failed to create netlink socket\n");
+ pr_err("log: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
-#ifdef CONFIG_PROC_FS
- proc_nful = create_proc_entry("nfnetlink_log", 0440,
- proc_net_netfilter);
- if (!proc_nful)
+ status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
+ if (status < 0) {
+ pr_err("log: failed to register logger\n");
goto cleanup_subsys;
- proc_nful->proc_fops = &nful_file_ops;
-#endif
+ }
+ status = register_pernet_subsys(&nfnl_log_net_ops);
+ if (status < 0) {
+ pr_err("log: failed to register pernet ops\n");
+ goto cleanup_logger;
+ }
return status;
-cleanup:
- nf_log_unregister_logger(&nfulnl_logger);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_log", proc_net_netfilter);
+cleanup_logger:
+ nf_log_unregister(&nfulnl_logger);
cleanup_subsys:
-#endif
nfnetlink_subsys_unregister(&nfulnl_subsys);
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfulnl_rtnl_notifier);
return status;
}
-static int __init init(void)
-{
-
- return init_or_cleanup(1);
-}
-
-static void __exit fini(void)
+static void __exit nfnetlink_log_fini(void)
{
- init_or_cleanup(0);
+ unregister_pernet_subsys(&nfnl_log_net_ops);
+ nf_log_unregister(&nfulnl_logger);
+ nfnetlink_subsys_unregister(&nfulnl_subsys);
+ netlink_unregister_notifier(&nfulnl_rtnl_notifier);
}
MODULE_DESCRIPTION("netfilter userspace logging");
@@ -1051,5 +1106,5 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
-module_init(init);
-module_exit(fini);
+module_init(nfnetlink_log_init);
+module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
deleted file mode 100644
index cac38b2e147..00000000000
--- a/net/netfilter/nfnetlink_queue.c
+++ /dev/null
@@ -1,1132 +0,0 @@
-/*
- * This is a module which is used for queueing packets and communicating with
- * userspace via nfetlink.
- *
- * (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * Based on the old ipv4-only ip_queue.c:
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/proc_fs.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_queue.h>
-#include <linux/list.h>
-#include <net/sock.h>
-
-#include <asm/atomic.h>
-
-#ifdef CONFIG_BRIDGE_NETFILTER
-#include "../bridge/br_private.h"
-#endif
-
-#define NFQNL_QMAX_DEFAULT 1024
-
-#if 0
-#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
- __FILE__, __LINE__, __FUNCTION__, \
- ## args)
-#else
-#define QDEBUG(x, ...)
-#endif
-
-struct nfqnl_queue_entry {
- struct list_head list;
- struct nf_info *info;
- struct sk_buff *skb;
- unsigned int id;
-};
-
-struct nfqnl_instance {
- struct hlist_node hlist; /* global list of queues */
- atomic_t use;
-
- int peer_pid;
- unsigned int queue_maxlen;
- unsigned int copy_range;
- unsigned int queue_total;
- unsigned int queue_dropped;
- unsigned int queue_user_dropped;
-
- atomic_t id_sequence; /* 'sequence' of pkt ids */
-
- u_int16_t queue_num; /* number of this queue */
- u_int8_t copy_mode;
-
- spinlock_t lock;
-
- struct list_head queue_list; /* packets in queue */
-};
-
-typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long);
-
-static DEFINE_RWLOCK(instances_lock);
-
-#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS];
-
-static inline u_int8_t instance_hashfn(u_int16_t queue_num)
-{
- return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
-}
-
-static struct nfqnl_instance *
-__instance_lookup(u_int16_t queue_num)
-{
- struct hlist_head *head;
- struct hlist_node *pos;
- struct nfqnl_instance *inst;
-
- head = &instance_table[instance_hashfn(queue_num)];
- hlist_for_each_entry(inst, pos, head, hlist) {
- if (inst->queue_num == queue_num)
- return inst;
- }
- return NULL;
-}
-
-static struct nfqnl_instance *
-instance_lookup_get(u_int16_t queue_num)
-{
- struct nfqnl_instance *inst;
-
- read_lock_bh(&instances_lock);
- inst = __instance_lookup(queue_num);
- if (inst)
- atomic_inc(&inst->use);
- read_unlock_bh(&instances_lock);
-
- return inst;
-}
-
-static void
-instance_put(struct nfqnl_instance *inst)
-{
- if (inst && atomic_dec_and_test(&inst->use)) {
- QDEBUG("kfree(inst=%p)\n", inst);
- kfree(inst);
- }
-}
-
-static struct nfqnl_instance *
-instance_create(u_int16_t queue_num, int pid)
-{
- struct nfqnl_instance *inst;
-
- QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid);
-
- write_lock_bh(&instances_lock);
- if (__instance_lookup(queue_num)) {
- inst = NULL;
- QDEBUG("aborting, instance already exists\n");
- goto out_unlock;
- }
-
- inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
- if (!inst)
- goto out_unlock;
-
- inst->queue_num = queue_num;
- inst->peer_pid = pid;
- inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
- inst->copy_range = 0xfffff;
- inst->copy_mode = NFQNL_COPY_NONE;
- atomic_set(&inst->id_sequence, 0);
- /* needs to be two, since we _put() after creation */
- atomic_set(&inst->use, 2);
- spin_lock_init(&inst->lock);
- INIT_LIST_HEAD(&inst->queue_list);
-
- if (!try_module_get(THIS_MODULE))
- goto out_free;
-
- hlist_add_head(&inst->hlist,
- &instance_table[instance_hashfn(queue_num)]);
-
- write_unlock_bh(&instances_lock);
-
- QDEBUG("successfully created new instance\n");
-
- return inst;
-
-out_free:
- kfree(inst);
-out_unlock:
- write_unlock_bh(&instances_lock);
- return NULL;
-}
-
-static void nfqnl_flush(struct nfqnl_instance *queue, int verdict);
-
-static void
-_instance_destroy2(struct nfqnl_instance *inst, int lock)
-{
- /* first pull it out of the global list */
- if (lock)
- write_lock_bh(&instances_lock);
-
- QDEBUG("removing instance %p (queuenum=%u) from hash\n",
- inst, inst->queue_num);
- hlist_del(&inst->hlist);
-
- if (lock)
- write_unlock_bh(&instances_lock);
-
- /* then flush all pending skbs from the queue */
- nfqnl_flush(inst, NF_DROP);
-
- /* and finally put the refcount */
- instance_put(inst);
-
- module_put(THIS_MODULE);
-}
-
-static inline void
-__instance_destroy(struct nfqnl_instance *inst)
-{
- _instance_destroy2(inst, 0);
-}
-
-static inline void
-instance_destroy(struct nfqnl_instance *inst)
-{
- _instance_destroy2(inst, 1);
-}
-
-
-
-static void
-issue_verdict(struct nfqnl_queue_entry *entry, int verdict)
-{
- QDEBUG("entering for entry %p, verdict %u\n", entry, verdict);
-
- /* TCP input path (and probably other bits) assume to be called
- * from softirq context, not from syscall, like issue_verdict is
- * called. TCP input path deadlocks with locks taken from timer
- * softirq, e.g. We therefore emulate this by local_bh_disable() */
-
- local_bh_disable();
- nf_reinject(entry->skb, entry->info, verdict);
- local_bh_enable();
-
- kfree(entry);
-}
-
-static inline void
-__enqueue_entry(struct nfqnl_instance *queue,
- struct nfqnl_queue_entry *entry)
-{
- list_add(&entry->list, &queue->queue_list);
- queue->queue_total++;
-}
-
-/*
- * Find and return a queued entry matched by cmpfn, or return the last
- * entry if cmpfn is NULL.
- */
-static inline struct nfqnl_queue_entry *
-__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
- unsigned long data)
-{
- struct list_head *p;
-
- list_for_each_prev(p, &queue->queue_list) {
- struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p;
-
- if (!cmpfn || cmpfn(entry, data))
- return entry;
- }
- return NULL;
-}
-
-static inline void
-__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry)
-{
- list_del(&entry->list);
- q->queue_total--;
-}
-
-static inline struct nfqnl_queue_entry *
-__find_dequeue_entry(struct nfqnl_instance *queue,
- nfqnl_cmpfn cmpfn, unsigned long data)
-{
- struct nfqnl_queue_entry *entry;
-
- entry = __find_entry(queue, cmpfn, data);
- if (entry == NULL)
- return NULL;
-
- __dequeue_entry(queue, entry);
- return entry;
-}
-
-
-static inline void
-__nfqnl_flush(struct nfqnl_instance *queue, int verdict)
-{
- struct nfqnl_queue_entry *entry;
-
- while ((entry = __find_dequeue_entry(queue, NULL, 0)))
- issue_verdict(entry, verdict);
-}
-
-static inline int
-__nfqnl_set_mode(struct nfqnl_instance *queue,
- unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch (mode) {
- case NFQNL_COPY_NONE:
- case NFQNL_COPY_META:
- queue->copy_mode = mode;
- queue->copy_range = 0;
- break;
-
- case NFQNL_COPY_PACKET:
- queue->copy_mode = mode;
- /* we're using struct nfattr which has 16bit nfa_len */
- if (range > 0xffff)
- queue->copy_range = 0xffff;
- else
- queue->copy_range = range;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static struct nfqnl_queue_entry *
-find_dequeue_entry(struct nfqnl_instance *queue,
- nfqnl_cmpfn cmpfn, unsigned long data)
-{
- struct nfqnl_queue_entry *entry;
-
- spin_lock_bh(&queue->lock);
- entry = __find_dequeue_entry(queue, cmpfn, data);
- spin_unlock_bh(&queue->lock);
-
- return entry;
-}
-
-static void
-nfqnl_flush(struct nfqnl_instance *queue, int verdict)
-{
- spin_lock_bh(&queue->lock);
- __nfqnl_flush(queue, verdict);
- spin_unlock_bh(&queue->lock);
-}
-
-static struct sk_buff *
-nfqnl_build_packet_message(struct nfqnl_instance *queue,
- struct nfqnl_queue_entry *entry, int *errp)
-{
- unsigned char *old_tail;
- size_t size;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct nfqnl_msg_packet_hdr pmsg;
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- struct nf_info *entinf = entry->info;
- struct sk_buff *entskb = entry->skb;
- struct net_device *indev;
- struct net_device *outdev;
- unsigned int tmp_uint;
-
- QDEBUG("entered\n");
-
- /* all macros expand to constant values at compile time */
- size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr))
- + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
- + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
-#ifdef CONFIG_BRIDGE_NETFILTER
- + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
- + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
-#endif
- + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */
- + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw))
- + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp));
-
- outdev = entinf->outdev;
-
- spin_lock_bh(&queue->lock);
-
- switch (queue->copy_mode) {
- case NFQNL_COPY_META:
- case NFQNL_COPY_NONE:
- data_len = 0;
- break;
-
- case NFQNL_COPY_PACKET:
- if (entskb->ip_summed == CHECKSUM_HW &&
- (*errp = skb_checksum_help(entskb,
- outdev == NULL))) {
- spin_unlock_bh(&queue->lock);
- return NULL;
- }
- if (queue->copy_range == 0
- || queue->copy_range > entskb->len)
- data_len = entskb->len;
- else
- data_len = queue->copy_range;
-
- size += NLMSG_SPACE(data_len);
- break;
-
- default:
- *errp = -EINVAL;
- spin_unlock_bh(&queue->lock);
- return NULL;
- }
-
- spin_unlock_bh(&queue->lock);
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail= skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0,
- NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
- sizeof(struct nfgenmsg));
- nfmsg = NLMSG_DATA(nlh);
- nfmsg->nfgen_family = entinf->pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(queue->queue_num);
-
- pmsg.packet_id = htonl(entry->id);
- pmsg.hw_protocol = htons(entskb->protocol);
- pmsg.hook = entinf->hook;
-
- NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
-
- indev = entinf->indev;
- if (indev) {
- tmp_uint = htonl(indev->ifindex);
-#ifndef CONFIG_BRIDGE_NETFILTER
- NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint);
-#else
- if (entinf->pf == PF_BRIDGE) {
- /* Case 1: indev is physical input device, we need to
- * look for bridge group (when called from
- * netfilter_bridge) */
- NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint),
- &tmp_uint);
- /* this is the bridge group "brX" */
- tmp_uint = htonl(indev->br_port->br->dev->ifindex);
- NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
- &tmp_uint);
- } else {
- /* Case 2: indev is bridge group, we need to look for
- * physical device (when called from ipv4) */
- NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
- &tmp_uint);
- if (entskb->nf_bridge
- && entskb->nf_bridge->physindev) {
- tmp_uint = htonl(entskb->nf_bridge->physindev->ifindex);
- NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV,
- sizeof(tmp_uint), &tmp_uint);
- }
- }
-#endif
- }
-
- if (outdev) {
- tmp_uint = htonl(outdev->ifindex);
-#ifndef CONFIG_BRIDGE_NETFILTER
- NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint);
-#else
- if (entinf->pf == PF_BRIDGE) {
- /* Case 1: outdev is physical output device, we need to
- * look for bridge group (when called from
- * netfilter_bridge) */
- NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint),
- &tmp_uint);
- /* this is the bridge group "brX" */
- tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
- NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
- &tmp_uint);
- } else {
- /* Case 2: outdev is bridge group, we need to look for
- * physical output device (when called from ipv4) */
- NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
- &tmp_uint);
- if (entskb->nf_bridge
- && entskb->nf_bridge->physoutdev) {
- tmp_uint = htonl(entskb->nf_bridge->physoutdev->ifindex);
- NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV,
- sizeof(tmp_uint), &tmp_uint);
- }
- }
-#endif
- }
-
- if (entskb->nfmark) {
- tmp_uint = htonl(entskb->nfmark);
- NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint);
- }
-
- if (indev && entskb->dev
- && entskb->dev->hard_header_parse) {
- struct nfqnl_msg_packet_hw phw;
-
- phw.hw_addrlen =
- entskb->dev->hard_header_parse(entskb,
- phw.hw_addr);
- phw.hw_addrlen = htons(phw.hw_addrlen);
- NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
- }
-
- if (entskb->tstamp.off_sec) {
- struct nfqnl_msg_packet_timestamp ts;
-
- ts.sec = cpu_to_be64(entskb->tstamp.off_sec);
- ts.usec = cpu_to_be64(entskb->tstamp.off_usec);
-
- NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
- }
-
- if (data_len) {
- struct nfattr *nfa;
- int size = NFA_LENGTH(data_len);
-
- if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) {
- printk(KERN_WARNING "nf_queue: no tailroom!\n");
- goto nlmsg_failure;
- }
-
- nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
- nfa->nfa_type = NFQA_PAYLOAD;
- nfa->nfa_len = size;
-
- if (skb_copy_bits(entskb, 0, NFA_DATA(nfa), data_len))
- BUG();
- }
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
-nfattr_failure:
- if (skb)
- kfree_skb(skb);
- *errp = -EINVAL;
- if (net_ratelimit())
- printk(KERN_ERR "nf_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
- unsigned int queuenum, void *data)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
- struct nfqnl_instance *queue;
- struct nfqnl_queue_entry *entry;
-
- QDEBUG("entered\n");
-
- queue = instance_lookup_get(queuenum);
- if (!queue) {
- QDEBUG("no queue instance matching\n");
- return -EINVAL;
- }
-
- if (queue->copy_mode == NFQNL_COPY_NONE) {
- QDEBUG("mode COPY_NONE, aborting\n");
- status = -EAGAIN;
- goto err_out_put;
- }
-
- entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
- if (entry == NULL) {
- if (net_ratelimit())
- printk(KERN_ERR
- "nf_queue: OOM in nfqnl_enqueue_packet()\n");
- status = -ENOMEM;
- goto err_out_put;
- }
-
- entry->info = info;
- entry->skb = skb;
- entry->id = atomic_inc_return(&queue->id_sequence);
-
- nskb = nfqnl_build_packet_message(queue, entry, &status);
- if (nskb == NULL)
- goto err_out_free;
-
- spin_lock_bh(&queue->lock);
-
- if (!queue->peer_pid)
- goto err_out_free_nskb;
-
- if (queue->queue_total >= queue->queue_maxlen) {
- queue->queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk(KERN_WARNING "ip_queue: full at %d entries, "
- "dropping packets(s). Dropped: %d\n",
- queue->queue_total, queue->queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* nfnetlink_unicast will either free the nskb or add it to a socket */
- status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue->queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __enqueue_entry(queue, entry);
-
- spin_unlock_bh(&queue->lock);
- instance_put(queue);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- spin_unlock_bh(&queue->lock);
-
-err_out_free:
- kfree(entry);
-err_out_put:
- instance_put(queue);
- return status;
-}
-
-static int
-nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e)
-{
- int diff;
-
- diff = data_len - e->skb->len;
- if (diff < 0)
- skb_trim(e->skb, data_len);
- else if (diff > 0) {
- if (data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- struct sk_buff *newskb;
-
- newskb = skb_copy_expand(e->skb,
- skb_headroom(e->skb),
- diff,
- GFP_ATOMIC);
- if (newskb == NULL) {
- printk(KERN_WARNING "ip_queue: OOM "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- if (e->skb->sk)
- skb_set_owner_w(newskb, e->skb->sk);
- kfree_skb(e->skb);
- e->skb = newskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(&e->skb, data_len))
- return -ENOMEM;
- memcpy(e->skb->data, data, data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
- return 0;
-}
-
-static inline int
-id_cmp(struct nfqnl_queue_entry *e, unsigned long id)
-{
- return (id == e->id);
-}
-
-static int
-nfqnl_set_mode(struct nfqnl_instance *queue,
- unsigned char mode, unsigned int range)
-{
- int status;
-
- spin_lock_bh(&queue->lock);
- status = __nfqnl_set_mode(queue, mode, range);
- spin_unlock_bh(&queue->lock);
-
- return status;
-}
-
-static int
-dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex)
-{
- struct nf_info *entinf = entry->info;
-
- if (entinf->indev)
- if (entinf->indev->ifindex == ifindex)
- return 1;
-
- if (entinf->outdev)
- if (entinf->outdev->ifindex == ifindex)
- return 1;
-
- return 0;
-}
-
-/* drop all packets with either indev or outdev == ifindex from all queue
- * instances */
-static void
-nfqnl_dev_drop(int ifindex)
-{
- int i;
-
- QDEBUG("entering for ifindex %u\n", ifindex);
-
- /* this only looks like we have to hold the readlock for a way too long
- * time, issue_verdict(), nf_reinject(), ... - but we always only
- * issue NF_DROP, which is processed directly in nf_reinject() */
- read_lock_bh(&instances_lock);
-
- for (i = 0; i < INSTANCE_BUCKETS; i++) {
- struct hlist_node *tmp;
- struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
-
- hlist_for_each_entry(inst, tmp, head, hlist) {
- struct nfqnl_queue_entry *entry;
- while ((entry = find_dequeue_entry(inst, dev_cmp,
- ifindex)) != NULL)
- issue_verdict(entry, NF_DROP);
- }
- }
-
- read_unlock_bh(&instances_lock);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static int
-nfqnl_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- nfqnl_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block nfqnl_dev_notifier = {
- .notifier_call = nfqnl_rcv_dev_event,
-};
-
-static int
-nfqnl_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE &&
- n->protocol == NETLINK_NETFILTER && n->pid) {
- int i;
-
- /* destroy all instances for this pid */
- write_lock_bh(&instances_lock);
- for (i = 0; i < INSTANCE_BUCKETS; i++) {
- struct hlist_node *tmp, *t2;
- struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
-
- hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
- if (n->pid == inst->peer_pid)
- __instance_destroy(inst);
- }
- }
- write_unlock_bh(&instances_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block nfqnl_rtnl_notifier = {
- .notifier_call = nfqnl_rcv_nl_event,
-};
-
-static const int nfqa_verdict_min[NFQA_MAX] = {
- [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr),
- [NFQA_MARK-1] = sizeof(u_int32_t),
- [NFQA_PAYLOAD-1] = 0,
-};
-
-static int
-nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
-{
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
-
- struct nfqnl_msg_verdict_hdr *vhdr;
- struct nfqnl_instance *queue;
- unsigned int verdict;
- struct nfqnl_queue_entry *entry;
- int err;
-
- if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) {
- QDEBUG("bad attribute size\n");
- return -EINVAL;
- }
-
- queue = instance_lookup_get(queue_num);
- if (!queue)
- return -ENODEV;
-
- if (queue->peer_pid != NETLINK_CB(skb).pid) {
- err = -EPERM;
- goto err_out_put;
- }
-
- if (!nfqa[NFQA_VERDICT_HDR-1]) {
- err = -EINVAL;
- goto err_out_put;
- }
-
- vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]);
- verdict = ntohl(vhdr->verdict);
-
- if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
- err = -EINVAL;
- goto err_out_put;
- }
-
- entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id));
- if (entry == NULL) {
- err = -ENOENT;
- goto err_out_put;
- }
-
- if (nfqa[NFQA_PAYLOAD-1]) {
- if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]),
- NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0)
- verdict = NF_DROP;
- }
-
- if (nfqa[NFQA_MARK-1])
- entry->skb->nfmark = ntohl(*(u_int32_t *)
- NFA_DATA(nfqa[NFQA_MARK-1]));
-
- issue_verdict(entry, verdict);
- instance_put(queue);
- return 0;
-
-err_out_put:
- instance_put(queue);
- return err;
-}
-
-static int
-nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
-{
- return -ENOTSUPP;
-}
-
-static const int nfqa_cfg_min[NFQA_CFG_MAX] = {
- [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd),
- [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params),
-};
-
-static struct nf_queue_handler nfqh = {
- .name = "nf_queue",
- .outfn = &nfqnl_enqueue_packet,
-};
-
-static int
-nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
-{
- struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
- struct nfqnl_instance *queue;
- int ret = 0;
-
- QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
-
- if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) {
- QDEBUG("bad attribute size\n");
- return -EINVAL;
- }
-
- queue = instance_lookup_get(queue_num);
- if (nfqa[NFQA_CFG_CMD-1]) {
- struct nfqnl_msg_config_cmd *cmd;
- cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]);
- QDEBUG("found CFG_CMD\n");
-
- switch (cmd->command) {
- case NFQNL_CFG_CMD_BIND:
- if (queue)
- return -EBUSY;
-
- queue = instance_create(queue_num, NETLINK_CB(skb).pid);
- if (!queue)
- return -EINVAL;
- break;
- case NFQNL_CFG_CMD_UNBIND:
- if (!queue)
- return -ENODEV;
-
- if (queue->peer_pid != NETLINK_CB(skb).pid) {
- ret = -EPERM;
- goto out_put;
- }
-
- instance_destroy(queue);
- break;
- case NFQNL_CFG_CMD_PF_BIND:
- QDEBUG("registering queue handler for pf=%u\n",
- ntohs(cmd->pf));
- ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh);
- break;
- case NFQNL_CFG_CMD_PF_UNBIND:
- QDEBUG("unregistering queue handler for pf=%u\n",
- ntohs(cmd->pf));
- /* This is a bug and a feature. We can unregister
- * other handlers(!) */
- ret = nf_unregister_queue_handler(ntohs(cmd->pf));
- break;
- default:
- ret = -EINVAL;
- break;
- }
- } else {
- if (!queue) {
- QDEBUG("no config command, and no instance ENOENT\n");
- ret = -ENOENT;
- goto out_put;
- }
-
- if (queue->peer_pid != NETLINK_CB(skb).pid) {
- QDEBUG("no config command, and wrong pid\n");
- ret = -EPERM;
- goto out_put;
- }
- }
-
- if (nfqa[NFQA_CFG_PARAMS-1]) {
- struct nfqnl_msg_config_params *params;
- params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]);
-
- nfqnl_set_mode(queue, params->copy_mode,
- ntohl(params->copy_range));
- }
-
-out_put:
- instance_put(queue);
- return ret;
-}
-
-static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
- [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX, },
- [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX, },
- [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX, },
-};
-
-static struct nfnetlink_subsystem nfqnl_subsys = {
- .name = "nf_queue",
- .subsys_id = NFNL_SUBSYS_QUEUE,
- .cb_count = NFQNL_MSG_MAX,
- .cb = nfqnl_cb,
-};
-
-#ifdef CONFIG_PROC_FS
-struct iter_state {
- unsigned int bucket;
-};
-
-static struct hlist_node *get_first(struct seq_file *seq)
-{
- struct iter_state *st = seq->private;
-
- if (!st)
- return NULL;
-
- for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return instance_table[st->bucket].first;
- }
- return NULL;
-}
-
-static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
-{
- struct iter_state *st = seq->private;
-
- h = h->next;
- while (!h) {
- if (++st->bucket >= INSTANCE_BUCKETS)
- return NULL;
-
- h = instance_table[st->bucket].first;
- }
- return h;
-}
-
-static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_node *head;
- head = get_first(seq);
-
- if (head)
- while (pos && (head = get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
-}
-
-static void *seq_start(struct seq_file *seq, loff_t *pos)
-{
- read_lock_bh(&instances_lock);
- return get_idx(seq, *pos);
-}
-
-static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- (*pos)++;
- return get_next(s, v);
-}
-
-static void seq_stop(struct seq_file *s, void *v)
-{
- read_unlock_bh(&instances_lock);
-}
-
-static int seq_show(struct seq_file *s, void *v)
-{
- const struct nfqnl_instance *inst = v;
-
- return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
- inst->queue_num,
- inst->peer_pid, inst->queue_total,
- inst->copy_mode, inst->copy_range,
- inst->queue_dropped, inst->queue_user_dropped,
- atomic_read(&inst->id_sequence),
- atomic_read(&inst->use));
-}
-
-static struct seq_operations nfqnl_seq_ops = {
- .start = seq_start,
- .next = seq_next,
- .stop = seq_stop,
- .show = seq_show,
-};
-
-static int nfqnl_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- struct iter_state *is;
- int ret;
-
- is = kzalloc(sizeof(*is), GFP_KERNEL);
- if (!is)
- return -ENOMEM;
- ret = seq_open(file, &nfqnl_seq_ops);
- if (ret < 0)
- goto out_free;
- seq = file->private_data;
- seq->private = is;
- return ret;
-out_free:
- kfree(is);
- return ret;
-}
-
-static struct file_operations nfqnl_file_ops = {
- .owner = THIS_MODULE,
- .open = nfqnl_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-#endif /* PROC_FS */
-
-static int
-init_or_cleanup(int init)
-{
- int i, status = -ENOMEM;
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *proc_nfqueue;
-#endif
-
- if (!init)
- goto cleanup;
-
- for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
-
- netlink_register_notifier(&nfqnl_rtnl_notifier);
- status = nfnetlink_subsys_register(&nfqnl_subsys);
- if (status < 0) {
- printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440,
- proc_net_netfilter);
- if (!proc_nfqueue)
- goto cleanup_subsys;
- proc_nfqueue->proc_fops = &nfqnl_file_ops;
-#endif
-
- register_netdevice_notifier(&nfqnl_dev_notifier);
-
- return status;
-
-cleanup:
- nf_unregister_queue_handlers(&nfqh);
- unregister_netdevice_notifier(&nfqnl_dev_notifier);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
-cleanup_subsys:
-#endif
- nfnetlink_subsys_unregister(&nfqnl_subsys);
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&nfqnl_rtnl_notifier);
- return status;
-}
-
-static int __init init(void)
-{
-
- return init_or_cleanup(1);
-}
-
-static void __exit fini(void)
-{
- init_or_cleanup(0);
-}
-
-MODULE_DESCRIPTION("netfilter packet queue handler");
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
-
-module_init(init);
-module_exit(fini);
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
new file mode 100644
index 00000000000..108120f216b
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -0,0 +1,1352 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2007 by Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nfnetlink_queue.h>
+
+#include <linux/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFQNL_QMAX_DEFAULT 1024
+
+/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
+ * includes the header length. Thus, the maximum packet length that we
+ * support is 65531 bytes. We send truncated packets if the specified length
+ * is larger than that. Userspace can check for presence of NFQA_CAP_LEN
+ * attribute to detect truncation.
+ */
+#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
+
+/*
+ * State of one queue bound by a userspace peer. Instances are linked into
+ * the per-netns hash table through @hlist and are released through @rcu
+ * (see call_rcu() in __instance_destroy()).
+ */
+struct nfqnl_instance {
+ struct hlist_node hlist; /* global list of queues */
+ struct rcu_head rcu; /* deferred destruction via call_rcu() */
+
+ int peer_portid; /* netlink portid the queue was created for */
+ unsigned int queue_maxlen; /* enqueue limit compared against queue_total */
+ unsigned int copy_range; /* payload byte limit used in NFQNL_COPY_PACKET mode */
+ unsigned int queue_dropped; /* bumped when a packet is refused because the queue is full */
+ unsigned int queue_user_dropped; /* bumped when nfnetlink_unicast() fails */
+
+
+ u_int16_t queue_num; /* number of this queue */
+ u_int8_t copy_mode; /* NFQNL_COPY_* value stored by nfqnl_set_mode() */
+ u_int32_t flags; /* Set using NFQA_CFG_FLAGS */
+/*
+ * Following fields are dirtied for each queued packet,
+ * keep them in same cache line if possible.
+ */
+ spinlock_t lock; /* taken (with _bh) around queue list and settings updates */
+ unsigned int queue_total; /* number of entries currently on queue_list */
+ unsigned int id_sequence; /* 'sequence' of pkt ids */
+ struct list_head queue_list; /* packets in queue */
+};
+
+typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
+
+static int nfnl_queue_net_id __read_mostly;
+
+/* Number of hash buckets in the per-netns instance table. */
+#define INSTANCE_BUCKETS 16
+/* Per network namespace state: the queue instance hash table and its lock. */
+struct nfnl_queue_net {
+ spinlock_t instances_lock; /* taken around insertions into / removals from instance_table */
+ struct hlist_head instance_table[INSTANCE_BUCKETS]; /* indexed by instance_hashfn() */
+};
+
+/* Fetch this module's private per-namespace area for @net. */
+static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
+{
+	struct nfnl_queue_net *q = net_generic(net, nfnl_queue_net_id);
+
+	return q;
+}
+
+/* Fold the high byte of @queue_num into the low byte and pick a bucket. */
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+	unsigned int folded = (queue_num >> 8) ^ queue_num;
+
+	return folded % INSTANCE_BUCKETS;
+}
+
+/*
+ * Find the instance for @queue_num in the hash table of @q.
+ * Walks the bucket with the RCU list iterator; returns NULL when there is
+ * no such queue.
+ */
+static struct nfqnl_instance *
+instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
+{
+	struct hlist_head *bucket = &q->instance_table[instance_hashfn(queue_num)];
+	struct nfqnl_instance *cur;
+
+	hlist_for_each_entry_rcu(cur, bucket, hlist)
+		if (cur->queue_num == queue_num)
+			return cur;
+
+	return NULL;
+}
+
+/*
+ * Allocate a new queue instance for @queue_num owned by @portid and insert
+ * it into the hash table of @q.
+ *
+ * Returns the new instance, or ERR_PTR(-EEXIST) if the queue number is
+ * already in use, ERR_PTR(-ENOMEM) on allocation failure, or
+ * ERR_PTR(-EAGAIN) if no module reference could be taken.
+ */
+static struct nfqnl_instance *
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num,
+		int portid)
+{
+	struct nfqnl_instance *inst;
+	int err;
+
+	spin_lock(&q->instances_lock);
+
+	err = -EEXIST;
+	if (instance_lookup(q, queue_num))
+		goto out_unlock;
+
+	/* Allocation happens under the spinlock, hence GFP_ATOMIC. */
+	err = -ENOMEM;
+	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
+	if (!inst)
+		goto out_unlock;
+
+	inst->queue_num = queue_num;
+	inst->peer_portid = portid;
+	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+	inst->copy_range = NFQNL_MAX_COPY_RANGE;
+	inst->copy_mode = NFQNL_COPY_NONE;
+	spin_lock_init(&inst->lock);
+	INIT_LIST_HEAD(&inst->queue_list);
+
+	/* Each live instance pins the module; dropped in instance_destroy_rcu(). */
+	err = -EAGAIN;
+	if (!try_module_get(THIS_MODULE))
+		goto out_free;
+
+	hlist_add_head_rcu(&inst->hlist,
+			   &q->instance_table[instance_hashfn(queue_num)]);
+
+	spin_unlock(&q->instances_lock);
+	return inst;
+
+out_free:
+	kfree(inst);
+out_unlock:
+	spin_unlock(&q->instances_lock);
+	return ERR_PTR(err);
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
+ unsigned long data);
+
+/*
+ * RCU callback scheduled by __instance_destroy(): drop every packet still
+ * queued, free the instance and release the module reference taken in
+ * instance_create().
+ */
+static void
+instance_destroy_rcu(struct rcu_head *head)
+{
+	struct nfqnl_instance *inst;
+
+	inst = container_of(head, struct nfqnl_instance, rcu);
+
+	nfqnl_flush(inst, NULL, 0);
+	kfree(inst);
+	module_put(THIS_MODULE);
+}
+
+/*
+ * Unlink @inst from the hash table and schedule its destruction through
+ * call_rcu(). Does no locking of its own.
+ */
+static void
+__instance_destroy(struct nfqnl_instance *inst)
+{
+	struct hlist_node *node = &inst->hlist;
+
+	hlist_del_rcu(node);
+	call_rcu(&inst->rcu, instance_destroy_rcu);
+}
+
+/* Locked wrapper around __instance_destroy(): takes q->instances_lock. */
+static void
+instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
+{
+	spinlock_t *table_lock = &q->instances_lock;
+
+	spin_lock(table_lock);
+	__instance_destroy(inst);
+	spin_unlock(table_lock);
+}
+
+/* Append @entry to the tail of @queue and account for it. No locking here. */
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+	queue->queue_total += 1;
+	list_add_tail(&entry->list, &queue->queue_list);
+}
+
+/* Unlink @entry from @queue and drop it from the count. No locking here. */
+static void
+__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+	queue->queue_total -= 1;
+	list_del(&entry->list);
+}
+
+/*
+ * Look up the queued entry with packet id @id and, if present, remove it
+ * from @queue. Runs under queue->lock. Returns the entry or NULL.
+ */
+static struct nf_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+{
+	struct nf_queue_entry *cur, *found = NULL;
+
+	spin_lock_bh(&queue->lock);
+
+	list_for_each_entry(cur, &queue->queue_list, list) {
+		if (cur->id != id)
+			continue;
+
+		/* Safe to unlink here: the walk stops right after. */
+		found = cur;
+		__dequeue_entry(queue, found);
+		break;
+	}
+
+	spin_unlock_bh(&queue->lock);
+
+	return found;
+}
+
+/*
+ * Drop queued packets from @queue. With a NULL @cmpfn every entry is
+ * dropped; otherwise only entries for which cmpfn(entry, data) is non-zero.
+ * Each dropped entry is reinjected with NF_DROP while queue->lock is held.
+ */
+static void
+nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
+{
+	struct nf_queue_entry *cur, *tmp;
+
+	spin_lock_bh(&queue->lock);
+	list_for_each_entry_safe(cur, tmp, &queue->queue_list, list) {
+		if (cmpfn && !cmpfn(cur, data))
+			continue;
+
+		list_del(&cur->list);
+		queue->queue_total--;
+		nf_reinject(cur, NF_DROP);
+	}
+	spin_unlock_bh(&queue->lock);
+}
+
+/*
+ * Emit the NFQA_SKB_INFO attribute describing checksum/GSO state of
+ * @packet into @nlskb. Nothing is emitted when no flag applies.
+ * Returns 0 or the nla_put_be32() result.
+ */
+static int
+nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
+		      bool csum_verify)
+{
+	__u32 flags;
+
+	if (packet->ip_summed == CHECKSUM_PARTIAL)
+		flags = NFQA_SKB_CSUMNOTREADY;
+	else
+		flags = csum_verify ? NFQA_SKB_CSUM_NOTVERIFIED : 0;
+
+	if (skb_is_gso(packet))
+		flags |= NFQA_SKB_GSO;
+
+	if (!flags)
+		return 0;
+
+	return nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags));
+}
+
+/*
+ * Add NFQA_UID/NFQA_GID (taken from the credentials of the file behind
+ * @sk's socket) to @skb. Sockets in TCP_TIME_WAIT, or without an attached
+ * socket/file, contribute nothing. Returns 0 on success, -1 if an
+ * attribute could not be added.
+ */
+static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
+{
+	const struct cred *cred;
+	int ret = 0;
+
+	if (sk->sk_state == TCP_TIME_WAIT)
+		return 0;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	if (!sk->sk_socket || !sk->sk_socket->file)
+		goto out;
+
+	cred = sk->sk_socket->file->f_cred;
+	if (nla_put_be32(skb, NFQA_UID,
+			 htonl(from_kuid_munged(&init_user_ns, cred->fsuid))) ||
+	    nla_put_be32(skb, NFQA_GID,
+			 htonl(from_kgid_munged(&init_user_ns, cred->fsgid))))
+		ret = -1;
+
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+	return ret;
+}
+
+/*
+ * Build the NFQNL_MSG_PACKET netlink message describing @entry for the
+ * listener of @queue.
+ *
+ * The function first computes an upper bound for the message size, then
+ * allocates the netlink skb and fills in the attributes. On success the
+ * new skb is returned and *@packet_id_ptr points at the packet_id field
+ * inside the message, which this function leaves for the caller to fill in.
+ * Returns NULL on any failure.
+ */
+static struct sk_buff *
+nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ struct nf_queue_entry *entry,
+ __be32 **packet_id_ptr)
+{
+ size_t size;
+ size_t data_len = 0, cap_len = 0;
+ unsigned int hlen = 0;
+ struct sk_buff *skb;
+ struct nlattr *nla;
+ struct nfqnl_msg_packet_hdr *pmsg;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct sk_buff *entskb = entry->skb;
+ struct net_device *indev;
+ struct net_device *outdev;
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
+ bool csum_verify;
+
+ /* Size of everything that may be emitted regardless of copy mode. */
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#endif
+ + nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ + nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ + nla_total_size(sizeof(u_int32_t)); /* cap_len */
+
+ if (entskb->tstamp.tv64)
+ size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
+
+ /* csum_verify later selects NFQA_SKB_CSUM_NOTVERIFIED in the skb info. */
+ if (entry->hook <= NF_INET_FORWARD ||
+ (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
+ csum_verify = !skb_csum_unnecessary(entskb);
+ else
+ csum_verify = false;
+
+ outdev = entry->outdev;
+
+ switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
+ case NFQNL_COPY_META:
+ case NFQNL_COPY_NONE:
+ break;
+
+ case NFQNL_COPY_PACKET:
+ /* Without the GSO flag, a partial checksum is completed first. */
+ if (!(queue->flags & NFQA_CFG_F_GSO) &&
+ entskb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(entskb))
+ return NULL;
+
+ data_len = ACCESS_ONCE(queue->copy_range);
+ if (data_len > entskb->len)
+ data_len = entskb->len;
+
+ /* Only hlen bytes are reserved in the new skb for the payload. */
+ hlen = skb_zerocopy_headlen(entskb);
+ hlen = min_t(unsigned int, hlen, data_len);
+ size += sizeof(struct nlattr) + hlen;
+ cap_len = entskb->len;
+ break;
+ }
+
+ /* nfqnl_ct_get() may grow size by the conntrack attribute size. */
+ if (queue->flags & NFQA_CFG_F_CONNTRACK)
+ ct = nfqnl_ct_get(entskb, &size, &ctinfo);
+
+ if (queue->flags & NFQA_CFG_F_UID_GID) {
+ size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ + nla_total_size(sizeof(u_int32_t))); /* gid */
+ }
+
+ skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+ GFP_ATOMIC);
+ if (!skb) {
+ skb_tx_error(entskb);
+ return NULL;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0,
+ NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+ sizeof(struct nfgenmsg), 0);
+ if (!nlh) {
+ skb_tx_error(entskb);
+ kfree_skb(skb);
+ return NULL;
+ }
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = entry->pf;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(queue->queue_num);
+
+ /* packet_id is not set here; the caller writes it via *packet_id_ptr. */
+ nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
+ pmsg = nla_data(nla);
+ pmsg->hw_protocol = entskb->protocol;
+ pmsg->hook = entry->hook;
+ *packet_id_ptr = &pmsg->packet_id;
+
+ indev = entry->indev;
+ if (indev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (entry->pf == PF_BRIDGE) {
+ /* Case 1: indev is physical input device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
+ htonl(indev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
+ nla_put_be32(skb, NFQA_IFINDEX_INDEV,
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ /* Case 2: indev is bridge group, we need to look for
+ * physical device (when called from ipv4) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+ if (entskb->nf_bridge && entskb->nf_bridge->physindev &&
+ nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
+ htonl(entskb->nf_bridge->physindev->ifindex)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (outdev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (entry->pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical output device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ htonl(outdev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
+ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ /* Case 2: outdev is bridge group, we need to look for
+ * physical output device (when called from ipv4) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+ if (entskb->nf_bridge && entskb->nf_bridge->physoutdev &&
+ nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ htonl(entskb->nf_bridge->physoutdev->ifindex)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ /* A zero mark is not reported. */
+ if (entskb->mark &&
+ nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
+ goto nla_put_failure;
+
+ if (indev && entskb->dev &&
+ entskb->mac_header != entskb->network_header) {
+ struct nfqnl_msg_packet_hw phw;
+ int len;
+
+ memset(&phw, 0, sizeof(phw));
+ len = dev_parse_header(entskb, phw.hw_addr);
+ if (len) {
+ phw.hw_addrlen = htons(len);
+ if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw))
+ goto nla_put_failure;
+ }
+ }
+
+ if (entskb->tstamp.tv64) {
+ struct nfqnl_msg_packet_timestamp ts;
+ struct timeval tv = ktime_to_timeval(entskb->tstamp);
+ ts.sec = cpu_to_be64(tv.tv_sec);
+ ts.usec = cpu_to_be64(tv.tv_usec);
+
+ if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
+ goto nla_put_failure;
+ }
+
+ if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk &&
+ nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
+ goto nla_put_failure;
+
+ if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ goto nla_put_failure;
+
+ /* NFQA_CAP_LEN is only sent when the copied payload is truncated. */
+ if (cap_len > data_len &&
+ nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
+ goto nla_put_failure;
+
+ if (nfqnl_put_packet_info(skb, entskb, csum_verify))
+ goto nla_put_failure;
+
+ if (data_len) {
+ /* Shadows the outer nla; this one is the payload attribute header. */
+ struct nlattr *nla;
+
+ if (skb_tailroom(skb) < sizeof(*nla) + hlen)
+ goto nla_put_failure;
+
+ nla = (struct nlattr *)skb_put(skb, sizeof(*nla));
+ nla->nla_type = NFQA_PAYLOAD;
+ nla->nla_len = nla_attr_size(data_len);
+
+ if (skb_zerocopy(skb, entskb, data_len, hlen))
+ goto nla_put_failure;
+ }
+
+ nlh->nlmsg_len = skb->len;
+ return skb;
+
+nla_put_failure:
+ skb_tx_error(entskb);
+ kfree_skb(skb);
+ net_err_ratelimited("nf_queue: error creating packet message\n");
+ return NULL;
+}
+
+/*
+ * Build the netlink message for @entry, send it to the queue's peer and
+ * put @entry on the queue.
+ *
+ * Returns 0 when the entry was queued, or when the queue was full and
+ * NFQA_CFG_F_FAIL_OPEN caused the packet to be reinjected with NF_ACCEPT.
+ * Returns -ENOMEM if the message could not be built, -ENOBUFS if the queue
+ * is full, or the negative nfnetlink_unicast() result.
+ */
+static int
+__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
+		       struct nf_queue_entry *entry)
+{
+	struct sk_buff *nskb;
+	__be32 *packet_id_ptr;
+	bool failopen = false;
+	int err;
+
+	nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);
+	if (!nskb)
+		return -ENOMEM;
+
+	spin_lock_bh(&queue->lock);
+
+	if (queue->queue_total >= queue->queue_maxlen) {
+		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+			failopen = true;
+			err = 0;
+		} else {
+			err = -ENOBUFS;
+			queue->queue_dropped++;
+			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
+					     queue->queue_total);
+		}
+		kfree_skb(nskb);
+		goto out_unlock;
+	}
+
+	/* The id is assigned under the lock and patched into the message. */
+	entry->id = ++queue->id_sequence;
+	*packet_id_ptr = htonl(entry->id);
+
+	/* nfnetlink_unicast will either free the nskb or add it to a socket */
+	err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+	if (err < 0) {
+		queue->queue_user_dropped++;
+		goto out_unlock;
+	}
+
+	__enqueue_entry(queue, entry);
+	err = 0;
+
+out_unlock:
+	spin_unlock_bh(&queue->lock);
+	/* Reinjection is done only after the queue lock has been released. */
+	if (failopen)
+		nf_reinject(entry, NF_ACCEPT);
+	return err;
+}
+
+/*
+ * Duplicate @e (e->size bytes) and take references for the copy.
+ * Returns the copy, or NULL if allocation or reference grabbing fails.
+ */
+static struct nf_queue_entry *
+nf_queue_entry_dup(struct nf_queue_entry *e)
+{
+	struct nf_queue_entry *copy = kmemdup(e, e->size, GFP_ATOMIC);
+
+	if (!copy)
+		return NULL;
+
+	if (!nf_queue_entry_get_refs(copy)) {
+		kfree(copy);
+		return NULL;
+	}
+
+	return copy;
+}
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+/* When called from bridge netfilter, skb->data must point to MAC header
+ * before calling skb_gso_segment(). Else, original MAC header is lost
+ * and segmented skbs will be sent to wrong destination.
+ */
+static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
+{
+	if (!skb->nf_bridge)
+		return;
+
+	__skb_push(skb, skb->network_header - skb->mac_header);
+}
+
+/* Inverse of nf_bridge_adjust_skb_data(): pull the MAC header again. */
+static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
+{
+	if (!skb->nf_bridge)
+		return;
+
+	__skb_pull(skb, skb->network_header - skb->mac_header);
+}
+#else
+#define nf_bridge_adjust_skb_data(s) do {} while (0)
+#define nf_bridge_adjust_segmented_data(s) do {} while (0)
+#endif
+
+/* Release the references held by @entry, then free the entry itself. */
+static void free_entry(struct nf_queue_entry *entry)
+{
+	struct nf_queue_entry *victim = entry;
+
+	nf_queue_entry_release_refs(victim);
+	kfree(victim);
+}
+
+/*
+ * Queue one segment @skb of a segmented GSO packet.
+ *
+ * The last segment (skb->next == NULL) reuses @entry itself; on failure
+ * the original skb pointer is restored in @entry. Every other segment is
+ * queued through a duplicate of @entry, which is freed again on failure.
+ * Returns 0 on success, -ENOMEM if the entry could not be duplicated, or
+ * the __nfqnl_enqueue_packet() error.
+ */
+static int
+__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
+			   struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+	struct nf_queue_entry *entry_seg;
+	int ret;
+
+	nf_bridge_adjust_segmented_data(skb);
+
+	if (!skb->next) { /* last packet, no need to copy entry */
+		struct sk_buff *gso_skb = entry->skb;
+
+		entry->skb = skb;
+		ret = __nfqnl_enqueue_packet(net, queue, entry);
+		if (ret)
+			entry->skb = gso_skb;
+		return ret;
+	}
+
+	skb->next = NULL;
+
+	entry_seg = nf_queue_entry_dup(entry);
+	if (!entry_seg)
+		return -ENOMEM;
+
+	entry_seg->skb = skb;
+	ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
+	if (ret)
+		free_entry(entry_seg);
+
+	return ret;
+}
+
+/*
+ * nf_queue handler entry point: hand @entry to the queue @queuenum.
+ *
+ * Returns -ESRCH if no such queue exists, -EINVAL if the queue is still in
+ * NFQNL_COPY_NONE mode, otherwise the result of queueing. GSO packets are
+ * segmented in software unless the queue has NFQA_CFG_F_GSO set; if at
+ * least one segment was queued the function reports success and releases
+ * the original skb (and, on partial failure, @entry).
+ */
+static int
+nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+	unsigned int queued;
+	struct nfqnl_instance *queue;
+	struct sk_buff *skb, *segs;
+	int err = -ENOBUFS;
+	struct net *net = dev_net(entry->indev ?
+				  entry->indev : entry->outdev);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+	/* rcu_read_lock()ed by nf_hook_slow() */
+	queue = instance_lookup(q, queuenum);
+	if (!queue)
+		return -ESRCH;
+
+	if (queue->copy_mode == NFQNL_COPY_NONE)
+		return -EINVAL;
+
+	skb = entry->skb;
+
+	switch (entry->pf) {
+	case NFPROTO_IPV4:
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case NFPROTO_IPV6:
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	}
+
+	if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
+		return __nfqnl_enqueue_packet(net, queue, entry);
+
+	nf_bridge_adjust_skb_data(skb);
+	segs = skb_gso_segment(skb, 0);
+	/* Does not use PTR_ERR to limit the number of error codes that can be
+	 * returned by nf_queue. For instance, callers rely on -ECANCELED to
+	 * mean 'ignore this hook'.
+	 *
+	 * skb_gso_segment() may also hand back NULL; the loop below
+	 * dereferences segs unconditionally, so treat NULL as failure too.
+	 */
+	if (IS_ERR_OR_NULL(segs))
+		goto out_err;
+	queued = 0;
+	err = 0;
+	do {
+		struct sk_buff *nskb = segs->next;
+
+		/* After the first failure the remaining segments are freed. */
+		if (err == 0)
+			err = __nfqnl_enqueue_packet_gso(net, queue,
+							 segs, entry);
+		if (err == 0)
+			queued++;
+		else
+			kfree_skb(segs);
+		segs = nskb;
+	} while (segs);
+
+	if (queued) {
+		if (err) /* some segments are already queued */
+			free_entry(entry);
+		kfree_skb(skb);
+		return 0;
+	}
+ out_err:
+	nf_bridge_adjust_segmented_data(skb);
+	return err;
+}
+
+/*
+ * Replace the payload of e->skb with @data (@data_len bytes). @diff is the
+ * new length minus the current skb length: a negative value trims the skb,
+ * a positive one grows it (reallocating when the tailroom is too small).
+ * Returns 0, -EINVAL when growing beyond 0xFFFF bytes, or -ENOMEM.
+ */
+static int
+nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+{
+	struct sk_buff *grown;
+
+	if (diff < 0 && pskb_trim(e->skb, data_len))
+		return -ENOMEM;
+
+	if (diff > 0) {
+		if (data_len > 0xFFFF)
+			return -EINVAL;
+
+		if (diff > skb_tailroom(e->skb)) {
+			grown = skb_copy_expand(e->skb, skb_headroom(e->skb),
+						diff, GFP_ATOMIC);
+			if (!grown) {
+				printk(KERN_WARNING "nf_queue: OOM "
+				       "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			/* The expanded copy replaces the original skb. */
+			kfree_skb(e->skb);
+			e->skb = grown;
+		}
+		skb_put(e->skb, diff);
+	}
+
+	if (!skb_make_writable(e->skb, data_len))
+		return -ENOMEM;
+
+	skb_copy_to_linear_data(e->skb, data, data_len);
+	e->skb->ip_summed = CHECKSUM_NONE;
+	return 0;
+}
+
+/*
+ * Set copy mode and copy range of @queue under queue->lock.
+ * NONE/META reset the range to 0; PACKET clamps a zero or oversized @range
+ * to NFQNL_MAX_COPY_RANGE. Returns 0, or -EINVAL for an unknown @mode (in
+ * which case nothing is changed).
+ */
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+	       unsigned char mode, unsigned int range)
+{
+	int status = 0;
+
+	spin_lock_bh(&queue->lock);
+
+	if (mode == NFQNL_COPY_NONE || mode == NFQNL_COPY_META) {
+		queue->copy_mode = mode;
+		queue->copy_range = 0;
+	} else if (mode == NFQNL_COPY_PACKET) {
+		queue->copy_mode = mode;
+		if (range == 0 || range > NFQNL_MAX_COPY_RANGE)
+			queue->copy_range = NFQNL_MAX_COPY_RANGE;
+		else
+			queue->copy_range = range;
+	} else {
+		status = -EINVAL;
+	}
+
+	spin_unlock_bh(&queue->lock);
+
+	return status;
+}
+
+/*
+ * nfqnl_cmpfn: return 1 if @entry refers to the device with index
+ * @ifindex as input or output device (including the bridge physical
+ * devices when bridge netfilter is enabled), 0 otherwise.
+ */
+static int
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->indev && entry->indev->ifindex == ifindex)
+		return 1;
+	if (entry->outdev && entry->outdev->ifindex == ifindex)
+		return 1;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (entry->skb->nf_bridge) {
+		struct nf_bridge_info *nfb = entry->skb->nf_bridge;
+
+		if (nfb->physindev && nfb->physindev->ifindex == ifindex)
+			return 1;
+		if (nfb->physoutdev && nfb->physoutdev->ifindex == ifindex)
+			return 1;
+	}
+#endif
+	return 0;
+}
+
+/* Flush, from every queue instance of @net, all packets whose indev or
+ * outdev matches @ifindex (see dev_cmp()). */
+static void
+nfqnl_dev_drop(struct net *net, int ifindex)
+{
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	struct nfqnl_instance *inst;
+	int bucket;
+
+	rcu_read_lock();
+
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
+		struct hlist_head *head = &q->instance_table[bucket];
+
+		hlist_for_each_entry_rcu(inst, head, hlist)
+			nfqnl_flush(inst, dev_cmp, ifindex);
+	}
+
+	rcu_read_unlock();
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/*
+ * Netdevice notifier callback. Only NETDEV_DOWN is acted upon: packets
+ * queued for that device are dropped. Always returns NOTIFY_DONE.
+ */
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+		    unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	/* Drop any packets associated with the downed device */
+	nfqnl_dev_drop(dev_net(dev), dev->ifindex);
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier hooking nfqnl_rcv_dev_event(); registered at module init. */
+static struct notifier_block nfqnl_dev_notifier = {
+ .notifier_call = nfqnl_rcv_dev_event,
+};
+
+/*
+ * Netlink notifier callback. When a NETLINK_NETFILTER socket is released
+ * (NETLINK_URELEASE), every queue instance bound to that portid in the
+ * socket's namespace is destroyed. Always returns NOTIFY_DONE.
+ */
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+		   unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+	struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
+	int bucket;
+
+	if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
+		return NOTIFY_DONE;
+
+	/* destroy all instances for this portid */
+	spin_lock(&q->instances_lock);
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
+		struct hlist_head *head = &q->instance_table[bucket];
+		struct nfqnl_instance *inst;
+		struct hlist_node *tmp;
+
+		hlist_for_each_entry_safe(inst, tmp, head, hlist) {
+			if (n->portid == inst->peer_portid)
+				__instance_destroy(inst);
+		}
+	}
+	spin_unlock(&q->instances_lock);
+
+	return NOTIFY_DONE;
+}
+
+/* Netlink notifier hooking nfqnl_rcv_nl_event(); registered at module init. */
+static struct notifier_block nfqnl_rtnl_notifier = {
+ .notifier_call = nfqnl_rcv_nl_event,
+};
+
+/* Attribute policy for NFQNL_MSG_VERDICT messages (see nfqnl_cb). */
+static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
+ [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+ [NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PAYLOAD] = { .type = NLA_UNSPEC },
+ [NFQA_CT] = { .type = NLA_UNSPEC },
+ [NFQA_EXP] = { .type = NLA_UNSPEC },
+};
+
+/* Attribute policy for NFQNL_MSG_VERDICT_BATCH messages (see nfqnl_cb). */
+static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
+ [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+ [NFQA_MARK] = { .type = NLA_U32 },
+};
+
+/*
+ * Look up queue @queue_num and verify that it belongs to @nlportid.
+ * Returns the instance, ERR_PTR(-ENODEV) if it does not exist, or
+ * ERR_PTR(-EPERM) if it is bound to a different portid.
+ */
+static struct nfqnl_instance *
+verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid)
+{
+	struct nfqnl_instance *inst = instance_lookup(q, queue_num);
+
+	if (!inst)
+		return ERR_PTR(-ENODEV);
+
+	return inst->peer_portid == nlportid ? inst : ERR_PTR(-EPERM);
+}
+
+/*
+ * Return the verdict header carried in @nfqa, or NULL when the attribute
+ * is missing or the verdict (masked with NF_VERDICT_MASK) is above
+ * NF_MAX_VERDICT or equal to NF_STOLEN.
+ */
+static struct nfqnl_msg_verdict_hdr*
+verdicthdr_get(const struct nlattr * const nfqa[])
+{
+	const struct nlattr *attr = nfqa[NFQA_VERDICT_HDR];
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	unsigned int verdict;
+
+	if (!attr)
+		return NULL;
+
+	vhdr = nla_data(attr);
+	verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK;
+
+	if (verdict == NF_STOLEN || verdict > NF_MAX_VERDICT)
+		return NULL;
+
+	return vhdr;
+}
+
+/* Wrap-around aware comparison: non-zero when @id comes after @max. */
+static int nfq_id_after(unsigned int id, unsigned int max)
+{
+	int delta = (int)(id - max);
+
+	return delta > 0;
+}
+
+/*
+ * NFQNL_MSG_VERDICT_BATCH handler: apply one verdict (and optional mark)
+ * to every queued packet whose id is not after the id in the verdict
+ * header. Matching entries are moved to a private list under queue->lock
+ * and reinjected after the lock has been dropped.
+ *
+ * Returns 0, -ENODEV/-EPERM from the queue lookup, -EINVAL for a bad
+ * verdict header, or -ENOENT if no entry matched.
+ */
+static int
+nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
+			 const struct nlmsghdr *nlh,
+			 const struct nlattr * const nfqa[])
+{
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u16 queue_num = ntohs(nfmsg->res_id);
+	struct net *net = sock_net(ctnl);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	struct nf_queue_entry *cur, *next;
+	struct nfqnl_instance *queue;
+	unsigned int verdict, maxid;
+	LIST_HEAD(batch_list);
+
+	queue = verdict_instance_lookup(q, queue_num,
+					NETLINK_CB(skb).portid);
+	if (IS_ERR(queue))
+		return PTR_ERR(queue);
+
+	vhdr = verdicthdr_get(nfqa);
+	if (!vhdr)
+		return -EINVAL;
+
+	verdict = ntohl(vhdr->verdict);
+	maxid = ntohl(vhdr->id);
+
+	spin_lock_bh(&queue->lock);
+	list_for_each_entry_safe(cur, next, &queue->queue_list, list) {
+		/* The walk stops at the first id past maxid. */
+		if (nfq_id_after(cur->id, maxid))
+			break;
+		__dequeue_entry(queue, cur);
+		list_add_tail(&cur->list, &batch_list);
+	}
+	spin_unlock_bh(&queue->lock);
+
+	if (list_empty(&batch_list))
+		return -ENOENT;
+
+	list_for_each_entry_safe(cur, next, &batch_list, list) {
+		if (nfqa[NFQA_MARK])
+			cur->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+		nf_reinject(cur, verdict);
+	}
+
+	return 0;
+}
+
+/*
+ * NFQNL_MSG_VERDICT handler: apply the verdict in @nfqa to a single queued
+ * packet, optionally updating conntrack (NFQA_CT/NFQA_EXP), replacing the
+ * payload (NFQA_PAYLOAD) and setting the mark (NFQA_MARK).
+ *
+ * Returns 0 on success, -ENODEV if the queue does not exist, -EPERM if the
+ * queue is bound to another netlink portid, -EINVAL for a missing or
+ * invalid verdict header, or -ENOENT if no queued packet has the given id.
+ */
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+		   const struct nlmsghdr *nlh,
+		   const struct nlattr * const nfqa[])
+{
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	struct nfqnl_instance *queue;
+	unsigned int verdict;
+	struct nf_queue_entry *entry;
+	enum ip_conntrack_info uninitialized_var(ctinfo);
+	struct nf_conn *ct = NULL;
+
+	struct net *net = sock_net(ctnl);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+	/*
+	 * Always go through verdict_instance_lookup() so that the sender's
+	 * portid is checked against the queue owner. A plain
+	 * instance_lookup() first would skip that check for every existing
+	 * queue and let any peer issue verdicts on it.
+	 */
+	queue = verdict_instance_lookup(q, queue_num,
+					NETLINK_CB(skb).portid);
+	if (IS_ERR(queue))
+		return PTR_ERR(queue);
+
+	vhdr = verdicthdr_get(nfqa);
+	if (!vhdr)
+		return -EINVAL;
+
+	verdict = ntohl(vhdr->verdict);
+
+	entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+	if (entry == NULL)
+		return -ENOENT;
+
+	if (nfqa[NFQA_CT]) {
+		ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
+		if (ct && nfqa[NFQA_EXP]) {
+			nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
+					    NETLINK_CB(skb).portid,
+					    nlmsg_report(nlh));
+		}
+	}
+
+	if (nfqa[NFQA_PAYLOAD]) {
+		u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
+		int diff = payload_len - entry->skb->len;
+
+		/* A packet that cannot be mangled is dropped. */
+		if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
+				 payload_len, entry, diff) < 0)
+			verdict = NF_DROP;
+
+		if (ct)
+			nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff);
+	}
+
+	if (nfqa[NFQA_MARK])
+		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+
+	nf_reinject(entry, verdict);
+	return 0;
+}
+
+/* Handler for message types userspace may not send: always -ENOTSUPP. */
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh,
+		  const struct nlattr * const nfqa[])
+{
+	const int err = -ENOTSUPP;
+
+	return err;
+}
+
+/*
+ * Attribute policy for NFQNL_MSG_CONFIG messages.
+ *
+ * NFQA_CFG_QUEUE_MAXLEN, NFQA_CFG_MASK and NFQA_CFG_FLAGS are read as
+ * 32-bit values by nfqnl_recv_config(), so they must be validated here;
+ * without an entry a too-short attribute would be read past its end.
+ */
+static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
+	[NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) },
+	[NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) },
+	[NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
+	[NFQA_CFG_MASK] = { .type = NLA_U32 },
+	[NFQA_CFG_FLAGS] = { .type = NLA_U32 },
+};
+
+/* Queue handler registered with nf_register_queue_handler() at module init. */
+static const struct nf_queue_handler nfqh = {
+ .outfn = &nfqnl_enqueue_packet,
+};
+
+/*
+ * NFQNL_MSG_CONFIG handler: bind/unbind a queue and update its parameters
+ * (copy mode/range, maximum length, flags).
+ *
+ * Returns 0 on success, -EPERM if the queue belongs to another portid,
+ * -EBUSY when binding an existing queue, -ENODEV when the addressed queue
+ * does not exist, -ENOTSUPP for an unknown command, -EINVAL when flags
+ * are given without a mask, -EOPNOTSUPP for unknown flags, or the error
+ * from instance_create().
+ */
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh,
+		  const struct nlattr * const nfqa[])
+{
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+	struct nfqnl_instance *queue;
+	struct nfqnl_msg_config_cmd *cmd = NULL;
+	struct net *net = sock_net(ctnl);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	int ret = 0;
+
+	if (nfqa[NFQA_CFG_CMD]) {
+		cmd = nla_data(nfqa[NFQA_CFG_CMD]);
+
+		/* Obsolete commands without queue context */
+		switch (cmd->command) {
+		case NFQNL_CFG_CMD_PF_BIND: return 0;
+		case NFQNL_CFG_CMD_PF_UNBIND: return 0;
+		}
+	}
+
+	rcu_read_lock();
+	queue = instance_lookup(q, queue_num);
+	if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
+		ret = -EPERM;
+		goto err_out_unlock;
+	}
+
+	if (cmd != NULL) {
+		switch (cmd->command) {
+		case NFQNL_CFG_CMD_BIND:
+			if (queue) {
+				ret = -EBUSY;
+				goto err_out_unlock;
+			}
+			queue = instance_create(q, queue_num,
+						NETLINK_CB(skb).portid);
+			if (IS_ERR(queue)) {
+				ret = PTR_ERR(queue);
+				goto err_out_unlock;
+			}
+			break;
+		case NFQNL_CFG_CMD_UNBIND:
+			if (!queue) {
+				ret = -ENODEV;
+				goto err_out_unlock;
+			}
+			instance_destroy(q, queue);
+			/* The queue is gone; do not configure it any further. */
+			goto err_out_unlock;
+		case NFQNL_CFG_CMD_PF_BIND:
+		case NFQNL_CFG_CMD_PF_UNBIND:
+			break;
+		default:
+			/* Reject the whole request instead of applying the
+			 * remaining attributes and then reporting failure. */
+			ret = -ENOTSUPP;
+			goto err_out_unlock;
+		}
+	}
+
+	if (nfqa[NFQA_CFG_PARAMS]) {
+		struct nfqnl_msg_config_params *params;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+		params = nla_data(nfqa[NFQA_CFG_PARAMS]);
+		nfqnl_set_mode(queue, params->copy_mode,
+			       ntohl(params->copy_range));
+	}
+
+	if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
+		__be32 *queue_maxlen;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+		queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
+		spin_lock_bh(&queue->lock);
+		queue->queue_maxlen = ntohl(*queue_maxlen);
+		spin_unlock_bh(&queue->lock);
+	}
+
+	if (nfqa[NFQA_CFG_FLAGS]) {
+		__u32 flags, mask;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+
+		if (!nfqa[NFQA_CFG_MASK]) {
+			/* A mask is needed to specify which flags are being
+			 * changed.
+			 */
+			ret = -EINVAL;
+			goto err_out_unlock;
+		}
+
+		flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
+		mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));
+
+		if (flags >= NFQA_CFG_F_MAX) {
+			ret = -EOPNOTSUPP;
+			goto err_out_unlock;
+		}
+
+		/* Only the bits selected by mask are replaced. */
+		spin_lock_bh(&queue->lock);
+		queue->flags &= ~mask;
+		queue->flags |= flags & mask;
+		spin_unlock_bh(&queue->lock);
+	}
+
+err_out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Message dispatch table for the queue subsystem, indexed by NFQNL_MSG_*.
+ * Verdict handlers are wired up through .call_rcu, the config handler
+ * through .call.
+ */
+static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+ [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp,
+ .attr_count = NFQA_MAX, },
+ [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_policy },
+ [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
+ .attr_count = NFQA_CFG_MAX,
+ .policy = nfqa_cfg_policy },
+ [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_batch_policy },
+};
+
+/* nfnetlink subsystem descriptor; registered in nfnetlink_queue_init(). */
+static const struct nfnetlink_subsystem nfqnl_subsys = {
+ .name = "nf_queue",
+ .subsys_id = NFNL_SUBSYS_QUEUE,
+ .cb_count = NFQNL_MSG_MAX,
+ .cb = nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+/* seq_file iteration state for the nfnetlink_queue proc file. */
+struct iter_state {
+ struct seq_net_private p; /* first member; file is opened with seq_open_net() */
+ unsigned int bucket; /* instance_table bucket currently being walked */
+};
+
+/*
+ * Position the iterator on the first non-empty bucket and return its
+ * first node, or NULL if there is no iterator state or no instance.
+ */
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+	struct iter_state *st = seq->private;
+	struct nfnl_queue_net *q;
+
+	if (!st)
+		return NULL;
+
+	q = nfnl_queue_pernet(seq_file_net(seq));
+	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+		struct hlist_head *head = &q->instance_table[st->bucket];
+
+		if (!hlist_empty(head))
+			return head->first;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the node following @h, moving on to later buckets when the
+ * current chain is exhausted. Returns NULL after the last bucket.
+ */
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+	struct iter_state *st = seq->private;
+	struct nfnl_queue_net *q = nfnl_queue_pernet(seq_file_net(seq));
+	struct hlist_node *node = h->next;
+
+	while (!node) {
+		st->bucket++;
+		if (st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+
+		node = q->instance_table[st->bucket].first;
+	}
+
+	return node;
+}
+
+/* Return the node at position @pos of the iteration, or NULL if past the end. */
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node = get_first(seq);
+
+	while (node && pos) {
+		node = get_next(seq, node);
+		if (node)
+			pos--;
+	}
+
+	if (pos)
+		return NULL;
+	return node;
+}
+
+/* seq_file .start: take the instance table lock and seek to *@pos. */
+static void *seq_start(struct seq_file *s, loff_t *pos)
+	__acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
+{
+	struct nfnl_queue_net *q = nfnl_queue_pernet(seq_file_net(s));
+
+	spin_lock(&q->instances_lock);
+	return get_idx(s, *pos);
+}
+
+/* seq_file .next: advance the position and return the following node. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	*pos += 1;
+
+	return get_next(s, v);
+}
+
+/* seq_file .stop: release the lock taken in seq_start(). */
+static void seq_stop(struct seq_file *s, void *v)
+	__releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
+{
+	struct nfnl_queue_net *q = nfnl_queue_pernet(seq_file_net(s));
+
+	spin_unlock(&q->instances_lock);
+}
+
+/*
+ * seq_file .show: print one line of statistics for a queue instance.
+ * @v is the hlist node handed out by the iterator; it is used directly as
+ * the instance because hlist is the first member of struct nfqnl_instance.
+ */
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nfqnl_instance *queue = v;
+	int ret;
+
+	ret = seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
+			 queue->queue_num,
+			 queue->peer_portid, queue->queue_total,
+			 queue->copy_mode, queue->copy_range,
+			 queue->queue_dropped, queue->queue_user_dropped,
+			 queue->id_sequence, 1);
+	return ret;
+}
+
+/* seq_file iterator for the nfnetlink_queue proc file. */
+static const struct seq_operations nfqnl_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+/* Open handler of the proc file: namespace-aware seq_file with iter_state. */
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open_net(inode, file, &nfqnl_seq_ops,
+			   sizeof(struct iter_state));
+	return ret;
+}
+
+/* File operations of the nfnetlink_queue proc file. */
+static const struct file_operations nfqnl_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nfqnl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net, /* pairs with seq_open_net() in nfqnl_open() */
+};
+
+#endif /* PROC_FS */
+
+/*
+ * Per-namespace init: set up the empty instance table and its lock and,
+ * with CONFIG_PROC_FS, create the "nfnetlink_queue" proc entry.
+ * Returns 0, or -ENOMEM if the proc entry cannot be created.
+ */
+static int __net_init nfnl_queue_net_init(struct net *net)
+{
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	unsigned int bucket;
+
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++)
+		INIT_HLIST_HEAD(&q->instance_table[bucket]);
+
+	spin_lock_init(&q->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+	if (proc_create("nfnetlink_queue", 0440,
+			net->nf.proc_netfilter, &nfqnl_file_ops) == NULL)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+/* Per-namespace exit: remove the proc entry created in nfnl_queue_net_init(). */
+static void __net_exit nfnl_queue_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *parent = net->nf.proc_netfilter;
+
+	remove_proc_entry("nfnetlink_queue", parent);
+#endif
+}
+
+/* Per-namespace hooks; .id/.size make struct nfnl_queue_net reachable via net_generic(). */
+static struct pernet_operations nfnl_queue_net_ops = {
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
+};
+
+/*
+ * Module init.
+ *
+ * The pernet subsystem is registered first: nfqnl_rcv_nl_event(), the
+ * netlink notifier callback, looks up per-namespace data through
+ * nfnl_queue_pernet(), which is only valid once the pernet ops (and
+ * thereby nfnl_queue_net_id) have been registered. Registering the
+ * notifier earlier would let it run against unset per-namespace data.
+ *
+ * Returns 0 on success or the negative error of the step that failed;
+ * everything registered before the failure is unregistered again.
+ */
+static int __init nfnetlink_queue_init(void)
+{
+	int status;
+
+	status = register_pernet_subsys(&nfnl_queue_net_ops);
+	if (status < 0) {
+		pr_err("nf_queue: failed to register pernet ops\n");
+		goto out;
+	}
+
+	netlink_register_notifier(&nfqnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfqnl_subsys);
+	if (status < 0) {
+		pr_err("nf_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+	register_netdevice_notifier(&nfqnl_dev_notifier);
+	nf_register_queue_handler(&nfqh);
+	return status;
+
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+	unregister_pernet_subsys(&nfnl_queue_net_ops);
+out:
+	return status;
+}
+
+/*
+ * Module exit.
+ *
+ * The pernet subsystem is unregistered only after the nfnetlink subsystem
+ * and the netlink notifier are gone: both can still reach per-namespace
+ * data through nfnl_queue_pernet() while they are registered.
+ */
+static void __exit nfnetlink_queue_fini(void)
+{
+	nf_unregister_queue_handler();
+	unregister_netdevice_notifier(&nfqnl_dev_notifier);
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
+	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+	unregister_pernet_subsys(&nfnl_queue_net_ops);
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(nfnetlink_queue_init);
+module_exit(nfnetlink_queue_fini);
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
new file mode 100644
index 00000000000..96cac50e0d1
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -0,0 +1,113 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nfnetlink_queue.h>
+
+/*
+ * Fetch the conntrack entry attached to @entskb for inclusion in a queue
+ * message.
+ *
+ * On success *@size is increased by the hook's build_size() for the entry
+ * and *@ctinfo is filled in by nf_ct_get().  Returns NULL when no
+ * nfq_ct_hook is registered, when the skb carries no conntrack, or when
+ * the conntrack is untracked.
+ */
+struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
+			     enum ip_conntrack_info *ctinfo)
+{
+	struct nfq_ct_hook *hook;
+	struct nf_conn *ct;
+
+	/* rcu_read_lock()ed by __nf_queue already. */
+	hook = rcu_dereference(nfq_ct_hook);
+	if (!hook)
+		return NULL;
+
+	ct = nf_ct_get(entskb, ctinfo);
+	if (!ct)
+		return NULL;
+
+	if (nf_ct_is_untracked(ct))
+		return NULL;
+
+	*size += hook->build_size(ct);
+	return ct;
+}
+
+/*
+ * Apply the conntrack attribute @attr to the conntrack entry attached to
+ * @skb through the registered hook's parse() callback.
+ *
+ * Returns the entry found by nf_ct_get() (possibly NULL, possibly an
+ * untracked entry, for which parse() is not called), or NULL when no
+ * nfq_ct_hook is registered.
+ */
+struct nf_conn *
+nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr,
+	       enum ip_conntrack_info *ctinfo)
+{
+	struct nfq_ct_hook *hook;
+	struct nf_conn *ct;
+
+	/* rcu_read_lock()ed by __nf_queue already. */
+	hook = rcu_dereference(nfq_ct_hook);
+	if (!hook)
+		return NULL;
+
+	ct = nf_ct_get(skb, ctinfo);
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return ct;
+
+	hook->parse(attr, ct);
+	return ct;
+}
+
+/*
+ * Append conntrack information for @ct to the netlink message in @skb:
+ * a nested NFQA_CT attribute filled by the hook's build() callback,
+ * followed by NFQA_CT_INFO carrying @ctinfo in network byte order.
+ *
+ * Returns 0 on success or when no nfq_ct_hook is registered, -1 if any
+ * attribute could not be added.
+ */
+int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo)
+{
+	struct nfq_ct_hook *hook;
+	struct nlattr *nest;
+	u_int32_t info = ctinfo;
+
+	hook = rcu_dereference(nfq_ct_hook);
+	if (!hook)
+		return 0;
+
+	nest = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+
+	if (hook->build(skb, ct) < 0)
+		return -1;
+
+	nla_nest_end(skb, nest);
+
+	if (nla_put_be32(skb, NFQA_CT_INFO, htonl(info)))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Forward a payload size change of @diff bytes to the hook's seq_adjust()
+ * callback.  Nothing happens when no hook is registered, when @diff is
+ * zero, or when @ct has none of the IPS_NAT_MASK status bits set.
+ */
+void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+			 enum ip_conntrack_info ctinfo, int diff)
+{
+	struct nfq_ct_hook *hook = rcu_dereference(nfq_ct_hook);
+
+	if (!hook)
+		return;
+
+	if (!(ct->status & IPS_NAT_MASK) || !diff)
+		return;
+
+	hook->seq_adjust(skb, ct, ctinfo, diff);
+}
+
+/*
+ * Hand the expectation attribute @attr for @ct to the hook's
+ * attach_expect() callback.
+ *
+ * Returns 0 without doing anything for untracked conntracks, -EOPNOTSUPP
+ * when no nfq_ct_hook is registered, otherwise the callback's result.
+ */
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			u32 portid, u32 report)
+{
+	struct nfq_ct_hook *hook;
+
+	if (nf_ct_is_untracked(ct))
+		return 0;
+
+	hook = rcu_dereference(nfq_ct_hook);
+	if (!hook)
+		return -EOPNOTSUPP;
+
+	return hook->attach_expect(attr, ct, portid, report);
+}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
new file mode 100644
index 00000000000..4fb6ee2c110
--- /dev/null
+++ b/net/netfilter/nft_bitwise.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+/*
+ * Private data of a bitwise expression.  nft_bitwise_eval() computes
+ * dreg = (sreg & mask) ^ xor over DIV_ROUND_UP(len, 4) data words.
+ */
+struct nft_bitwise {
+ enum nft_registers sreg:8; /* source register */
+ enum nft_registers dreg:8; /* destination register */
+ u8 len; /* operand length; must match the mask and xor lengths */
+ struct nft_data mask; /* AND operand */
+ struct nft_data xor; /* XOR operand */
+};
+
+/*
+ * Evaluate a bitwise expression: for each data word covered by priv->len
+ * (rounded up to a multiple of four), store (source & mask) ^ xor into
+ * the destination register.  @pkt is unused.
+ */
+static void nft_bitwise_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	const struct nft_data *in = &data[priv->sreg];
+	struct nft_data *out = &data[priv->dreg];
+	unsigned int w = 0;
+
+	while (w < DIV_ROUND_UP(priv->len, 4)) {
+		out->data[w] = (in->data[w] & priv->mask.data[w]) ^
+			       priv->xor.data[w];
+		w++;
+	}
+}
+
+/*
+ * Netlink attribute policy for the bitwise expression: registers and
+ * length are 32-bit values, mask and xor are nested data attributes.
+ */
+static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
+ [NFTA_BITWISE_SREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_DREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_LEN] = { .type = NLA_U32 },
+ [NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
+ [NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
+};
+
+/*
+ * Initialise a bitwise expression from its netlink attributes.
+ *
+ * All five attributes (SREG, DREG, LEN, MASK, XOR) are mandatory.  The
+ * register numbers and the length arrive as 32-bit values but are stored
+ * in 8-bit fields, so they are validated on the full 32-bit value *before*
+ * being narrowed; otherwise e.g. a length of 0x104 would silently be
+ * accepted as 4.
+ *
+ * Returns 0 on success, -EINVAL on a missing attribute, an out-of-range
+ * length or a mask/xor whose length differs from LEN, or the error
+ * returned by the register/data validation helpers.
+ */
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_bitwise *priv = nft_expr_priv(expr);
+	struct nft_data_desc d1, d2;
+	u32 sreg, dreg, len;
+	int err;
+
+	if (tb[NFTA_BITWISE_SREG] == NULL ||
+	    tb[NFTA_BITWISE_DREG] == NULL ||
+	    tb[NFTA_BITWISE_LEN] == NULL ||
+	    tb[NFTA_BITWISE_MASK] == NULL ||
+	    tb[NFTA_BITWISE_XOR] == NULL)
+		return -EINVAL;
+
+	sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG]));
+	err = nft_validate_input_register(sreg);
+	if (err < 0)
+		return err;
+	priv->sreg = sreg;
+
+	dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG]));
+	err = nft_validate_output_register(dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+	priv->dreg = dreg;
+
+	/* Range-check the untruncated value; priv->len is only a u8. */
+	len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+	if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+	priv->len = len;
+
+	err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+	if (err < 0)
+		return err;
+	if (d1.len != priv->len)
+		return -EINVAL;
+
+	err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+	if (err < 0)
+		return err;
+	if (d2.len != priv->len)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump a bitwise expression to @skb: SREG, DREG and LEN as big-endian
+ * 32-bit attributes, then the mask and xor operands.
+ *
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg)) ||
+	    nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg)) ||
+	    nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+		return -1;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		return -1;
+
+	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		return -1;
+
+	return 0;
+}
+
+/* Forward declaration: the ops and the type reference each other. */
+static struct nft_expr_type nft_bitwise_type;
+/* Operations of the bitwise expression. */
+static const struct nft_expr_ops nft_bitwise_ops = {
+ .type = &nft_bitwise_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
+ .eval = nft_bitwise_eval,
+ .init = nft_bitwise_init,
+ .dump = nft_bitwise_dump,
+};
+
+/* Expression type descriptor registered under the name "bitwise". */
+static struct nft_expr_type nft_bitwise_type __read_mostly = {
+ .name = "bitwise",
+ .ops = &nft_bitwise_ops,
+ .policy = nft_bitwise_policy,
+ .maxattr = NFTA_BITWISE_MAX,
+ .owner = THIS_MODULE,
+};
+
+/* Register the "bitwise" expression type; returns nft_register_expr()'s result. */
+int __init nft_bitwise_module_init(void)
+{
+ return nft_register_expr(&nft_bitwise_type);
+}
+
+/* Unregister the "bitwise" expression type. */
+void nft_bitwise_module_exit(void)
+{
+ nft_unregister_expr(&nft_bitwise_type);
+}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
new file mode 100644
index 00000000000..c39ed8d29df
--- /dev/null
+++ b/net/netfilter/nft_byteorder.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+/* Private data of a byteorder expression. */
+struct nft_byteorder {
+ enum nft_registers sreg:8; /* source register */
+ enum nft_registers dreg:8; /* destination register */
+ enum nft_byteorder_ops op:8; /* NFT_BYTEORDER_NTOH or NFT_BYTEORDER_HTON */
+ u8 len; /* total number of bytes to convert */
+ u8 size; /* element size in bytes: 2 or 4 */
+};
+
+/*
+ * Evaluate a byteorder expression: convert priv->len bytes of the source
+ * register, taken as consecutive elements of priv->size bytes, between
+ * network and host byte order and store them in the destination register.
+ *
+ * The elements are addressed through plain u32 / u16 pointers.  Indexing
+ * an array of "union { u32; u16; }" instead would make the 16-bit case
+ * step four bytes per element, so len / 2 iterations would touch up to
+ * 2 * len bytes and run past the end of the register.
+ *
+ * @pkt is unused.
+ */
+static void nft_byteorder_eval(const struct nft_expr *expr,
+			       struct nft_data data[NFT_REG_MAX + 1],
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_byteorder *priv = nft_expr_priv(expr);
+	struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg];
+	u32 *s32 = (void *)src->data, *d32 = (void *)dst->data;
+	u16 *s16 = (void *)src->data, *d16 = (void *)dst->data;
+	unsigned int i;
+
+	switch (priv->size) {
+	case 4:
+		switch (priv->op) {
+		case NFT_BYTEORDER_NTOH:
+			for (i = 0; i < priv->len / 4; i++)
+				d32[i] = ntohl((__force __be32)s32[i]);
+			break;
+		case NFT_BYTEORDER_HTON:
+			for (i = 0; i < priv->len / 4; i++)
+				d32[i] = (__force __u32)htonl(s32[i]);
+			break;
+		}
+		break;
+	case 2:
+		switch (priv->op) {
+		case NFT_BYTEORDER_NTOH:
+			for (i = 0; i < priv->len / 2; i++)
+				d16[i] = ntohs((__force __be16)s16[i]);
+			break;
+		case NFT_BYTEORDER_HTON:
+			for (i = 0; i < priv->len / 2; i++)
+				d16[i] = (__force __u16)htons(s16[i]);
+			break;
+		}
+		break;
+	}
+}
+
+/* Netlink attribute policy for the byteorder expression: all attributes are 32-bit values. */
+static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
+ [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_OP] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 },
+};
+
+/*
+ * Initialise a byteorder expression from its netlink attributes.
+ *
+ * All five attributes (SREG, DREG, OP, LEN, SIZE) are mandatory.  Each one
+ * arrives as a 32-bit value but is stored in an 8-bit field, so it is
+ * validated on the full 32-bit value *before* being narrowed; otherwise
+ * out-of-range input such as a size of 0x102 would be accepted as 2.
+ *
+ * Returns 0 on success, -EINVAL on a missing attribute, an unknown
+ * operation, a length of zero or larger than a register, or an element
+ * size other than 2 or 4, or the error returned by the register/data
+ * validation helpers.
+ */
+static int nft_byteorder_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_byteorder *priv = nft_expr_priv(expr);
+	u32 sreg, dreg, op, len, size;
+	int err;
+
+	if (tb[NFTA_BYTEORDER_SREG] == NULL ||
+	    tb[NFTA_BYTEORDER_DREG] == NULL ||
+	    tb[NFTA_BYTEORDER_LEN] == NULL ||
+	    tb[NFTA_BYTEORDER_SIZE] == NULL ||
+	    tb[NFTA_BYTEORDER_OP] == NULL)
+		return -EINVAL;
+
+	sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG]));
+	err = nft_validate_input_register(sreg);
+	if (err < 0)
+		return err;
+	priv->sreg = sreg;
+
+	dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG]));
+	err = nft_validate_output_register(dreg);
+	if (err < 0)
+		return err;
+	err = nft_validate_data_load(ctx, dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+	priv->dreg = dreg;
+
+	op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP]));
+	switch (op) {
+	case NFT_BYTEORDER_NTOH:
+	case NFT_BYTEORDER_HTON:
+		break;
+	default:
+		return -EINVAL;
+	}
+	priv->op = op;
+
+	len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+	if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data))
+		return -EINVAL;
+	priv->len = len;
+
+	size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+	switch (size) {
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+	priv->size = size;
+
+	return 0;
+}
+
+/*
+ * Dump a byteorder expression to @skb as five big-endian 32-bit
+ * attributes (SREG, DREG, OP, LEN, SIZE).
+ *
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_byteorder *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg)) ||
+	    nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg)) ||
+	    nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op)) ||
+	    nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len)) ||
+	    nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size)))
+		return -1;
+
+	return 0;
+}
+
+/* Forward declaration: the ops and the type reference each other. */
+static struct nft_expr_type nft_byteorder_type;
+/* Operations of the byteorder expression. */
+static const struct nft_expr_ops nft_byteorder_ops = {
+ .type = &nft_byteorder_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
+ .eval = nft_byteorder_eval,
+ .init = nft_byteorder_init,
+ .dump = nft_byteorder_dump,
+};
+
+/* Expression type descriptor registered under the name "byteorder". */
+static struct nft_expr_type nft_byteorder_type __read_mostly = {
+ .name = "byteorder",
+ .ops = &nft_byteorder_ops,
+ .policy = nft_byteorder_policy,
+ .maxattr = NFTA_BYTEORDER_MAX,
+ .owner = THIS_MODULE,
+};
+
+/* Register the "byteorder" expression type; returns nft_register_expr()'s result. */
+int __init nft_byteorder_module_init(void)
+{
+ return nft_register_expr(&nft_byteorder_type);
+}
+
+/* Unregister the "byteorder" expression type. */
+void nft_byteorder_module_exit(void)
+{
+ nft_unregister_expr(&nft_byteorder_type);
+}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
new file mode 100644
index 00000000000..e2b3f51c81f
--- /dev/null
+++ b/net/netfilter/nft_cmp.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+/* Private data of the generic (non-fast) cmp expression. */
+struct nft_cmp_expr {
+ struct nft_data data; /* constant the source register is compared against */
+ enum nft_registers sreg:8; /* source register */
+ u8 len; /* number of bytes compared */
+ enum nft_cmp_ops op:8; /* comparison operator (NFT_CMP_*) */
+};
+
+/*
+ * Evaluate a cmp expression: compare the source register against the
+ * stored constant with nft_data_cmp() and set the verdict register to
+ * NFT_BREAK when the configured relation does not hold.  The verdict is
+ * left untouched on a match (and for an operator not listed below).
+ * @pkt is unused.
+ */
+static void nft_cmp_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+	const int d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len);
+	bool matched = true;
+
+	switch (priv->op) {
+	case NFT_CMP_EQ:
+		matched = (d == 0);
+		break;
+	case NFT_CMP_NEQ:
+		matched = (d != 0);
+		break;
+	case NFT_CMP_LT:
+		matched = (d < 0);
+		break;
+	case NFT_CMP_LTE:
+		matched = (d <= 0);
+		break;
+	case NFT_CMP_GT:
+		matched = (d > 0);
+		break;
+	case NFT_CMP_GTE:
+		matched = (d >= 0);
+		break;
+	}
+
+	if (!matched)
+		data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+/*
+ * Netlink attribute policy for the cmp expression: register and operator
+ * are 32-bit values, the comparison constant is a nested data attribute.
+ */
+static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = {
+ [NFTA_CMP_SREG] = { .type = NLA_U32 },
+ [NFTA_CMP_OP] = { .type = NLA_U32 },
+ [NFTA_CMP_DATA] = { .type = NLA_NESTED },
+};
+
+/*
+ * Initialise a generic cmp expression from its attributes.
+ *
+ * No validation is done here: nft_cmp_select_ops() has already checked
+ * the attributes, which is why a failing nft_data_init() is treated as a
+ * BUG.  Always returns 0.  @ctx is unused.
+ */
+static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_cmp_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc;
+	int err;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+	priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+
+	err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+	BUG_ON(err < 0);
+	priv->len = desc.len;
+
+	return 0;
+}
+
+/*
+ * Dump a generic cmp expression to @skb: source register, operator and
+ * the comparison constant.
+ *
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg)) ||
+	    nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op)))
+		return -1;
+
+	if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		return -1;
+
+	return 0;
+}
+
+/* Forward declaration: both ops tables below point back at the type. */
+static struct nft_expr_type nft_cmp_type;
+/* Operations of the generic cmp expression (any operator, any length). */
+static const struct nft_expr_ops nft_cmp_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
+ .eval = nft_cmp_eval,
+ .init = nft_cmp_init,
+ .dump = nft_cmp_dump,
+};
+
+/*
+ * Initialise a fast cmp expression.
+ *
+ * The constant is parsed into a temporary, its length is converted from
+ * bytes to bits, and the first data word masked with
+ * nft_cmp_fast_mask(bits) is stored together with the bit length.  A
+ * failing nft_data_init() is treated as a BUG since
+ * nft_cmp_select_ops() has already parsed the same attribute.
+ * Always returns 0.  @ctx is unused.
+ */
+static int nft_cmp_fast_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc;
+	struct nft_data value;
+	int err;
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+
+	err = nft_data_init(NULL, &value, &desc, tb[NFTA_CMP_DATA]);
+	BUG_ON(err < 0);
+
+	/* From here on desc.len counts bits, not bytes. */
+	desc.len *= BITS_PER_BYTE;
+
+	priv->data = value.data[0] & nft_cmp_fast_mask(desc.len);
+	priv->len = desc.len;
+
+	return 0;
+}
+
+/*
+ * Dump a fast cmp expression to @skb.  The operator is always reported as
+ * NFT_CMP_EQ and the stored bit length is converted back to bytes for the
+ * data attribute.
+ *
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+	struct nft_data value;
+
+	if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg)) ||
+	    nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+		return -1;
+
+	value.data[0] = priv->data;
+	if (nft_data_dump(skb, NFTA_CMP_DATA, &value,
+			  NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Operations of the fast cmp expression, selected by nft_cmp_select_ops()
+ * for equality tests on at most sizeof(u32) bytes.  There is no eval
+ * callback here; per the note below it is inlined elsewhere.
+ */
+const struct nft_expr_ops nft_cmp_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp_fast_init,
+ .dump = nft_cmp_fast_dump,
+};
+
+/*
+ * Validate the cmp attributes and choose the implementation.
+ *
+ * Returns &nft_cmp_fast_ops for NFT_CMP_EQ on a constant of at most
+ * sizeof(u32) bytes, &nft_cmp_ops otherwise, or an ERR_PTR(): -EINVAL for
+ * a missing attribute or unknown operator, or the error reported by the
+ * register validation / data parsing helpers.  @ctx is unused.
+ */
+static const struct nft_expr_ops *
+nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
+{
+	struct nft_data_desc desc;
+	struct nft_data value;
+	enum nft_registers sreg;
+	enum nft_cmp_ops op;
+	int err;
+
+	if (!tb[NFTA_CMP_SREG] || !tb[NFTA_CMP_OP] || !tb[NFTA_CMP_DATA])
+		return ERR_PTR(-EINVAL);
+
+	sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+	err = nft_validate_input_register(sreg);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+	switch (op) {
+	case NFT_CMP_EQ:
+	case NFT_CMP_NEQ:
+	case NFT_CMP_LT:
+	case NFT_CMP_LTE:
+	case NFT_CMP_GT:
+	case NFT_CMP_GTE:
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	err = nft_data_init(NULL, &value, &desc, tb[NFTA_CMP_DATA]);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	if (op == NFT_CMP_EQ && desc.len <= sizeof(u32))
+		return &nft_cmp_fast_ops;
+
+	return &nft_cmp_ops;
+}
+
+/*
+ * Expression type descriptor registered under the name "cmp".  It has no
+ * fixed ops; nft_cmp_select_ops() picks them per expression.
+ */
+static struct nft_expr_type nft_cmp_type __read_mostly = {
+ .name = "cmp",
+ .select_ops = nft_cmp_select_ops,
+ .policy = nft_cmp_policy,
+ .maxattr = NFTA_CMP_MAX,
+ .owner = THIS_MODULE,
+};
+
+/* Register the "cmp" expression type; returns nft_register_expr()'s result. */
+int __init nft_cmp_module_init(void)
+{
+ return nft_register_expr(&nft_cmp_type);
+}
+
+/* Unregister the "cmp" expression type. */
+void nft_cmp_module_exit(void)
+{
+ nft_unregister_expr(&nft_cmp_type);
+}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
new file mode 100644
index 00000000000..1840989092e
--- /dev/null
+++ b/net/netfilter/nft_compat.c
@@ -0,0 +1,793 @@
+/*
+ * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This software has been sponsored by Sophos Astaro <http://www.sophos.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_tables_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <asm/uaccess.h> /* for set_fs */
+#include <net/netfilter/nf_tables.h>
+
+/*
+ * Scratch iptables/ip6tables entry handed to xtables checkentry hooks as
+ * par->entryinfo; only the protocol and inversion flags are filled in.
+ */
+union nft_entry {
+ struct ipt_entry e4; /* used for AF_INET */
+ struct ip6t_entry e6; /* used for AF_INET6 */
+};
+
+/*
+ * Prepare @par for calling an xtables extension: store the extension
+ * pointer @xt and its private data @xt_info, and clear the hotdrop flag.
+ * Called with an xt_match as @xt from nft_match_eval() as well.
+ */
+static inline void
+nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+{
+ par->target = xt;
+ par->targinfo = xt_info;
+ par->hotdrop = false;
+}
+
+/*
+ * Run the wrapped xtables target on the packet and translate its result
+ * into an nftables verdict: XT_CONTINUE becomes NFT_CONTINUE, anything
+ * else is stored as is.  A target that sets hotdrop yields NF_DROP.
+ */
+static void nft_target_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	struct xt_target *target = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+	int verdict;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+	verdict = target->target(pkt->skb, &pkt->xt);
+	if (pkt->xt.hotdrop)
+		verdict = NF_DROP;
+
+	if (verdict == XT_CONTINUE)
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+	else
+		data[NFT_REG_VERDICT].verdict = verdict;
+}
+
+/*
+ * Netlink attribute policy for the target expression: NUL-terminated
+ * name, 32-bit revision and an opaque binary info blob.
+ */
+static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
+ [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_TARGET_REV] = { .type = NLA_U32 },
+ [NFTA_TARGET_INFO] = { .type = NLA_BINARY },
+};
+
+/*
+ * Fill in the xt_tgchk_param handed to xt_check_target().
+ *
+ * @entry receives the protocol and inversion flag in the layout matching
+ * the context's address family (other families leave it untouched).
+ *
+ * Every field read by the callers is set here because @par lives
+ * uninitialised on the caller's stack: in particular hook_mask is set to
+ * 0 for non-base chains instead of being left as stack garbage.  The
+ * network namespace is taken from the context, as nft_target_destroy()
+ * does, rather than hard-coding init_net.
+ */
+static void
+nft_target_set_tgchk_param(struct xt_tgchk_param *par,
+			   const struct nft_ctx *ctx,
+			   struct xt_target *target, void *info,
+			   union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net = ctx->net;
+	par->table = ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo = entry;
+	par->target = target;
+	par->targinfo = info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops[0];
+
+		par->hook_mask = 1 << ops->hooknum;
+	} else {
+		/* Not attached to a hook: no hook restriction to check. */
+		par->hook_mask = 0;
+	}
+	par->family = ctx->afi->family;
+}
+
+/*
+ * Copy target info from the netlink attribute payload @in into the
+ * expression's private area @out, which is XT_ALIGN(t->targetsize) bytes.
+ *
+ * With CONFIG_COMPAT and a target-provided compat_from_user() the target
+ * converts the data itself.  Otherwise exactly t->targetsize bytes are
+ * copied.  In both cases the alignment padding is zeroed, so no bytes
+ * beyond the target's own data are read from @in or later dumped back to
+ * userspace.
+ */
+static void target_compat_from_user(struct xt_target *t, void *in, void *out)
+{
+	int pad;
+
+#ifdef CONFIG_COMPAT
+	if (t->compat_from_user)
+		t->compat_from_user(out, in);
+	else
+#endif
+		memcpy(out, in, t->targetsize);
+
+	pad = XT_ALIGN(t->targetsize) - t->targetsize;
+	if (pad > 0)
+		memset(out + t->targetsize, 0, pad);
+}
+
+/*
+ * Return xt_compat_target_offset() for @target when CONFIG_COMPAT is
+ * enabled, 0 otherwise.
+ */
+static inline int nft_compat_target_offset(struct xt_target *target)
+{
+#ifdef CONFIG_COMPAT
+ return xt_compat_target_offset(target);
+#else
+ return 0;
+#endif
+}
+
+/* Policy for the nested NFTA_RULE_COMPAT attribute: protocol and flags, both 32-bit. */
+static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = {
+ [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 },
+ [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 },
+};
+
+/*
+ * Parse the nested NFTA_RULE_COMPAT attribute.
+ *
+ * On success *@proto receives the protocol number and *@inv is set to
+ * true if NFT_RULE_COMPAT_F_INV is present (it is never cleared here).
+ *
+ * Returns 0 on success, the nla_parse_nested() error, or -EINVAL when an
+ * attribute is missing or unknown flag bits are set.
+ */
+static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv)
+{
+	struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+	u32 flags;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
+			       nft_rule_compat_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_RULE_COMPAT_PROTO] == NULL ||
+	    tb[NFTA_RULE_COMPAT_FLAGS] == NULL)
+		return -EINVAL;
+
+	flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
+	if ((flags & ~NFT_RULE_COMPAT_F_MASK) != 0)
+		return -EINVAL;
+
+	if ((flags & NFT_RULE_COMPAT_F_INV) != 0)
+		*inv = true;
+
+	*proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+
+	return 0;
+}
+
+/*
+ * Initialise a target expression: copy the target info from the
+ * NFTA_TARGET_INFO attribute into the expression's private area, parse
+ * the optional NFTA_RULE_COMPAT attribute, and run the xtables checks.
+ *
+ * The attribute length is checked before anything is copied, since the
+ * copy is sized by the target's targetsize and not by the attribute.
+ *
+ * Returns 0 on success.  On any failure the module reference on the
+ * target is dropped and a negative error is returned (-EINVAL for a short
+ * info attribute or the standard target, otherwise the error from
+ * nft_parse_compat() or xt_check_target()).
+ */
+static int
+nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_target *target = expr->ops->data;
+	struct xt_tgchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	/* The copy below reads targetsize bytes from the attribute. */
+	if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
+
+	if (ctx->nla[NFTA_RULE_COMPAT]) {
+		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
+		if (ret < 0)
+			goto err;
+	}
+
+	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
+	ret = xt_check_target(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	/* The standard target cannot be used */
+	if (target->target == NULL) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return 0;
+err:
+	module_put(target->me);
+	return ret;
+}
+
+/*
+ * Tear down a target expression: call the target's destroy hook, if it
+ * has one, and drop the module reference held on the target.
+ */
+static void
+nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+	struct xt_target *target = expr->ops->data;
+	struct xt_tgdtor_param par;
+
+	par.net = ctx->net;
+	par.target = target;
+	par.targinfo = nft_expr_priv(expr);
+	par.family = ctx->afi->family;
+
+	if (target->destroy != NULL)
+		target->destroy(&par);
+
+	module_put(target->me);
+}
+
+/*
+ * Add the target's private info @in to @skb as NFTA_TARGET_INFO,
+ * XT_ALIGN(t->targetsize) bytes long.
+ *
+ * With CONFIG_COMPAT and a target-provided compat_to_user() the data is
+ * first converted into a temporary buffer.
+ *
+ * Returns the nla_put() result, or -ENOMEM if the temporary buffer cannot
+ * be allocated.
+ */
+static int
+target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in)
+{
+#ifdef CONFIG_COMPAT
+	if (t->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+		int ret;
+
+		out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		t->compat_to_user(out, in);
+		set_fs(old_fs);
+
+		ret = nla_put(skb, NFTA_TARGET_INFO,
+			      XT_ALIGN(t->targetsize), out);
+		kfree(out);
+		return ret;
+	}
+#endif
+	return nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in);
+}
+
+/*
+ * Dump a target expression to @skb: target name, revision and info blob.
+ *
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct xt_target *target = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_TARGET_NAME, target->name))
+		return -1;
+	if (nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)))
+		return -1;
+	if (target_dump_info(skb, target, info))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Check that the target may be used from the chain in @ctx.
+ *
+ * For a base chain the chain's hook must be one of the hooks the target
+ * declares in target->hooks; otherwise -EINVAL is returned.  Non-base
+ * chains always pass.  @data is unused.
+ */
+static int nft_target_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **data)
+{
+	struct xt_target *target = expr->ops->data;
+	const struct nft_base_chain *basechain;
+	const struct nf_hook_ops *ops;
+	unsigned int hook_mask;
+
+	if (!(ctx->chain->flags & NFT_BASE_CHAIN))
+		return 0;
+
+	basechain = nft_base_chain(ctx->chain);
+	ops = &basechain->ops[0];
+	hook_mask = 1 << ops->hooknum;
+
+	/* This target is being called from an invalid chain */
+	if (!(hook_mask & target->hooks))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Run the wrapped xtables match on the packet and set the verdict
+ * register: NF_DROP if the match requested a hotdrop, NFT_CONTINUE if it
+ * matched, NFT_BREAK if it did not.
+ */
+static void nft_match_eval(const struct nft_expr *expr,
+			   struct nft_data data[NFT_REG_MAX + 1],
+			   const struct nft_pktinfo *pkt)
+{
+	struct xt_match *match = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+	bool matched;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+
+	matched = match->match(pkt->skb, (struct xt_action_param *)&pkt->xt);
+
+	if (pkt->xt.hotdrop) {
+		data[NFT_REG_VERDICT].verdict = NF_DROP;
+		return;
+	}
+
+	if (matched)
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+	else
+		data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+/*
+ * Netlink attribute policy for the match expression: NUL-terminated name,
+ * 32-bit revision and an opaque binary info blob.
+ */
+static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
+ [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_MATCH_REV] = { .type = NLA_U32 },
+ [NFTA_MATCH_INFO] = { .type = NLA_BINARY },
+};
+
+/*
+ * Fill in the xt_mtchk_param handed to xt_check_match().
+ * struct xt_mtchk_param and xt_tgchk_param look very similar.
+ *
+ * @entry receives the protocol and inversion flag in the layout matching
+ * the context's address family (other families leave it untouched).
+ *
+ * Every field read by the callers is set here because @par lives
+ * uninitialised on the caller's stack: in particular hook_mask is set to
+ * 0 for non-base chains instead of being left as stack garbage.  The
+ * network namespace is taken from the context, as nft_match_destroy()
+ * does, rather than hard-coding init_net.
+ */
+static void
+nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
+			  struct xt_match *match, void *info,
+			  union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net = ctx->net;
+	par->table = ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo = entry;
+	par->match = match;
+	par->matchinfo = info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops[0];
+
+		par->hook_mask = 1 << ops->hooknum;
+	} else {
+		/* Not attached to a hook: no hook restriction to check. */
+		par->hook_mask = 0;
+	}
+	par->family = ctx->afi->family;
+}
+
+/*
+ * Copy match info from the netlink attribute payload @in into the
+ * expression's private area @out, which is XT_ALIGN(m->matchsize) bytes.
+ *
+ * With CONFIG_COMPAT and a match-provided compat_from_user() the match
+ * converts the data itself.  Otherwise exactly m->matchsize bytes are
+ * copied.  In both cases the alignment padding is zeroed, so no bytes
+ * beyond the match's own data are read from @in or later dumped back to
+ * userspace.
+ */
+static void match_compat_from_user(struct xt_match *m, void *in, void *out)
+{
+	int pad;
+
+#ifdef CONFIG_COMPAT
+	if (m->compat_from_user)
+		m->compat_from_user(out, in);
+	else
+#endif
+		memcpy(out, in, m->matchsize);
+
+	pad = XT_ALIGN(m->matchsize) - m->matchsize;
+	if (pad > 0)
+		memset(out + m->matchsize, 0, pad);
+}
+
+/*
+ * Initialise a match expression: copy the match info from the
+ * NFTA_MATCH_INFO attribute into the expression's private area, parse the
+ * optional NFTA_RULE_COMPAT attribute, and run the xtables checks.
+ *
+ * The attribute length is checked before anything is copied, since the
+ * copy is sized by the match's matchsize and not by the attribute.
+ *
+ * Returns 0 on success.  On any failure the module reference on the match
+ * is dropped and a negative error is returned (-EINVAL for a short info
+ * attribute, otherwise the error from nft_parse_compat() or
+ * xt_check_match()).
+ */
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+	       const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+	struct xt_mtchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	/* The copy below reads matchsize bytes from the attribute. */
+	if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
+
+	if (ctx->nla[NFTA_RULE_COMPAT]) {
+		ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
+		if (ret < 0)
+			goto err;
+	}
+
+	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
+	ret = xt_check_match(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	module_put(match->me);
+	return ret;
+}
+
+/*
+ * Tear down a match expression: call the match's destroy hook, if it has
+ * one, and drop the module reference held on the match.
+ */
+static void
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+	struct xt_match *match = expr->ops->data;
+	struct xt_mtdtor_param par;
+
+	par.net = ctx->net;
+	par.match = match;
+	par.matchinfo = nft_expr_priv(expr);
+	par.family = ctx->afi->family;
+
+	if (match->destroy != NULL)
+		match->destroy(&par);
+
+	module_put(match->me);
+}
+
+/*
+ * Add the match's private info @in to @skb as NFTA_MATCH_INFO,
+ * XT_ALIGN(m->matchsize) bytes long.
+ *
+ * With CONFIG_COMPAT and a match-provided compat_to_user() the data is
+ * first converted into a temporary buffer.
+ *
+ * Returns the nla_put() result, or -ENOMEM if the temporary buffer cannot
+ * be allocated.
+ */
+static int
+match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in)
+{
+#ifdef CONFIG_COMPAT
+	if (m->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+		int ret;
+
+		out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		m->compat_to_user(out, in);
+		set_fs(old_fs);
+
+		ret = nla_put(skb, NFTA_MATCH_INFO,
+			      XT_ALIGN(m->matchsize), out);
+		kfree(out);
+		return ret;
+	}
+#endif
+	return nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in);
+}
+
+/*
+ * Return xt_compat_match_offset() for @match when CONFIG_COMPAT is
+ * enabled, 0 otherwise.  Added to the expression size in
+ * nft_match_select_ops().
+ */
+static inline int nft_compat_match_offset(struct xt_match *match)
+{
+#ifdef CONFIG_COMPAT
+ return xt_compat_match_offset(match);
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Dump a match expression to @skb: match name, revision and info blob.
+ *
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct xt_match *match = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name))
+		return -1;
+	if (nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)))
+		return -1;
+	if (match_dump_info(skb, match, info))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Check that the match may be used from the chain in @ctx.
+ *
+ * For a base chain the chain's hook must be one of the hooks the match
+ * declares in match->hooks; otherwise -EINVAL is returned.  Non-base
+ * chains always pass.  @data is unused.
+ */
+static int nft_match_validate(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nft_data **data)
+{
+	struct xt_match *match = expr->ops->data;
+	const struct nft_base_chain *basechain;
+	const struct nf_hook_ops *ops;
+	unsigned int hook_mask;
+
+	if (!(ctx->chain->flags & NFT_BASE_CHAIN))
+		return 0;
+
+	basechain = nft_base_chain(ctx->chain);
+	ops = &basechain->ops[0];
+	hook_mask = 1 << ops->hooknum;
+
+	/* This match is being called from an invalid chain */
+	if (!(hook_mask & match->hooks))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Build an nft-compat reply in @skb: an nfgenmsg header followed by the
+ * extension name, revision and type attributes.  NLM_F_MULTI is set when
+ * @portid is non-zero.  @type is unused.
+ *
+ * Returns skb->len on success.  On failure the partially built message is
+ * cancelled and -1 is returned.
+ */
+static int
+nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+		      int event, u16 family, const char *name,
+		      int rev, int target)
+{
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+
+	event |= NFNL_SUBSYS_NFT_COMPAT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto cancel;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = family;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_COMPAT_NAME, name))
+		goto cancel;
+	if (nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)))
+		goto cancel;
+	if (nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
+		goto cancel;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+/*
+ * NFNL_MSG_COMPAT_GET handler: look up an xtables match or target by
+ * name and revision (requesting the "ipt_<name>" / "ip6t_<name>" module
+ * if needed) and unicast the result back to the requester.
+ *
+ * Returns 0 on success, -EINVAL for a missing attribute or unsupported
+ * family, the error reported through xt_find_revision(), -ENOMEM or
+ * -ENOSPC if the reply cannot be built, or the netlink_unicast() error
+ * (with -EAGAIN mapped to -ENOBUFS).
+ */
+static int
+nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret = 0, target;
+	struct nfgenmsg *nfmsg;
+	const char *fmt;
+	const char *name;
+	u32 rev;
+	struct sk_buff *reply;
+
+	if (!tb[NFTA_COMPAT_NAME] || !tb[NFTA_COMPAT_REV] ||
+	    !tb[NFTA_COMPAT_TYPE])
+		return -EINVAL;
+
+	name = nla_data(tb[NFTA_COMPAT_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
+	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+
+	nfmsg = nlmsg_data(nlh);
+
+	if (nfmsg->nfgen_family == AF_INET) {
+		fmt = "ipt_%s";
+	} else if (nfmsg->nfgen_family == AF_INET6) {
+		fmt = "ip6t_%s";
+	} else {
+		pr_err("nft_compat: unsupported protocol %d\n",
+		       nfmsg->nfgen_family);
+		return -EINVAL;
+	}
+
+	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
+						 rev, target, &ret),
+				fmt, name);
+
+	if (ret < 0)
+		return ret;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	/* include the best revision for this extension in the message */
+	if (nfnl_compat_fill_info(reply, NETLINK_CB(skb).portid,
+				  nlh->nlmsg_seq,
+				  NFNL_MSG_TYPE(nlh->nlmsg_type),
+				  NFNL_MSG_COMPAT_GET,
+				  nfmsg->nfgen_family,
+				  name, ret, target) <= 0) {
+		kfree_skb(reply);
+		return -ENOSPC;
+	}
+
+	ret = netlink_unicast(nfnl, reply, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
+	if (ret > 0)
+		ret = 0;
+
+	if (ret == -EAGAIN)
+		return -ENOBUFS;
+
+	return ret;
+}
+
+/*
+ * Attribute policy for NFNL_MSG_COMPAT_GET: a NUL-terminated extension
+ * name of at most NFT_COMPAT_NAME_MAX-1 characters, plus 32-bit revision
+ * and type.
+ */
+static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
+ [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING,
+ .len = NFT_COMPAT_NAME_MAX-1 },
+ [NFTA_COMPAT_REV] = { .type = NLA_U32 },
+ [NFTA_COMPAT_TYPE] = { .type = NLA_U32 },
+};
+
+/* nfnetlink message dispatch table for the nft-compat subsystem. */
+static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
+ [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get,
+ .attr_count = NFTA_COMPAT_MAX,
+ .policy = nfnl_compat_policy_get },
+};
+
+/* nfnetlink subsystem descriptor registered by nft_compat_module_init(). */
+static const struct nfnetlink_subsystem nfnl_compat_subsys = {
+	.subsys_id	= NFNL_SUBSYS_NFT_COMPAT,
+	.name		= "nft-compat",
+	.cb		= nfnl_nft_compat_cb,
+	.cb_count	= NFNL_MSG_COMPAT_MAX,
+};
+
+static LIST_HEAD(nft_match_list); /* nft_xt wrappers added by nft_match_select_ops() */
+
+struct nft_xt { /* list node pairing expression ops with an xtables extension */
+ struct list_head head; /* linkage into nft_match_list or nft_target_list */
+ struct nft_expr_ops ops; /* ops.data is set to the xt_match / xt_target */
+};
+
+static struct nft_expr_type nft_match_type; /* forward declaration */
+
+/*
+ * Pick (or create) the nft_expr_ops wrapping the xtables match named by the
+ * NFTA_MATCH_* attributes.
+ *
+ * A wrapper already on nft_match_list with the same name, revision and
+ * family is reused; otherwise the match is looked up through
+ * xt_request_find_match() and a new wrapper is allocated and queued on the
+ * list.
+ *
+ * Returns the ops on success, or ERR_PTR(-EINVAL) when an attribute is
+ * missing, ERR_PTR(-ENOENT) when the match cannot be found and
+ * ERR_PTR(-ENOMEM) when the wrapper cannot be allocated.
+ */
+static const struct nft_expr_ops *
+nft_match_select_ops(const struct nft_ctx *ctx,
+		     const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_match;
+	struct xt_match *match;
+	char *mt_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_MATCH_NAME] == NULL ||
+	    tb[NFTA_MATCH_REV] == NULL ||
+	    tb[NFTA_MATCH_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	mt_name = nla_data(tb[NFTA_MATCH_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
+	family = ctx->afi->family;
+
+	/* Re-use the existing match if it's already loaded. */
+	list_for_each_entry(nft_match, &nft_match_list, head) {
+		match = nft_match->ops.data;
+
+		if (strcmp(match->name, mt_name) == 0 &&
+		    match->revision == rev && match->family == family)
+			return &nft_match->ops;
+	}
+
+	match = xt_request_find_match(family, mt_name, rev);
+	if (IS_ERR(match))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this match, allocate operations */
+	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_match == NULL) {
+		/*
+		 * The lookup above handed us the match with a reference on
+		 * its module; nothing will own it now, so drop it.
+		 */
+		module_put(match->me);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nft_match->ops.type = &nft_match_type;
+	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) +
+					    nft_compat_match_offset(match));
+	nft_match->ops.eval = nft_match_eval;
+	nft_match->ops.init = nft_match_init;
+	nft_match->ops.destroy = nft_match_destroy;
+	nft_match->ops.dump = nft_match_dump;
+	nft_match->ops.validate = nft_match_validate;
+	nft_match->ops.data = match;
+
+	list_add(&nft_match->head, &nft_match_list);
+
+	return &nft_match->ops;
+}
+
+/*
+ * Free every nft_xt wrapper on nft_match_list.  Entries are freed without
+ * being unlinked, so the list must not be walked again afterwards.
+ */
+static void nft_match_release(void)
+{
+	struct nft_xt *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &nft_match_list, head) {
+		kfree(cur);
+	}
+}
+
+/* "match" expression type; concrete ops are chosen per xtables match. */
+static struct nft_expr_type nft_match_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.name		= "match",
+	.policy		= nft_match_policy,
+	.maxattr	= NFTA_MATCH_MAX,
+	.select_ops	= nft_match_select_ops,
+};
+
+static LIST_HEAD(nft_target_list); /* nft_xt wrappers added by nft_target_select_ops() */
+
+static struct nft_expr_type nft_target_type; /* forward declaration */
+
+/*
+ * Pick (or create) the nft_expr_ops wrapping the xtables target named by
+ * the NFTA_TARGET_* attributes.
+ *
+ * A wrapper already on nft_target_list with the same name, revision and
+ * family is reused; otherwise the target is looked up through
+ * xt_request_find_target() and a new wrapper is allocated and queued on the
+ * list.
+ *
+ * Returns the ops on success, or ERR_PTR(-EINVAL) when an attribute is
+ * missing, ERR_PTR(-ENOENT) when the target cannot be found and
+ * ERR_PTR(-ENOMEM) when the wrapper cannot be allocated.
+ */
+static const struct nft_expr_ops *
+nft_target_select_ops(const struct nft_ctx *ctx,
+		      const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_target;
+	struct xt_target *target;
+	char *tg_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_TARGET_NAME] == NULL ||
+	    tb[NFTA_TARGET_REV] == NULL ||
+	    tb[NFTA_TARGET_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	tg_name = nla_data(tb[NFTA_TARGET_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
+	family = ctx->afi->family;
+
+	/*
+	 * Re-use the existing target if it's already loaded.  This must walk
+	 * the target list: entries on nft_match_list carry an xt_match in
+	 * ops.data, not an xt_target.
+	 */
+	list_for_each_entry(nft_target, &nft_target_list, head) {
+		target = nft_target->ops.data;
+
+		if (strcmp(target->name, tg_name) == 0 &&
+		    target->revision == rev && target->family == family)
+			return &nft_target->ops;
+	}
+
+	target = xt_request_find_target(family, tg_name, rev);
+	if (IS_ERR(target))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this target, allocate operations */
+	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+	if (nft_target == NULL) {
+		/*
+		 * The lookup above handed us the target with a reference on
+		 * its module; nothing will own it now, so drop it.
+		 */
+		module_put(target->me);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nft_target->ops.type = &nft_target_type;
+	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+					     nft_compat_target_offset(target));
+	nft_target->ops.eval = nft_target_eval;
+	nft_target->ops.init = nft_target_init;
+	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.dump = nft_target_dump;
+	nft_target->ops.validate = nft_target_validate;
+	nft_target->ops.data = target;
+
+	list_add(&nft_target->head, &nft_target_list);
+
+	return &nft_target->ops;
+}
+
+/*
+ * Free every nft_xt wrapper on nft_target_list.  Entries are freed without
+ * being unlinked, so the list must not be walked again afterwards.
+ */
+static void nft_target_release(void)
+{
+	struct nft_xt *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &nft_target_list, head) {
+		kfree(cur);
+	}
+}
+
+/* "target" expression type; concrete ops are chosen per xtables target. */
+static struct nft_expr_type nft_target_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.name		= "target",
+	.policy		= nft_target_policy,
+	.maxattr	= NFTA_TARGET_MAX,
+	.select_ops	= nft_target_select_ops,
+};
+
+/*
+ * Module load: register the "match" and "target" expression types and the
+ * nft-compat nfnetlink subsystem.  Whatever was registered before a failing
+ * step is unregistered again, and the failing step's error is returned.
+ */
+static int __init nft_compat_module_init(void)
+{
+	int err;
+
+	err = nft_register_expr(&nft_match_type);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_target_type);
+	if (err < 0) {
+		nft_unregister_expr(&nft_match_type);
+		return err;
+	}
+
+	err = nfnetlink_subsys_register(&nfnl_compat_subsys);
+	if (err < 0) {
+		pr_err("nft_compat: cannot register with nfnetlink.\n");
+		nft_unregister_expr(&nft_target_type);
+		nft_unregister_expr(&nft_match_type);
+		return err;
+	}
+
+	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+	return err;
+}
+
+/*
+ * Module unload: unregister the nfnetlink subsystem and both expression
+ * types, then free the cached nft_xt wrappers.
+ */
+static void __exit nft_compat_module_exit(void)
+{
+	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+	nft_unregister_expr(&nft_target_type);
+	nft_unregister_expr(&nft_match_type);
+
+	/* The two wrapper lists are independent of each other. */
+	nft_target_release();
+	nft_match_release();
+}
+
+/* Module entry and exit points. */
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+/* Module metadata and aliases. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 00000000000..c89ee486ce5
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter { /* private data of a "counter" expression */
+ seqlock_t lock; /* taken around updates and reads of both counters */
+ u64 bytes; /* accumulated skb->len of evaluated packets */
+ u64 packets; /* incremented once per evaluation */
+};
+
+/*
+ * Account one packet: bump the packet count and add the skb length to the
+ * byte count, both under the counter's write seqlock.
+ */
+static void nft_counter_eval(const struct nft_expr *expr,
+			     struct nft_data data[NFT_REG_MAX + 1],
+			     const struct nft_pktinfo *pkt)
+{
+	struct nft_counter *ctr = nft_expr_priv(expr);
+
+	write_seqlock_bh(&ctr->lock);
+	ctr->packets++;
+	ctr->bytes += pkt->skb->len;
+	write_sequnlock_bh(&ctr->lock);
+}
+
+/*
+ * Emit NFTA_COUNTER_BYTES and NFTA_COUNTER_PACKETS (big endian) into @skb.
+ * Both values are sampled inside one seqlock read section, retried until
+ * no writer interfered.  Returns 0 on success, -1 if an attribute does not
+ * fit.
+ */
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_counter *ctr = nft_expr_priv(expr);
+	u64 nbytes, npackets;
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&ctr->lock);
+		nbytes = ctr->bytes;
+		npackets = ctr->packets;
+	} while (read_seqretry(&ctr->lock, seq));
+
+	if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(nbytes)) ||
+	    nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(npackets)))
+		return -1;
+
+	return 0;
+}
+
+/* Attribute policy: both counter attributes are 64-bit values. */
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+};
+
+/*
+ * Initialise the counter's seqlock and, for each of NFTA_COUNTER_BYTES and
+ * NFTA_COUNTER_PACKETS that is present, seed the matching counter from it.
+ * Always returns 0.
+ */
+static int nft_counter_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_counter *ctr = nft_expr_priv(expr);
+	const struct nlattr *bytes_attr = tb[NFTA_COUNTER_BYTES];
+	const struct nlattr *packets_attr = tb[NFTA_COUNTER_PACKETS];
+
+	seqlock_init(&ctr->lock);
+
+	if (bytes_attr)
+		ctr->bytes = be64_to_cpu(nla_get_be64(bytes_attr));
+	if (packets_attr)
+		ctr->packets = be64_to_cpu(nla_get_be64(packets_attr));
+
+	return 0;
+}
+
+static struct nft_expr_type nft_counter_type;
+
+/* Operations of the "counter" expression. */
+static const struct nft_expr_ops nft_counter_ops = {
+	.type	= &nft_counter_type,
+	.size	= NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+	.init	= nft_counter_init,
+	.eval	= nft_counter_eval,
+	.dump	= nft_counter_dump,
+};
+
+/* "counter" expression type with a single fixed set of ops. */
+static struct nft_expr_type nft_counter_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.name		= "counter",
+	.policy		= nft_counter_policy,
+	.maxattr	= NFTA_COUNTER_MAX,
+	.ops		= &nft_counter_ops,
+};
+
+/* Module load: register the "counter" expression type. */
+static int __init nft_counter_module_init(void)
+{
+	int err = nft_register_expr(&nft_counter_type);
+
+	return err;
+}
+
+/* Module unload: unregister the "counter" expression type. */
+static void __exit nft_counter_module_exit(void)
+{
+	nft_unregister_expr(&nft_counter_type);
+}
+
+/* Module metadata and alias. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
+
+/* Module entry and exit points. */
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 00000000000..cc560301624
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+struct nft_ct { /* private data of a "ct" expression */
+ enum nft_ct_keys key:8; /* conntrack property to get or set */
+ enum ip_conntrack_dir dir:8; /* tuple direction, used by tuple-based keys */
+ union {
+ enum nft_registers dreg:8; /* destination register (get ops) */
+ enum nft_registers sreg:8; /* source register (set ops) */
+ };
+};
+
+/*
+ * Load the conntrack property selected by priv->key into register
+ * priv->dreg.
+ *
+ * NFT_CT_STATE is answered even when the packet has no conntrack entry.
+ * Every other key needs one; if it is missing, or if the helper lookup for
+ * NFT_CT_HELPER fails, the verdict register is set to NFT_BREAK instead.
+ */
+static void nft_ct_get_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+	struct nft_data *dest = &data[priv->dreg];
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_tuple *tuple;
+	const struct nf_conntrack_helper *helper;
+	long diff;
+	unsigned int state;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+
+	/* Keys that are valid without a conntrack entry. */
+	switch (priv->key) {
+	case NFT_CT_STATE:
+		if (ct == NULL)
+			state = NF_CT_STATE_INVALID_BIT;
+		else if (nf_ct_is_untracked(ct))
+			state = NF_CT_STATE_UNTRACKED_BIT;
+		else
+			state = NF_CT_STATE_BIT(ctinfo);
+		dest->data[0] = state;
+		return;
+	default:
+		break;
+	}
+
+	if (ct == NULL)
+		goto err;
+
+	/* Keys that only need the conntrack entry itself. */
+	switch (priv->key) {
+	case NFT_CT_DIRECTION:
+		dest->data[0] = CTINFO2DIR(ctinfo);
+		return;
+	case NFT_CT_STATUS:
+		dest->data[0] = ct->status;
+		return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+		dest->data[0] = ct->mark;
+		return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	case NFT_CT_SECMARK:
+		dest->data[0] = ct->secmark;
+		return;
+#endif
+	case NFT_CT_EXPIRATION:
+		/*
+		 * Remaining lifetime is expiry minus now; the reverse is
+		 * negative for every live entry and would clamp to 0.
+		 */
+		diff = (long)ct->timeout.expires - (long)jiffies;
+		if (diff < 0)
+			diff = 0;
+		dest->data[0] = jiffies_to_msecs(diff);
+		return;
+	case NFT_CT_HELPER:
+		if (ct->master == NULL)
+			goto err;
+		help = nfct_help(ct->master);
+		if (help == NULL)
+			goto err;
+		helper = rcu_dereference(help->helper);
+		if (helper == NULL)
+			goto err;
+		/* The name plus its terminator must fit in the register. */
+		if (strlen(helper->name) >= sizeof(dest->data))
+			goto err;
+		strncpy((char *)dest->data, helper->name, sizeof(dest->data));
+		return;
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	case NFT_CT_LABELS: {
+		struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+		unsigned int size;
+
+		if (!labels) {
+			memset(dest->data, 0, sizeof(dest->data));
+			return;
+		}
+
+		BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data));
+		size = labels->words * sizeof(long);
+
+		/* Copy the label words and zero the rest of the register. */
+		memcpy(dest->data, labels->bits, size);
+		if (size < sizeof(dest->data))
+			memset(((char *) dest->data) + size, 0,
+			       sizeof(dest->data) - size);
+		return;
+	}
+#endif
+	default:
+		break;
+	}
+
+	/* Keys read from the tuple of the configured direction. */
+	tuple = &ct->tuplehash[priv->dir].tuple;
+	switch (priv->key) {
+	case NFT_CT_L3PROTOCOL:
+		dest->data[0] = nf_ct_l3num(ct);
+		return;
+	case NFT_CT_SRC:
+		memcpy(dest->data, tuple->src.u3.all,
+		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+		return;
+	case NFT_CT_DST:
+		memcpy(dest->data, tuple->dst.u3.all,
+		       nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+		return;
+	case NFT_CT_PROTOCOL:
+		dest->data[0] = nf_ct_protonum(ct);
+		return;
+	case NFT_CT_PROTO_SRC:
+		dest->data[0] = (__force __u16)tuple->src.u.all;
+		return;
+	case NFT_CT_PROTO_DST:
+		dest->data[0] = (__force __u16)tuple->dst.u.all;
+		return;
+	default:
+		break;
+	}
+	return;
+err:
+	data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+/*
+ * Store the value of register priv->sreg into the packet's conntrack
+ * entry.  Only NFT_CT_MARK is handled (when CONFIG_NF_CONNTRACK_MARK is
+ * set): the mark is rewritten and an IPCT_MARK event cached only if the
+ * value actually changes.  Packets without a conntrack entry are ignored.
+ */
+static void nft_ct_set_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	u32 new_mark = data[priv->sreg].data[0];
+#endif
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+
+	if (!ct)
+		return;
+
+	switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+		if (ct->mark == new_mark)
+			break;
+		ct->mark = new_mark;
+		nf_conntrack_event_cache(IPCT_MARK, ct);
+		break;
+#endif
+	}
+}
+
+/* Attribute policy shared by the get and set flavours of "ct". */
+static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
+	[NFTA_CT_KEY]		= { .type = NLA_U32 },
+	[NFTA_CT_DIRECTION]	= { .type = NLA_U8 },
+	[NFTA_CT_DREG]		= { .type = NLA_U32 },
+	[NFTA_CT_SREG]		= { .type = NLA_U32 },
+};
+
+/*
+ * Take the conntrack l3proto module reference(s) for @family.  For
+ * NFPROTO_INET both the IPv4 and IPv6 modules are taken; if IPv6 fails the
+ * IPv4 reference is released again.  Returns 0 or a negative error.
+ */
+static int nft_ct_l3proto_try_module_get(uint8_t family)
+{
+	int err;
+
+	if (family != NFPROTO_INET) {
+		err = nf_ct_l3proto_try_module_get(family);
+		return err < 0 ? err : 0;
+	}
+
+	err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4);
+	if (err < 0)
+		return err;
+
+	err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6);
+	if (err < 0) {
+		nf_ct_l3proto_module_put(NFPROTO_IPV4);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Release the l3proto module reference(s) for @family; NFPROTO_INET
+ * releases both the IPv4 and the IPv6 reference.
+ */
+static void nft_ct_l3proto_module_put(uint8_t family)
+{
+	if (family != NFPROTO_INET) {
+		nf_ct_l3proto_module_put(family);
+		return;
+	}
+
+	nf_ct_l3proto_module_put(NFPROTO_IPV4);
+	nf_ct_l3proto_module_put(NFPROTO_IPV6);
+}
+
+/*
+ * Set up a "ct" get expression from its netlink attributes.
+ *
+ * Tuple-based keys must come with NFTA_CT_DIRECTION and all other
+ * supported keys must come without it (-EINVAL otherwise); unknown keys
+ * yield -EOPNOTSUPP.  The destination register is validated and the
+ * l3proto module reference(s) for the table family are taken last.
+ */
+static int nft_ct_get_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_ct *priv = nft_expr_priv(expr);
+	const struct nlattr *dir_attr = tb[NFTA_CT_DIRECTION];
+	bool needs_dir;
+	int err;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+	switch (priv->key) {
+	case NFT_CT_STATE:
+	case NFT_CT_DIRECTION:
+	case NFT_CT_STATUS:
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	case NFT_CT_SECMARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	case NFT_CT_LABELS:
+#endif
+	case NFT_CT_EXPIRATION:
+	case NFT_CT_HELPER:
+		needs_dir = false;
+		break;
+	case NFT_CT_L3PROTOCOL:
+	case NFT_CT_PROTOCOL:
+	case NFT_CT_SRC:
+	case NFT_CT_DST:
+	case NFT_CT_PROTO_SRC:
+	case NFT_CT_PROTO_DST:
+		needs_dir = true;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* The direction attribute must be present exactly when needed. */
+	if (needs_dir != (dir_attr != NULL))
+		return -EINVAL;
+
+	if (dir_attr != NULL) {
+		priv->dir = nla_get_u8(dir_attr);
+		if (priv->dir != IP_CT_DIR_ORIGINAL &&
+		    priv->dir != IP_CT_DIR_REPLY)
+			return -EINVAL;
+	}
+
+	priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG]));
+	err = nft_validate_output_register(priv->dreg);
+	if (err < 0)
+		return err;
+
+	err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
+	if (err < 0)
+		return err;
+
+	err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Set up a "ct" set expression.  Only NFT_CT_MARK is accepted (and only
+ * with CONFIG_NF_CONNTRACK_MARK); any other key yields -EOPNOTSUPP.  The
+ * source register is validated, then the l3proto module reference(s) for
+ * the table family are taken.
+ */
+static int nft_ct_set_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_ct *priv = nft_expr_priv(expr);
+	int rc;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+	switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	case NFT_CT_MARK:
+		break;
+#endif
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG]));
+	rc = nft_validate_input_register(priv->sreg);
+	if (rc < 0)
+		return rc;
+
+	rc = nft_ct_l3proto_try_module_get(ctx->afi->family);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+/* Drop the l3proto module reference(s) taken by the init functions. */
+static void nft_ct_destroy(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr)
+{
+	uint8_t family = ctx->afi->family;
+
+	nft_ct_l3proto_module_put(family);
+}
+
+/*
+ * Dump a "ct" get expression: destination register, key and, for the
+ * tuple-based keys, the direction.  Returns 0 on success, -1 if an
+ * attribute does not fit into @skb.
+ */
+static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_ct *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+
+	/*
+	 * Must list every key for which nft_ct_get_init() requires
+	 * NFTA_CT_DIRECTION, otherwise a dumped rule cannot be loaded back.
+	 */
+	switch (priv->key) {
+	case NFT_CT_L3PROTOCOL:
+	case NFT_CT_PROTOCOL:
+	case NFT_CT_SRC:
+	case NFT_CT_DST:
+	case NFT_CT_PROTO_SRC:
+	case NFT_CT_PROTO_DST:
+		if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+			goto nla_put_failure;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/*
+ * Dump a "ct" set expression: source register followed by the key.
+ * Returns 0 on success, -1 if an attribute does not fit into @skb.
+ */
+static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_ct *ct_priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_CT_SREG, htonl(ct_priv->sreg)) ||
+	    nla_put_be32(skb, NFTA_CT_KEY, htonl(ct_priv->key)))
+		return -1;
+
+	return 0;
+}
+
+static struct nft_expr_type nft_ct_type;
+
+/* Ops used when the expression reads a conntrack property (NFTA_CT_DREG). */
+static const struct nft_expr_ops nft_ct_get_ops = {
+	.type		= &nft_ct_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+	.init		= nft_ct_get_init,
+	.destroy	= nft_ct_destroy,
+	.eval		= nft_ct_get_eval,
+	.dump		= nft_ct_get_dump,
+};
+
+/* Ops used when the expression writes a conntrack property (NFTA_CT_SREG). */
+static const struct nft_expr_ops nft_ct_set_ops = {
+	.type		= &nft_ct_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+	.init		= nft_ct_set_init,
+	.destroy	= nft_ct_destroy,
+	.eval		= nft_ct_set_eval,
+	.dump		= nft_ct_set_dump,
+};
+
+/*
+ * Choose between the get and set ops: NFTA_CT_KEY is mandatory and exactly
+ * one of NFTA_CT_DREG (get) or NFTA_CT_SREG (set) must be supplied.
+ * Anything else yields ERR_PTR(-EINVAL).
+ */
+static const struct nft_expr_ops *
+nft_ct_select_ops(const struct nft_ctx *ctx,
+		  const struct nlattr * const tb[])
+{
+	const bool has_dreg = tb[NFTA_CT_DREG] != NULL;
+	const bool has_sreg = tb[NFTA_CT_SREG] != NULL;
+
+	/* Both registers or neither register is an error. */
+	if (tb[NFTA_CT_KEY] == NULL || has_dreg == has_sreg)
+		return ERR_PTR(-EINVAL);
+
+	return has_dreg ? &nft_ct_get_ops : &nft_ct_set_ops;
+}
+
+/* "ct" expression type; get or set ops are picked by nft_ct_select_ops(). */
+static struct nft_expr_type nft_ct_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.name		= "ct",
+	.policy		= nft_ct_policy,
+	.maxattr	= NFTA_CT_MAX,
+	.select_ops	= nft_ct_select_ops,
+};
+
+/* Module load: register the "ct" expression type. */
+static int __init nft_ct_module_init(void)
+{
+	int err = nft_register_expr(&nft_ct_type);
+
+	return err;
+}
+
+/* Module unload: unregister the "ct" expression type. */
+static void __exit nft_ct_module_exit(void)
+{
+	nft_unregister_expr(&nft_ct_type);
+}
+
+/* Module metadata and alias. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("ct");
+
+/* Module entry and exit points. */
+module_init(nft_ct_module_init);
+module_exit(nft_ct_module_exit);
diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c
new file mode 100644
index 00000000000..b6eed4d5a09
--- /dev/null
+++ b/net/netfilter/nft_expr_template.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_template { /* private expression data; no fields in this skeleton */
+
+};
+
+/* Evaluation hook of the skeleton expression; the body is a placeholder. */
+static void nft_template_eval(const struct nft_expr *expr,
+			      struct nft_data data[NFT_REG_MAX + 1],
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_template *tmpl = nft_expr_priv(expr);
+
+	/* TODO: evaluate the expression here. */
+}
+
+/* Attribute policy of the skeleton expression: one 32-bit attribute. */
+static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = {
+	[NFTA_TEMPLATE_ATTR] = {
+		.type = NLA_U32,
+	},
+};
+
+/* Init hook of the skeleton expression; does no setup and returns 0. */
+static int nft_template_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct nft_template *tmpl = nft_expr_priv(expr);
+
+	/* TODO: parse attributes from tb[] into the private data. */
+	return 0;
+}
+
+/* Destroy hook of the skeleton expression; the body is a placeholder. */
+static void nft_template_destroy(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr)
+{
+	struct nft_template *tmpl = nft_expr_priv(expr);
+
+	/* TODO: release whatever the init hook acquired. */
+}
+
+/*
+ * Dump hook of the skeleton expression: emits NFTA_TEMPLATE_ATTR.  Returns
+ * 0 on success, -1 if the attribute does not fit into @skb.
+ *
+ * NOTE(review): priv->field is a placeholder; struct nft_template declares
+ * no such member in this skeleton.
+ */
+static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_template *priv = nft_expr_priv(expr);
+
+	/* Same nla_put_*() + goto form as the other nft_* expressions. */
+	if (nla_put_be32(skb, NFTA_TEMPLATE_ATTR, priv->field))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_template_type;
+
+/* Operations of the skeleton "template" expression. */
+static const struct nft_expr_ops nft_template_ops = {
+	.type		= &nft_template_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_template)),
+	.init		= nft_template_init,
+	.destroy	= nft_template_destroy,
+	.eval		= nft_template_eval,
+	.dump		= nft_template_dump,
+};
+
+/* "template" expression type with a single fixed set of ops. */
+static struct nft_expr_type nft_template_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.name		= "template",
+	.policy		= nft_template_policy,
+	.maxattr	= NFTA_TEMPLATE_MAX,
+	.ops		= &nft_template_ops,
+};
+
+/* Module load: register the "template" expression type. */
+static int __init nft_template_module_init(void)
+{
+	int err = nft_register_expr(&nft_template_type);
+
+	return err;
+}
+
+/* Module unload: unregister the "template" expression type. */
+static void __exit nft_template_module_exit(void)
+{
+	nft_unregister_expr(&nft_template_type);
+}
+
+/* Module metadata and alias. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("template");
+
+/* Module entry and exit points. */
+module_init(nft_template_module_init);
+module_exit(nft_template_module_exit);
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644