diff options
Diffstat (limited to 'net/ipv4')
145 files changed, 3834 insertions, 19552 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9e8ef509c51..e62aee0ec4c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -574,6 +574,33 @@ config TCP_CONG_VENO loss packets. See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf +config TCP_CONG_YEAH + tristate "YeAH TCP" + depends on EXPERIMENTAL + default n + ---help--- + YeAH-TCP is a sender-side high-speed enabled TCP congestion control + algorithm, which uses a mixed loss/delay approach to compute the + congestion window. It's design goals target high efficiency, + internal, RTT and Reno fairness, resilience to link loss while + keeping network elements load as low as possible. + + For further details look here: + http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + +config TCP_CONG_ILLINOIS + tristate "TCP Illinois" + depends on EXPERIMENTAL + default n + ---help--- + TCP-Illinois is a sender-side modificatio of TCP Reno for + high speed long delay links. It uses round-trip-time to + adjust the alpha and beta parameters to achieve a higher average + throughput and maintain fairness. + + For further details see: + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 7a068626fee..4ff6c151d7f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -49,6 +49,8 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o +obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o +obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cf358c84c44..16aae8ef555 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -87,6 +87,7 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/netfilter_ipv4.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -217,6 +218,26 @@ out: return err; } +u32 inet_ehash_secret __read_mostly; +EXPORT_SYMBOL(inet_ehash_secret); + +/* + * inet_ehash_secret must be set exactly once + * Instead of using a dedicated spinlock, we (ab)use inetsw_lock + */ +void build_ehash_secret(void) +{ + u32 rnd; + do { + get_random_bytes(&rnd, sizeof(rnd)); + } while (rnd == 0); + spin_lock_bh(&inetsw_lock); + if (!inet_ehash_secret) + inet_ehash_secret = rnd; + spin_unlock_bh(&inetsw_lock); +} +EXPORT_SYMBOL(build_ehash_secret); + /* * Create an inet socket. */ @@ -233,6 +254,11 @@ static int inet_create(struct socket *sock, int protocol) int try_loading_module = 0; int err; + if (sock->type != SOCK_RAW && + sock->type != SOCK_DGRAM && + !inet_ehash_secret) + build_ehash_secret(); + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ @@ -755,6 +781,9 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGSTAMP: err = sock_get_timestamp(sk, (struct timeval __user *)arg); break; + case SIOCGSTAMPNS: + err = sock_get_timestampns(sk, (struct timespec __user *)arg); + break; case SIOCADDRT: case SIOCDELRT: case SIOCRTMSG: @@ -1109,7 +1138,7 @@ static int inet_gso_send_check(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; @@ -1117,8 +1146,9 @@ static int inet_gso_send_check(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - skb->h.raw = __skb_pull(skb, ihl); - iph = skb->nh.iph; + __skb_pull(skb, ihl); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); err = -EPROTONOSUPPORT; @@ -1152,7 +1182,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; @@ -1160,8 +1190,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - skb->h.raw = __skb_pull(skb, ihl); - iph = skb->nh.iph; + __skb_pull(skb, ihl); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); id = ntohs(iph->id); proto = iph->protocol & (MAX_INET_PROTOS - 1); segs = ERR_PTR(-EPROTONOSUPPORT); @@ -1177,17 +1208,57 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) skb = segs; do { - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->id = htons(id++); iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; - iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); } while ((skb = skb->next)); out: return segs; } +unsigned long snmp_fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for_each_possible_cpu(i) { + res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} +EXPORT_SYMBOL_GPL(snmp_fold_field); + +int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +{ + BUG_ON(ptr == NULL); + ptr[0] = __alloc_percpu(mibsize); + if (!ptr[0]) + goto err0; + ptr[1] = __alloc_percpu(mibsize); + if (!ptr[1]) + goto err1; + return 0; +err1: + free_percpu(ptr[0]); + ptr[0] = NULL; +err0: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(snmp_mib_init); + +void snmp_mib_free(void *ptr[2]) +{ + BUG_ON(ptr == NULL); + free_percpu(ptr[0]); + free_percpu(ptr[1]); + ptr[0] = ptr[1] = NULL; +} +EXPORT_SYMBOL_GPL(snmp_mib_free); + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1214,28 +1285,47 @@ static struct net_protocol icmp_protocol = { static int __init init_ipv4_mibs(void) { - net_statistics[0] = alloc_percpu(struct linux_mib); - net_statistics[1] = alloc_percpu(struct linux_mib); - ip_statistics[0] = alloc_percpu(struct ipstats_mib); - ip_statistics[1] = alloc_percpu(struct ipstats_mib); - icmp_statistics[0] = alloc_percpu(struct icmp_mib); - icmp_statistics[1] = alloc_percpu(struct icmp_mib); - tcp_statistics[0] = alloc_percpu(struct tcp_mib); - tcp_statistics[1] = alloc_percpu(struct tcp_mib); - udp_statistics[0] = alloc_percpu(struct udp_mib); - udp_statistics[1] = alloc_percpu(struct udp_mib); - udplite_statistics[0] = alloc_percpu(struct udp_mib); - udplite_statistics[1] = alloc_percpu(struct udp_mib); - if (! - (net_statistics[0] && net_statistics[1] && ip_statistics[0] - && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] - && udp_statistics[0] && udp_statistics[1] - && udplite_statistics[0] && udplite_statistics[1] ) ) - return -ENOMEM; - - (void) tcp_mib_init(); + if (snmp_mib_init((void **)net_statistics, + sizeof(struct linux_mib), + __alignof__(struct linux_mib)) < 0) + goto err_net_mib; + if (snmp_mib_init((void **)ip_statistics, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp_mib_init((void **)icmp_statistics, + sizeof(struct icmp_mib), + __alignof__(struct icmp_mib)) < 0) + goto err_icmp_mib; + if (snmp_mib_init((void **)tcp_statistics, + sizeof(struct tcp_mib), + __alignof__(struct tcp_mib)) < 0) + goto err_tcp_mib; + if (snmp_mib_init((void **)udp_statistics, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udp_mib; + if (snmp_mib_init((void **)udplite_statistics, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udplite_mib; + + tcp_mib_init(); return 0; + +err_udplite_mib: + snmp_mib_free((void **)udp_statistics); +err_udp_mib: + snmp_mib_free((void **)tcp_statistics); +err_tcp_mib: + snmp_mib_free((void **)icmp_statistics); +err_icmp_mib: + snmp_mib_free((void **)ip_statistics); +err_ip_mib: + snmp_mib_free((void **)net_statistics); +err_net_mib: + return -ENOMEM; } static int ipv4_proc_init(void); @@ -1336,7 +1426,7 @@ static int __init inet_init(void) * Initialise per-cpu ipv4 mibs */ - if(init_ipv4_mibs()) + if (init_ipv4_mibs()) printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; ipv4_proc_init(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 7194eb40b6d..6da8ff597ad 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -65,7 +65,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) char buf[60]; } tmp_iph; - top_iph = skb->nh.iph; + top_iph = ip_hdr(skb); iph = &tmp_iph.iph; iph->tos = top_iph->tos; @@ -152,9 +152,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; ah = (struct ip_auth_hdr*)skb->data; - iph = skb->nh.iph; + iph = ip_hdr(skb); - ihl = skb->data - skb->nh.raw; + ihl = skb->data - skb_network_header(skb); memcpy(work_buf, iph, ihl); iph->ttl = 0; @@ -181,7 +181,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_buf, ihl); + skb->transport_header = skb->network_header; __skb_pull(skb, ah_hlen + ihl); return 0; @@ -196,8 +198,8 @@ static void ah4_err(struct sk_buff *skb, u32 info) struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (skb->h.icmph->type != ICMP_DEST_UNREACH || - skb->h.icmph->code != ICMP_FRAG_NEEDED) + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1a3488a83f4..7110779a024 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -342,13 +342,13 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) - saddr = skb->nh.iph->saddr; + if (skb && inet_addr_type(ip_hdr(skb)->saddr) == RTN_LOCAL) + saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; - saddr = skb->nh.iph->saddr; + saddr = ip_hdr(skb)->saddr; if (inet_addr_type(saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) @@ -578,7 +578,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, return NULL; skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); @@ -721,7 +721,7 @@ static int arp_process(struct sk_buff *skb) if (in_dev == NULL) goto out; - arp = skb->nh.arph; + arp = arp_hdr(skb); switch (dev_type) { default: @@ -937,7 +937,7 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, (2 * sizeof(u32))))) goto freeskb; - arp = skb->nh.arph; + arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || @@ -1178,7 +1178,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg) goto out; } - switch(cmd) { + switch (cmd) { case SIOCDARP: err = arp_req_delete(&r, dev); break; @@ -1360,7 +1360,7 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) /* ------------------------------------------------------------------------ */ -static struct seq_operations arp_seq_ops = { +static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2ce5b693a8b..e1f18489db1 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -92,6 +92,33 @@ int cipso_v4_rbm_optfmt = 0; int cipso_v4_rbm_strictvalid = 1; /* + * Protocol Constants + */ + +/* Maximum size of the CIPSO IP option, derived from the fact that the maximum + * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ +#define CIPSO_V4_OPT_LEN_MAX 40 + +/* Length of the base CIPSO option, this includes the option type (1 byte), the + * option length (1 byte), and the DOI (4 bytes). */ +#define CIPSO_V4_HDR_LEN 6 + +/* Base length of the restrictive category bitmap tag (tag #1). */ +#define CIPSO_V4_TAG_RBM_BLEN 4 + +/* Base length of the enumerated category tag (tag #2). */ +#define CIPSO_V4_TAG_ENUM_BLEN 4 + +/* Base length of the ranged categories bitmap tag (tag #5). */ +#define CIPSO_V4_TAG_RNG_BLEN 4 +/* The maximum number of category ranges permitted in the ranged category tag + * (tag #5). You may note that the IETF draft states that the maximum number + * of category ranges is 7, but if the low end of the last category range is + * zero then it is possibile to fit 8 category ranges because the zero should + * be omitted. */ +#define CIPSO_V4_TAG_RNG_CAT_MAX 8 + +/* * Helper Functions */ @@ -1109,16 +1136,15 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, unsigned char *net_cat, u32 net_cat_len) { - /* The constant '16' is not random, it is the maximum number of - * high/low category range pairs as permitted by the CIPSO draft based - * on a maximum IPv4 header length of 60 bytes - the BUG_ON() assertion - * does a sanity check to make sure we don't overflow the array. */ int iter = -1; - u16 array[16]; + u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; - BUG_ON(net_cat_len > 30); + /* make sure we don't overflow the 'array[]' variable */ + if (net_cat_len > + (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) + return -ENOSPC; for (;;) { iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); @@ -1174,7 +1200,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, u16 cat_low; u16 cat_high; - for(net_iter = 0; net_iter < net_cat_len; net_iter += 4) { + for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = ntohs(*((__be16 *)&net_cat[net_iter])); if ((net_iter + 4) <= net_cat_len) cat_low = ntohs(*((__be16 *)&net_cat[net_iter + 2])); @@ -1196,9 +1222,6 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, * Protocol Handling Functions */ -#define CIPSO_V4_OPT_LEN_MAX 40 -#define CIPSO_V4_HDR_LEN 6 - /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition @@ -1676,7 +1699,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - if (skb->nh.iph->protocol == IPPROTO_ICMP || error != -EACCES) + if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; if (gateway) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 98a00d0edc7..088888db8b3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -48,7 +48,6 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> @@ -62,7 +61,7 @@ #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> -#include <net/netlink.h> +#include <net/rtnetlink.h> struct ipv4_devconf ipv4_devconf = { .accept_redirects = 1, @@ -633,7 +632,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) dev_load(ifr.ifr_name); #endif - switch(cmd) { + switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ @@ -708,7 +707,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; - switch(cmd) { + switch (cmd) { case SIOCGIFADDR: /* Get interface address */ sin->sin_addr.s_addr = ifa->ifa_local; goto rarok; @@ -1183,17 +1182,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) int s_ip_idx, s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; - read_lock(&dev_base_lock); for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; if (idx > s_idx) s_ip_idx = 0; - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { - rcu_read_unlock(); + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) continue; - } for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; ifa = ifa->ifa_next, ip_idx++) { @@ -1201,16 +1196,12 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) continue; if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) <= 0) { - rcu_read_unlock(); + RTM_NEWADDR, NLM_F_MULTI) <= 0) goto done; - } } - rcu_read_unlock(); } done: - read_unlock(&dev_base_lock); cb->args[0] = idx; cb->args[1] = ip_idx; @@ -1241,19 +1232,6 @@ errout: rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err); } -static struct rtnetlink_link inet_rtnetlink_table[RTM_NR_MSGTYPES] = { - [RTM_NEWADDR - RTM_BASE] = { .doit = inet_rtm_newaddr, }, - [RTM_DELADDR - RTM_BASE] = { .doit = inet_rtm_deladdr, }, - [RTM_GETADDR - RTM_BASE] = { .dumpit = inet_dump_ifaddr, }, - [RTM_NEWROUTE - RTM_BASE] = { .doit = inet_rtm_newroute, }, - [RTM_DELROUTE - RTM_BASE] = { .doit = inet_rtm_delroute, }, - [RTM_GETROUTE - RTM_BASE] = { .doit = inet_rtm_getroute, - .dumpit = inet_dump_fib, }, -#ifdef CONFIG_IP_MULTIPLE_TABLES - [RTM_GETRULE - RTM_BASE] = { .dumpit = fib4_rules_dump, }, -#endif -}; - #ifdef CONFIG_SYSCTL void inet_forward_change(void) @@ -1636,7 +1614,10 @@ void __init devinet_init(void) { register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); - rtnetlink_links[PF_INET] = inet_rtnetlink_table; + + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); #ifdef CONFIG_SYSCTL devinet_sysctl.sysctl_header = register_sysctl_table(devinet_sysctl.devinet_root_dir); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 31041127eeb..47c95e8ef04 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -21,13 +21,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct blkcipher_desc desc; struct esp_data *esp; struct sk_buff *trailer; + u8 *tail; int blksize; int clen; int alen; int nfrags; /* Strip IP+ESP header. */ - __skb_pull(skb, skb->h.raw - skb->data); + __skb_pull(skb, skb_transport_offset(skb)); /* Now skb is pure payload to encrypt */ err = -ENOMEM; @@ -49,19 +50,21 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) goto error; /* Fill padding... */ + tail = skb_tail_pointer(trailer); do { int i; for (i=0; i<clen-skb->len - 2; i++) - *(u8*)(trailer->tail + i) = i+1; + tail[i] = i + 1; } while (0); - *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + tail[clen - skb->len - 2] = (clen - skb->len) - 2; pskb_put(skb, trailer, clen - skb->len); - __skb_push(skb, skb->data - skb->nh.raw); - top_iph = skb->nh.iph; - esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4); + __skb_push(skb, skb->data - skb_network_header(skb)); + top_iph = ip_hdr(skb); + esph = (struct ip_esp_hdr *)(skb_network_header(skb) + + top_iph->ihl * 4); top_iph->tot_len = htons(skb->len + alen); - *(u8*)(trailer->tail - 1) = top_iph->protocol; + *(skb_tail_pointer(trailer) - 1) = top_iph->protocol; /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -217,12 +220,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ - iph = skb->nh.iph; + iph = ip_hdr(skb); ihl = iph->ihl * 4; if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh = (void *)(skb->nh.raw + ihl); + struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -260,7 +263,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; + __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen); + skb_set_transport_header(skb, -ihl); return 0; @@ -268,32 +272,33 @@ out: return -EINVAL; } -static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) +static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) { struct esp_data *esp = x->data; u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); - int enclen = 0; + u32 align = max_t(u32, blksize, esp->conf.padlen); + u32 rem; + + mtu -= x->props.header_len + esp->auth.icv_trunc_len; + rem = mtu & (align - 1); + mtu &= ~(align - 1); switch (x->props.mode) { case XFRM_MODE_TUNNEL: - mtu = ALIGN(mtu +2, blksize); break; default: case XFRM_MODE_TRANSPORT: /* The worst case */ - mtu = ALIGN(mtu + 2, 4) + blksize - 4; + mtu -= blksize - 4; + mtu += min_t(u32, blksize - 4, rem); break; case XFRM_MODE_BEET: /* The worst case. */ - enclen = IPV4_BEET_PHMAXLEN; - mtu = ALIGN(mtu + enclen + 2, blksize); + mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem); break; } - if (esp->conf.padlen) - mtu = ALIGN(mtu, esp->conf.padlen); - - return mtu + x->props.header_len + esp->auth.icv_trunc_len - enclen; + return mtu - 2; } static void esp4_err(struct sk_buff *skb, u32 info) @@ -302,8 +307,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (skb->h.icmph->type != ICMP_DEST_UNREACH || - skb->h.icmph->code != ICMP_FRAG_NEEDED) + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); @@ -336,6 +341,7 @@ static int esp_init_state(struct xfrm_state *x) { struct esp_data *esp = NULL; struct crypto_blkcipher *tfm; + u32 align; /* null auth and encryption can have zero length keys */ if (x->aalg) { @@ -402,6 +408,8 @@ static int esp_init_state(struct xfrm_state *x) x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); + else if (x->props.mode == XFRM_MODE_BEET) + x->props.header_len += IPV4_BEET_PHMAXLEN; if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; @@ -417,7 +425,10 @@ static int esp_init_state(struct xfrm_state *x) } } x->data = esp; - x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len; + align = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); + if (esp->conf.padlen) + align = max_t(u32, align, esp->conf.padlen); + x->props.trailer_len = align + 1 + esp->auth.icv_trunc_len; return 0; error: @@ -434,7 +445,7 @@ static struct xfrm_type esp_type = .proto = IPPROTO_ESP, .init_state = esp_init_state, .destructor = esp_destroy, - .get_max_size = esp4_get_max_size, + .get_mtu = esp4_get_mtu, .input = esp_input, .output = esp_output }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1fba6439fc5..837f2957fa8 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -34,7 +34,6 @@ #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> -#include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> @@ -46,6 +45,7 @@ #include <net/icmp.h> #include <net/arp.h> #include <net/ip_fib.h> +#include <net/rtnetlink.h> #define FFprint(a...) printk(KERN_DEBUG a) @@ -493,6 +493,11 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; cfg->fc_nlinfo.nlh = nlh; + if (cfg->fc_type > RTN_MAX) { + err = -EINVAL; + goto errout; + } + nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (attr->nla_type) { case RTA_DST: @@ -535,7 +540,7 @@ errout: return err; } -int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_config cfg; struct fib_table *tb; @@ -556,7 +561,7 @@ errout: return err; } -int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_config cfg; struct fib_table *tb; @@ -577,7 +582,7 @@ errout: return err; } -int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int h, s_h; unsigned int e = 0, s_e; @@ -771,6 +776,12 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) .nl_u = { .ip4_u = { .daddr = frn->fl_addr, .tos = frn->fl_tos, .scope = frn->fl_scope } } }; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + frn->err = -ENOENT; if (tb) { local_bh_disable(); @@ -782,6 +793,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; + fib_res_put(&res); } local_bh_enable(); } @@ -796,7 +808,10 @@ static void nl_fib_input(struct sock *sk, int len) struct fib_table *tb; skb = skb_dequeue(&sk->sk_receive_queue); - nlh = (struct nlmsghdr *)skb->data; + if (skb == NULL) + return; + + nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) { kfree_skb(skb); @@ -808,7 +823,7 @@ static void nl_fib_input(struct sock *sk, int len) nl_fib_lookup(frn, tb); - pid = nlh->nlmsg_pid; /*pid of sending process */ + pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(sk, skb, pid, MSG_DONTWAIT); @@ -816,7 +831,8 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE); + netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL, + THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) @@ -914,6 +930,10 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); nl_fib_lookup_init(); + + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); } EXPORT_SYMBOL(inet_addr_type); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index a4949f957ab..9cfecf1215c 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -1027,7 +1027,7 @@ out: return 0; } -static struct seq_operations fib_seq_ops = { +static const struct seq_operations fib_seq_ops = { .start = fib_seq_start, .next = fib_seq_next, .stop = fib_seq_stop, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b837c33e040..33083ad52e9 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -171,8 +171,6 @@ static struct fib_table *fib_empty_table(void) static struct nla_policy fib4_rule_policy[FRA_MAX+1] __read_mostly = { FRA_GENERIC_POLICY, - [FRA_SRC] = { .type = NLA_U32 }, - [FRA_DST] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, }; @@ -183,8 +181,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; - if (frh->src_len > 32 || frh->dst_len > 32 || - (frh->tos & ~IPTOS_TOS_MASK)) + if (frh->tos & ~IPTOS_TOS_MASK) goto errout; if (rule->table == RT_TABLE_UNSPEC) { @@ -201,10 +198,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } } - if (tb[FRA_SRC]) + if (frh->src_len) rule4->src = nla_get_be32(tb[FRA_SRC]); - if (tb[FRA_DST]) + if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); #ifdef CONFIG_NET_CLS_ROUTE @@ -242,10 +239,10 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; #endif - if (tb[FRA_SRC] && (rule4->src != nla_get_be32(tb[FRA_SRC]))) + if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC]))) return 0; - if (tb[FRA_DST] && (rule4->dst != nla_get_be32(tb[FRA_DST]))) + if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST]))) return 0; return 1; @@ -277,11 +274,6 @@ nla_put_failure: return -ENOBUFS; } -int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - return fib_rules_dump(skb, cb, AF_INET); -} - static u32 fib4_rule_default_pref(void) { struct list_head *pos; @@ -306,9 +298,15 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(4); /* flow */ } +static void fib4_rule_flush_cache(void) +{ + rt_cache_flush(-1); +} + static struct fib_rules_ops fib4_rules_ops = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), + .addr_size = sizeof(u32), .action = fib4_rule_action, .match = fib4_rule_match, .configure = fib4_rule_configure, @@ -316,6 +314,7 @@ static struct fib_rules_ops fib4_rules_ops = { .fill = fib4_rule_fill, .default_pref = fib4_rule_default_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, + .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, .rules_list = &fib4_rules, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2f1fdae6efa..406ea7050ae 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -89,7 +89,7 @@ static const struct { int error; u8 scope; -} fib_props[RTA_MAX + 1] = { +} fib_props[RTN_MAX + 1] = { { .error = 0, .scope = RT_SCOPE_NOWHERE, @@ -927,7 +927,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, default: printk(KERN_DEBUG "impossible 102\n"); return -EINVAL; - }; + } } return err; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ada9b3db507..9be7da7c3a8 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -50,7 +50,7 @@ * Patrick McHardy <kaber@trash.net> */ -#define VERSION "0.407" +#define VERSION "0.408" #include <asm/uaccess.h> #include <asm/system.h> @@ -292,8 +292,8 @@ static inline void check_tnode(const struct tnode *tn) static int halve_threshold = 25; static int inflate_threshold = 50; -static int halve_threshold_root = 15; -static int inflate_threshold_root = 25; +static int halve_threshold_root = 8; +static int inflate_threshold_root = 15; static void __alias_free_mem(struct rcu_head *head) @@ -350,11 +350,10 @@ static void __tnode_free_rcu(struct rcu_head *head) static inline void tnode_free(struct tnode *tn) { - if(IS_LEAF(tn)) { + if (IS_LEAF(tn)) { struct leaf *l = (struct leaf *) tn; call_rcu_bh(&l->rcu, __leaf_free_rcu); - } - else + } else call_rcu(&tn->rcu, __tnode_free_rcu); } @@ -459,6 +458,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) struct tnode *old_tn; int inflate_threshold_use; int halve_threshold_use; + int max_resize; if (!tn) return NULL; @@ -553,13 +553,14 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if(!tn->parent) + if (!tn->parent) inflate_threshold_use = inflate_threshold_root; else inflate_threshold_use = inflate_threshold; err = 0; - while ((tn->full_children > 0 && + max_resize = 10; + while ((tn->full_children > 0 && max_resize-- && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold_use * tnode_child_length(tn))) { @@ -574,6 +575,15 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } + if (max_resize < 0) { + if (!tn->parent) + printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", + inflate_threshold_root, tn->bits); + else + printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", + inflate_threshold, tn->bits); + } + check_tnode(tn); /* @@ -584,13 +594,14 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if(!tn->parent) + if (!tn->parent) halve_threshold_use = halve_threshold_root; else halve_threshold_use = halve_threshold; err = 0; - while (tn->bits > 1 && + max_resize = 10; + while (tn->bits > 1 && max_resize-- && 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold_use * tnode_child_length(tn)) { @@ -605,6 +616,14 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } + if (max_resize < 0) { + if (!tn->parent) + printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", + halve_threshold_root, tn->bits); + else + printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n", + halve_threshold, tn->bits); + } /* Only one child remains */ if (tn->empty_children == tnode_child_length(tn) - 1) @@ -1123,6 +1142,9 @@ err: return fa_head; } +/* + * Caller must hold RTNL. + */ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; @@ -1540,6 +1562,9 @@ static int trie_leaf_remove(struct trie *t, t_key key) return 1; } +/* + * Caller must hold RTNL. + */ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; @@ -1718,6 +1743,9 @@ up: return NULL; /* Ready. Root of trie */ } +/* + * Caller must hold RTNL. + */ static int fn_trie_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; @@ -2030,12 +2058,12 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, { struct node *n ; - if(!t) + if (!t) return NULL; n = rcu_dereference(t->trie); - if(!iter) + if (!iter) return NULL; if (n) { @@ -2075,7 +2103,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) int i; s->tnodes++; - if(tn->bits < MAX_STAT_DEPTH) + if (tn->bits < MAX_STAT_DEPTH) s->nodesizes[tn->bits]++; for (i = 0; i < (1<<tn->bits); i++) @@ -2241,7 +2269,7 @@ static inline const char *rtn_scope(enum rt_scope_t s) { static char buf[32]; - switch(s) { + switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; @@ -2331,7 +2359,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations fib_trie_seq_ops = { +static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, @@ -2452,7 +2480,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations fib_route_seq_ops = { +static const struct seq_operations fib_route_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4b7a0d946a0..d38cbba92a4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -355,7 +355,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, ipc, rt, MSG_DONTWAIT) < 0) ip_flush_pending_frames(icmp_socket->sk); else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { - struct icmphdr *icmph = skb->h.icmph; + struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; @@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; icmp_out_count(icmp_param->data.icmph.type); - inet->tos = skb->nh.iph->tos; + inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; if (icmp_param->replyopts.optlen) { @@ -404,7 +404,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, - .tos = RT_TOS(skb->nh.iph->tos) } }, + .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(&rt, &fl)) @@ -448,9 +448,10 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Check this, icmp_send is called from the most obscure devices * sometimes. */ - iph = skb_in->nh.iph; + iph = ip_hdr(skb_in); - if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail) + if ((u8 *)iph < skb_in->head || + (skb_in->network_header + sizeof(*iph)) > skb_in->tail) goto out; /* @@ -484,7 +485,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) u8 _inner_type, *itp; itp = skb_header_pointer(skb_in, - skb_in->nh.raw + + skb_network_header(skb_in) + (iph->ihl << 2) + offsetof(struct icmphdr, type) - @@ -536,7 +537,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.data.icmph.un.gateway = info; icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; - icmp_param.offset = skb_in->nh.raw - skb_in->data; + icmp_param.offset = skb_network_offset(skb_in); icmp_out_count(icmp_param.data.icmph.type); inet_sk(icmp_socket->sk)->tos = tos; ipc.addr = iph->saddr; @@ -613,7 +614,7 @@ static void icmp_unreach(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out_err; - icmph = skb->h.icmph; + icmph = icmp_hdr(skb); iph = (struct iphdr *)skb->data; if (iph->ihl < 5) /* Mangled header, drop. */ @@ -676,7 +677,7 @@ static void icmp_unreach(struct sk_buff *skb) printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " "type %u, code %u " "error to a broadcast: %u.%u.%u.%u on %s\n", - NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(ip_hdr(skb)->saddr), icmph->type, icmph->code, NIPQUAD(iph->daddr), skb->dev->name); @@ -743,7 +744,7 @@ static void icmp_redirect(struct sk_buff *skb) iph = (struct iphdr *)skb->data; - switch (skb->h.icmph->code & 7) { + switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: /* @@ -751,8 +752,8 @@ static void icmp_redirect(struct sk_buff *skb) */ case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, - skb->h.icmph->un.gateway, + ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, + icmp_hdr(skb)->un.gateway, iph->saddr, skb->dev); break; } @@ -780,7 +781,7 @@ static void icmp_echo(struct sk_buff *skb) if (!sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; - icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_ECHOREPLY; icmp_param.skb = skb; icmp_param.offset = 0; @@ -816,7 +817,7 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data.times[2] = icmp_param.data.times[1]; if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); - icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; icmp_param.skb = skb; @@ -943,7 +944,7 @@ int icmp_rcv(struct sk_buff *skb) if (!pskb_pull(skb, sizeof(struct icmphdr))) goto error; - icmph = skb->h.icmph; + icmph = icmp_hdr(skb); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8cedb2a2c9d..2506021c293 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -314,7 +314,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + skb_reset_network_header(skb); + pip = ip_hdr(skb); + skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; @@ -331,8 +333,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) ((u8*)&pip[1])[2] = 0; ((u8*)&pip[1])[3] = 0; - pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig)); - skb->h.igmph = (struct igmphdr *)pig; + skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; + skb_put(skb, sizeof(*pig)); + pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; @@ -343,16 +346,14 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) static int igmpv3_sendpack(struct sk_buff *skb) { - struct iphdr *pip = skb->nh.iph; - struct igmphdr *pig = skb->h.igmph; - int iplen, igmplen; + struct iphdr *pip = ip_hdr(skb); + struct igmphdr *pig = igmp_hdr(skb); + const int iplen = skb->tail - skb->network_header; + const int igmplen = skb->tail - skb->transport_header; - iplen = skb->tail - (unsigned char *)skb->nh.iph; pip->tot_len = htons(iplen); ip_send_check(pip); - - igmplen = skb->tail - (unsigned char *)skb->h.igmph; - pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen); + pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, dst_output); @@ -379,7 +380,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; - pih = (struct igmpv3_report *)skb->h.igmph; + pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; @@ -412,7 +413,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (!*psf_list) goto empty_source; - pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL; + pih = skb ? igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { @@ -664,7 +665,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; @@ -827,8 +830,8 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group) static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { - struct igmphdr *ih = skb->h.igmph; - struct igmpv3_query *ih3 = (struct igmpv3_query *)ih; + struct igmphdr *ih = igmp_hdr(skb); + struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; @@ -861,12 +864,12 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return; - ih3 = (struct igmpv3_query *) skb->h.raw; + ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return; - ih3 = (struct igmpv3_query *) skb->h.raw; + ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); @@ -943,7 +946,7 @@ int igmp_rcv(struct sk_buff *skb) goto drop; } - ih = skb->h.igmph; + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: igmp_heard_query(in_dev, skb, len); @@ -2397,7 +2400,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations igmp_mc_seq_ops = { +static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, @@ -2571,7 +2574,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations igmp_mcf_seq_ops = { +static const struct seq_operations igmp_mcf_seq_ops = { .start = igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5df71cd08da..dbeacd8b0f9 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -27,6 +27,7 @@ #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/inet6_hashtables.h> +#include <net/netlink.h> #include <linux/inet.h> #include <linux/stddef.h> @@ -60,7 +61,7 @@ static int inet_csk_diag_fill(struct sock *sk, struct nlmsghdr *nlh; void *info = NULL; struct inet_diag_meminfo *minfo = NULL; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); const struct inet_diag_handler *handler; handler = inet_diag_table[unlh->nlmsg_type]; @@ -147,12 +148,12 @@ static int inet_csk_diag_fill(struct sock *sk, icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) icsk->icsk_ca_ops->get_info(sk, ext, skb); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -EMSGSIZE; } @@ -163,7 +164,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, { long tmo; struct inet_diag_msg *r; - const unsigned char *previous_tail = skb->tail; + const unsigned char *previous_tail = skb_tail_pointer(skb); struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); @@ -205,10 +206,10 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, &tw6->tw_v6_daddr); } #endif - nlh->nlmsg_len = skb->tail - previous_tail; + nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; return skb->len; nlmsg_failure: - skb_trim(skb, previous_tail - skb->data); + nlmsg_trim(skb, previous_tail); return -EMSGSIZE; } @@ -535,7 +536,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -574,12 +575,12 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, &inet6_rsk(req)->rmt_addr); } #endif - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -805,68 +806,43 @@ done: return skb->len; } -static inline int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) - return 0; + int hdrlen = sizeof(struct inet_diag_req); - if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) - goto err_inval; + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || + nlmsg_len(nlh) < hdrlen) + return -EINVAL; if (inet_diag_table[nlh->nlmsg_type] == NULL) return -ENOENT; - if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) - goto err_inval; - - if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > - (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { - struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + - sizeof(struct inet_diag_req)); - if (rta->rta_type != INET_DIAG_REQ_BYTECODE || - rta->rta_len < 8 || - rta->rta_len > - (nlh->nlmsg_len - - NLMSG_SPACE(sizeof(struct inet_diag_req)))) - goto err_inval; - if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) - goto err_inval; + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (nlmsg_attrlen(nlh, hdrlen)) { + struct nlattr *attr; + + attr = nlmsg_find_attr(nlh, hdrlen, + INET_DIAG_REQ_BYTECODE); + if (attr == NULL || + nla_len(attr) < sizeof(struct inet_diag_bc_op) || + inet_diag_bc_audit(nla_data(attr), nla_len(attr))) + return -EINVAL; } + return netlink_dump_start(idiagnl, skb, nlh, inet_diag_dump, NULL); - } else - return inet_diag_get_exact(skb, nlh); - -err_inval: - return -EINVAL; -} - - -static inline void inet_diag_rcv_skb(struct sk_buff *skb) -{ - if (skb->len >= NLMSG_SPACE(0)) { - int err; - struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; - - if (nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return; - err = inet_diag_rcv_msg(skb, nlh); - if (err || nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, err); } + + return inet_diag_get_exact(skb, nlh); } static void inet_diag_rcv(struct sock *sk, int len) { - struct sk_buff *skb; - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; - while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - inet_diag_rcv_skb(skb); - kfree_skb(skb); - } + do { + netlink_run_queue(sk, &qlen, &inet_diag_rcv_msg); + } while (qlen); } static DEFINE_SPINLOCK(inet_diag_register_lock); @@ -917,7 +893,7 @@ static int __init inet_diag_init(void) goto out; idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, - THIS_MODULE); + NULL, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; err = 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index db3ef96bdfd..2f44e612806 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -87,10 +87,12 @@ static DEFINE_RWLOCK(peer_pool_lock); static int peer_total; /* Exported for sysctl_net_ipv4. */ -int inet_peer_threshold = 65536 + 128; /* start to throw entries more +int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more * aggressively at this stage */ -int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */ -int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */ +int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ +int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +int inet_peer_gc_mintime __read_mostly = 10 * HZ; +int inet_peer_gc_maxtime __read_mostly = 120 * HZ; static struct inet_peer *inet_peer_unused_head; static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head; @@ -99,9 +101,6 @@ static DEFINE_SPINLOCK(inet_peer_unused_lock); static void peer_check_expire(unsigned long dummy); static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); -/* Exported for sysctl_net_ipv4. */ -int inet_peer_gc_mintime = 10 * HZ, - inet_peer_gc_maxtime = 120 * HZ; /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) @@ -151,20 +150,27 @@ static void unlink_from_unused(struct inet_peer *p) spin_unlock_bh(&inet_peer_unused_lock); } -/* Called with local BH disabled and the pool lock held. */ -#define lookup(daddr) \ +/* + * Called with local BH disabled and the pool lock held. + * _stack is known to be NULL or not at compile time, + * so compiler will optimize the if (_stack) tests. + */ +#define lookup(_daddr,_stack) \ ({ \ struct inet_peer *u, **v; \ - stackptr = stack; \ - *stackptr++ = &peer_root; \ + if (_stack) { \ + stackptr = _stack; \ + *stackptr++ = &peer_root; \ + } \ for (u = peer_root; u != peer_avl_empty; ) { \ - if (daddr == u->v4daddr) \ + if (_daddr == u->v4daddr) \ break; \ - if ((__force __u32)daddr < (__force __u32)u->v4daddr) \ + if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ v = &u->avl_left; \ else \ v = &u->avl_right; \ - *stackptr++ = v; \ + if (_stack) \ + *stackptr++ = v; \ u = *v; \ } \ u; \ @@ -288,7 +294,7 @@ static void unlink_from_pool(struct inet_peer *p) if (atomic_read(&p->refcnt) == 1) { struct inet_peer **stack[PEER_MAXDEPTH]; struct inet_peer ***stackptr, ***delp; - if (lookup(p->v4daddr) != p) + if (lookup(p->v4daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ if (p->avl_left == peer_avl_empty) { @@ -373,7 +379,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) /* Look up for the address quickly. */ read_lock_bh(&peer_pool_lock); - p = lookup(daddr); + p = lookup(daddr, NULL); if (p != peer_avl_empty) atomic_inc(&p->refcnt); read_unlock_bh(&peer_pool_lock); @@ -400,7 +406,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) write_lock_bh(&peer_pool_lock); /* Check if an entry has suddenly appeared. */ - p = lookup(daddr); + p = lookup(daddr, stack); if (p != peer_avl_empty) goto out_free; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 369e721c4ba..9cb04df0054 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,14 +67,14 @@ int ip_forward(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; - skb->ip_summed = CHECKSUM_NONE; + skb_forward_csum(skb); /* * According to the RFC, we must first decrease the TTL field. If * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ - if (skb->nh.iph->ttl <= 1) + if (ip_hdr(skb)->ttl <= 1) goto too_many_hops; if (!xfrm4_route_forward(skb)) @@ -85,10 +85,18 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; + if (unlikely(skb->len > dst_mtu(&rt->u.dst) && + (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dst_mtu(&rt->u.dst))); + goto drop; + } + /* We are about to mangle packet. Copy it! */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; - iph = skb->nh.iph; + iph = ip_hdr(skb); /* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b6f05538037..0231bdcb2ab 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -92,7 +92,7 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct timeval stamp; + ktime_t stamp; int iif; unsigned int rid; struct inet_peer *peer; @@ -184,7 +184,7 @@ static __inline__ struct ipq *frag_alloc_queue(void) { struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); - if(!qp) + if (!qp) return NULL; atomic_add(sizeof(struct ipq), &ip_frag_mem); return qp; @@ -321,11 +321,11 @@ static struct ipq *ip_frag_intern(struct ipq *qp_in) * promoted read lock to write lock. */ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { - if(qp->id == qp_in->id && - qp->saddr == qp_in->saddr && - qp->daddr == qp_in->daddr && - qp->protocol == qp_in->protocol && - qp->user == qp_in->user) { + if (qp->id == qp_in->id && + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && + qp->user == qp_in->user) { atomic_inc(&qp->refcnt); write_unlock(&ipfrag_lock); qp_in->last_in |= COMPLETE; @@ -398,11 +398,11 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) read_lock(&ipfrag_lock); hash = ipqhashfn(id, saddr, daddr, protocol); hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { - if(qp->id == id && - qp->saddr == saddr && - qp->daddr == daddr && - qp->protocol == protocol && - qp->user == user) { + if (qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && + qp->user == user) { atomic_inc(&qp->refcnt); read_unlock(&ipfrag_lock); return qp; @@ -479,11 +479,11 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) goto err; } - offset = ntohs(skb->nh.iph->frag_off); + offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ - ihl = skb->nh.iph->ihl * 4; + ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ end = offset + skb->len - ihl; @@ -524,7 +524,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? */ prev = NULL; - for(next = qp->fragments; next != NULL; next = next->next) { + for (next = qp->fragments; next != NULL; next = next->next) { if (FRAG_CB(next)->offset >= offset) break; /* bingo! */ prev = next; @@ -592,7 +592,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - skb_get_timestamp(skb, &qp->stamp); + qp->stamp = skb->tstamp; qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -624,10 +624,10 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) BUG_TRAP(FRAG_CB(head)->offset == 0); /* Allocate a new buffer for the datagram. */ - ihlen = head->nh.iph->ihl*4; + ihlen = ip_hdrlen(head); len = ihlen + qp->len; - if(len > 65535) + if (len > 65535) goto out_oversize; /* Head of list must not be cloned. */ @@ -658,7 +658,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) } skb_shinfo(head)->frag_list = head->next; - skb_push(head, head->data - head->nh.raw); + skb_push(head, head->data - skb_network_header(head)); atomic_sub(head->truesize, &ip_frag_mem); for (fp=head->next; fp; fp = fp->next) { @@ -674,9 +674,9 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - skb_set_timestamp(head, &qp->stamp); + head->tstamp = qp->stamp; - iph = head->nh.iph; + iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); @@ -700,7 +700,6 @@ out_fail: /* Process an incoming IP datagram fragment. */ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) { - struct iphdr *iph = skb->nh.iph; struct ipq *qp; struct net_device *dev; @@ -713,7 +712,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) dev = skb->dev; /* Lookup (or create) queue header */ - if ((qp = ip_find(iph, user)) != NULL) { + if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { struct sk_buff *ret = NULL; spin_lock(&qp->lock); @@ -734,7 +733,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) return NULL; } -void ipfrag_init(void) +void __init ipfrag_init(void) { ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9151da64231..63282934725 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -191,11 +191,11 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3 return NULL; } -static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms) { - __be32 remote = t->parms.iph.daddr; - __be32 local = t->parms.iph.saddr; - __be32 key = t->parms.i_key; + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + __be32 key = parms->i_key; unsigned h = HASH(key); int prio = 0; @@ -209,6 +209,11 @@ static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) return &tunnels[prio][h]; } +static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +{ + return __ipgre_bucket(&t->parms); +} + static void ipgre_tunnel_link(struct ip_tunnel *t) { struct ip_tunnel **tp = ipgre_bucket(t); @@ -240,17 +245,9 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int __be32 key = parms->i_key; struct ip_tunnel *t, **tp, *nt; struct net_device *dev; - unsigned h = HASH(key); - int prio = 0; char name[IFNAMSIZ]; - if (local) - prio |= 1; - if (remote && !MULTICAST(remote)) { - prio |= 2; - h ^= HASH(remote); - } - for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { if (key == t->parms.i_key) return t; @@ -320,8 +317,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr*)skb->data; __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); int grehlen = (iph->ihl<<2) + 4; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; __be16 flags; @@ -388,8 +385,8 @@ out: struct iphdr *iph = (struct iphdr*)dp; struct iphdr *eiph; __be16 *p = (__be16*)(dp+(iph->ihl<<2)); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int rel_type = 0; int rel_code = 0; __be32 rel_info = 0; @@ -422,7 +419,7 @@ out: default: return; case ICMP_PARAMETERPROB: - n = ntohl(skb->h.icmph->un.gateway) >> 24; + n = ntohl(icmp_hdr(skb)->un.gateway) >> 24; if (n < (iph->ihl<<2)) return; @@ -442,7 +439,7 @@ out: return; case ICMP_FRAG_NEEDED: /* And it is the only really necessary thing :-) */ - n = ntohs(skb->h.icmph->un.frag.mtu); + n = ntohs(icmp_hdr(skb)->un.frag.mtu); if (n < grehlen+68) return; n -= grehlen; @@ -474,7 +471,7 @@ out: dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, skb->data - (u8*)eiph); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); /* Try to guess incoming interface */ memset(&fl, 0, sizeof(fl)); @@ -533,9 +530,9 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) { if (skb->protocol == htons(ETH_P_IP)) { - IP_ECN_set_ce(skb->nh.iph); + IP_ECN_set_ce(ip_hdr(skb)); } else if (skb->protocol == htons(ETH_P_IPV6)) { - IP6_ECN_set_ce(skb->nh.ipv6h); + IP6_ECN_set_ce(ipv6_hdr(skb)); } } } @@ -565,7 +562,7 @@ static int ipgre_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, 16)) goto drop_nolock; - iph = skb->nh.iph; + iph = ip_hdr(skb); h = skb->data; flags = *(__be16*)h; @@ -616,9 +613,10 @@ static int ipgre_rcv(struct sk_buff *skb) offset += 4; } - skb->mac.raw = skb->nh.raw; - skb->nh.raw = __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb->h.raw, offset); + skb_reset_mac_header(skb); + __pskb_pull(skb, offset); + skb_reset_network_header(skb); + skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST if (MULTICAST(iph->daddr)) { @@ -669,7 +667,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; - struct iphdr *old_iph = skb->nh.iph; + struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; __be16 df; @@ -720,7 +718,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { - addr6 = &skb->nh.ipv6h->daddr; + addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } @@ -824,11 +822,12 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; - old_iph = skb->nh.iph; + old_iph = ip_hdr(skb); } - skb->h.raw = skb->nh.raw; - skb->nh.raw = skb_push(skb, gre_hlen); + skb->transport_header = skb->network_header; + skb_push(skb, gre_hlen); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); @@ -839,7 +838,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) * Push down and install the IPIP header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = df; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f38e97647ac..97069399d86 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -158,7 +158,7 @@ DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; int ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; - u8 protocol = skb->nh.iph->protocol; + u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; read_lock(&ip_ra_lock); @@ -171,7 +171,7 @@ int ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { - if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN); if (skb == NULL) { read_unlock(&ip_ra_lock); @@ -198,17 +198,15 @@ int ip_call_ra_chain(struct sk_buff *skb) static inline int ip_local_deliver_finish(struct sk_buff *skb) { - int ihl = skb->nh.iph->ihl*4; - - __skb_pull(skb, ihl); + __skb_pull(skb, ip_hdrlen(skb)); /* Point into the IP datagram, just past the header. */ - skb->h.raw = skb->data; + skb_reset_transport_header(skb); rcu_read_lock(); { /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ - int protocol = skb->nh.iph->protocol; + int protocol = ip_hdr(skb)->protocol; int hash; struct sock *raw_sk; struct net_protocol *ipprot; @@ -220,7 +218,7 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) /* If there maybe a raw socket we must check - if not we * don't care less */ - if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) + if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) raw_sk = NULL; if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { @@ -266,7 +264,7 @@ int ip_local_deliver(struct sk_buff *skb) * Reassemble IP fragments. */ - if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER); if (!skb) return 0; @@ -294,7 +292,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) goto drop; } - iph = skb->nh.iph; + iph = ip_hdr(skb); if (ip_options_compile(NULL, skb)) { IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); @@ -330,7 +328,8 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; /* * Initialise the virtual path cache for the packet. It describes @@ -342,6 +341,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); goto drop; } } @@ -360,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; + rt = (struct rtable*)skb->dst; + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + return dst_input(skb); drop: @@ -391,7 +398,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; - iph = skb->nh.iph; + iph = ip_hdr(skb); /* * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. @@ -410,13 +417,16 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl*4)) + if (skb->len < len) { + IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f906a80d5a8..251346828cb 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -40,7 +40,7 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, __be32 daddr, struct rtable *rt, int is_frag) { - unsigned char * iph = skb->nh.raw; + unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); @@ -104,13 +104,13 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) return 0; } - sptr = skb->nh.raw; + sptr = skb_network_header(skb); dptr = dopt->__data; if (skb->dst) daddr = ((struct rtable*)skb->dst)->rt_spec_dst; else - daddr = skb->nh.iph->daddr; + daddr = ip_hdr(skb)->daddr; if (sopt->rr) { optlen = sptr[sopt->rr+1]; @@ -180,7 +180,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) /* * RFC1812 requires to fix illegal source routes. */ - if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0) + if (memcmp(&ip_hdr(skb)->saddr, + &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { @@ -217,7 +218,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) void ip_options_fragment(struct sk_buff * skb) { - unsigned char * optptr = skb->nh.raw + sizeof(struct iphdr); + unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options * opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; @@ -264,12 +265,13 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) if (!opt) { opt = &(IPCB(skb)->opt); - iph = skb->nh.raw; + iph = skb_network_header(skb); opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); optptr = iph + sizeof(struct iphdr); opt->is_data = 0; } else { - optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]); + optptr = opt->is_data ? opt->__data : + (unsigned char *)&(ip_hdr(skb)[1]); iph = optptr - sizeof(struct iphdr); } @@ -563,7 +565,7 @@ void ip_forward_options(struct sk_buff *skb) struct ip_options * opt = &(IPCB(skb)->opt); unsigned char * optptr; struct rtable *rt = (struct rtable*)skb->dst; - unsigned char *raw = skb->nh.raw; + unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; @@ -587,7 +589,7 @@ void ip_forward_options(struct sk_buff *skb) if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_rt_get_source(&optptr[srrptr-1], rt); - skb->nh.iph->daddr = rt->rt_dst; + ip_hdr(skb)->daddr = rt->rt_dst; optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); @@ -599,7 +601,7 @@ void ip_forward_options(struct sk_buff *skb) } if (opt->is_changed) { opt->is_changed = 0; - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); } } @@ -608,8 +610,8 @@ int ip_options_rcv_srr(struct sk_buff *skb) struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; - struct iphdr *iph = skb->nh.iph; - unsigned char * optptr = skb->nh.raw + opt->srr; + struct iphdr *iph = ip_hdr(skb); + unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = (struct rtable*)skb->dst; struct rtable *rt2; int err; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d096332f6c6..d6427d91851 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -95,8 +95,8 @@ __inline__ void ip_send_check(struct iphdr *iph) /* dev_loopback_xmit for use with netfilter. */ static int ip_dev_loopback_xmit(struct sk_buff *newskb) { - newskb->mac.raw = newskb->data; - __skb_pull(newskb, newskb->nh.raw - newskb->data); + skb_reset_mac_header(newskb); + __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); @@ -125,11 +125,9 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, struct iphdr *iph; /* Build the IP header. */ - if (opt) - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); - else - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); - + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; @@ -143,7 +141,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->protocol = sk->sk_protocol; iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, sk); - skb->nh.iph = iph; if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; @@ -163,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { struct sk_buff *skb2; @@ -192,6 +195,14 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } +static inline int ip_skb_dst_mtu(struct sk_buff *skb) +{ + struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + + return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? + skb->dst->dev->mtu : dst_mtu(skb->dst); +} + static inline int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -201,7 +212,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) + if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -248,7 +259,7 @@ int ip_mc_output(struct sk_buff *skb) /* Multicasts with ttl 0 must not go beyond the host */ - if (skb->nh.iph->ttl == 0) { + if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } @@ -333,7 +344,9 @@ packet_routed: goto no_route; /* OK, we know where to send it, allocate and build IP header. */ - iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) @@ -344,7 +357,6 @@ packet_routed: iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; - skb->nh.iph = iph; /* Transport layer set skb->h.foo itself. */ if (opt && opt->optlen) { @@ -386,21 +398,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif -#ifdef CONFIG_NETFILTER - /* Connection association is same as pre-frag packet */ - nf_conntrack_put(to->nfct); - to->nfct = from->nfct; - nf_conntrack_get(to->nfct); - to->nfctinfo = from->nfctinfo; + nf_copy(to, from); #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(to->nf_bridge); - to->nf_bridge = from->nf_bridge; - nf_bridge_get(to->nf_bridge); -#endif -#endif skb_copy_secmark(to, from); } @@ -430,12 +431,12 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * Point into the IP datagram header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); return -EMSGSIZE; } @@ -502,10 +503,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * before previous one went down. */ if (frag) { frag->ip_summed = CHECKSUM_NONE; - frag->h.raw = frag->data; - frag->nh.raw = __skb_push(frag, hlen); - memcpy(frag->nh.raw, iph, hlen); - iph = frag->nh.iph; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iph = ip_hdr(frag); iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); if (offset == 0) @@ -566,7 +568,7 @@ slow_path: * Keep copying data until we run out. */ - while(left > 0) { + while (left > 0) { len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) @@ -593,8 +595,8 @@ slow_path: ip_copy_metadata(skb2, skb); skb_reserve(skb2, ll_rs); skb_put(skb2, len + hlen); - skb2->nh.raw = skb2->data; - skb2->h.raw = skb2->data + hlen; + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + hlen; /* * Charge the memory for the fragment to any owner @@ -608,19 +610,19 @@ slow_path: * Copy the packet header into the new buffer. */ - memcpy(skb2->nh.raw, skb->data, hlen); + skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* * Copy a block of the IP datagram. */ - if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) + if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) BUG(); left -= len; /* * Fill in the new header fields. */ - iph = skb2->nh.iph; + iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); /* ANK: dirty, but effective trick. Upgrade options only if @@ -722,10 +724,10 @@ static inline int ip_ufo_append_data(struct sock *sk, skb_put(skb,fragheaderlen + transhdrlen); /* initialize network header pointer */ - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* initialize protocol header pointer */ - skb->h.raw = skb->data + fragheaderlen; + skb->transport_header = skb->network_header + fragheaderlen; skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -799,7 +801,9 @@ int ip_append_data(struct sock *sk, inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); + inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->u.dst.dev->mtu : + dst_mtu(rt->u.dst.path); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; @@ -929,9 +933,10 @@ alloc_new_skb: * Find where to start putting bytes. */ data = skb_put(skb, fraglen); - skb->nh.raw = data + exthdrlen; + skb_set_network_header(skb, exthdrlen); + skb->transport_header = (skb->network_header + + fragheaderlen); data += fragheaderlen; - skb->h.raw = data + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -1100,8 +1105,6 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, } if (len <= 0) { struct sk_buff *skb_prev; - char *data; - struct iphdr *iph; int alloclen; skb_prev = skb; @@ -1124,15 +1127,15 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, /* * Find where to start putting bytes. */ - data = skb_put(skb, fragheaderlen + fraggap); - skb->nh.iph = iph = (struct iphdr *)data; - data += fragheaderlen; - skb->h.raw = data; - + skb_put(skb, fragheaderlen + fraggap); + skb_reset_network_header(skb); + skb->transport_header = (skb->network_header + + fragheaderlen); if (fraggap) { - skb->csum = skb_copy_and_csum_bits( - skb_prev, maxfraglen, - data, fraggap, 0); + skb->csum = skb_copy_and_csum_bits(skb_prev, + maxfraglen, + skb_transport_header(skb), + fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); pskb_trim_unique(skb_prev, maxfraglen); @@ -1198,10 +1201,10 @@ int ip_push_pending_frames(struct sock *sk) tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ - if (skb->data < skb->nh.raw) - __skb_pull(skb, skb->nh.raw - skb->data); + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; @@ -1216,13 +1219,13 @@ int ip_push_pending_frames(struct sock *sk) * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc != IP_PMTUDISC_DO) + if (inet->pmtudisc < IP_PMTUDISC_DO) skb->local_df = 1; /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc == IP_PMTUDISC_DO || + if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->u.dst) && ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); @@ -1352,11 +1355,11 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, - .tos = RT_TOS(skb->nh.iph->tos) } }, + .tos = RT_TOS(ip_hdr(skb)->tos) } }, /* Not quite clean, but right. */ .uli_u = { .ports = - { .sport = skb->h.th->dest, - .dport = skb->h.th->source } }, + { .sport = tcp_hdr(skb)->dest, + .dport = tcp_hdr(skb)->source } }, .proto = sk->sk_protocol }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(&rt, &fl)) @@ -1370,14 +1373,16 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar with locally disabled BH and that sk cannot be already spinlocked. */ bh_lock_sock(sk); - inet->tos = skb->nh.iph->tos; + inet->tos = ip_hdr(skb)->tos; sk->sk_priority = skb->priority; - sk->sk_protocol = skb->nh.iph->protocol; + sk->sk_protocol = ip_hdr(skb)->protocol; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) - *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + *((__sum16 *)skb_transport_header(skb) + + arg->csumoffset) = csum_fold(csum_add(skb->csum, + arg->csum)); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 23048d9f358..4d544573f48 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -59,7 +59,7 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) struct in_pktinfo info; struct rtable *rt = (struct rtable *)skb->dst; - info.ipi_addr.s_addr = skb->nh.iph->daddr; + info.ipi_addr.s_addr = ip_hdr(skb)->daddr; if (rt) { info.ipi_ifindex = rt->rt_iif; info.ipi_spec_dst.s_addr = rt->rt_spec_dst; @@ -73,13 +73,13 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { - int ttl = skb->nh.iph->ttl; + int ttl = ip_hdr(skb)->ttl; put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); } static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) { - put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); + put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) @@ -87,7 +87,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) if (IPCB(skb)->opt.optlen == 0) return; - put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1); + put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, + ip_hdr(skb) + 1); } @@ -268,18 +269,21 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; - serr->ee.ee_type = skb->h.icmph->type; - serr->ee.ee_code = skb->h.icmph->code; + serr->ee.ee_type = icmp_hdr(skb)->type; + serr->ee.ee_code = icmp_hdr(skb)->code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; - serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw; + serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - + skb_network_header(skb); serr->port = port; - skb->h.raw = payload; - if (!skb_pull(skb, payload - skb->data) || - sock_queue_err_skb(sk, skb)) - kfree_skb(skb); + if (skb_pull(skb, payload - skb->data) != NULL) { + skb_reset_transport_header(skb); + if (sock_queue_err_skb(sk, skb) == 0) + return; + } + kfree_skb(skb); } void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) @@ -296,8 +300,9 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf if (!skb) return; - iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr)); - skb->nh.iph = iph; + skb_put(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->daddr = daddr; serr = SKB_EXT_ERR(skb); @@ -308,11 +313,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; - serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = port; - skb->h.raw = skb->tail; - __skb_pull(skb, skb->tail - skb->data); + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); @@ -354,7 +359,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; - sin->sin_addr.s_addr = *(__be32*)(skb->nh.raw + serr->addr_offset); + sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + + serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } @@ -366,7 +372,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); if (inet->cmsg_flags) @@ -403,20 +409,20 @@ out: */ static int do_ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, int optlen) + int optname, char __user *optval, int optlen) { struct inet_sock *inet = inet_sk(sk); int val=0,err; if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | - (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | - (1<<IP_RETOPTS) | (1<<IP_TOS) | - (1<<IP_TTL) | (1<<IP_HDRINCL) | - (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | - (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC))) || - optname == IP_MULTICAST_TTL || - optname == IP_MULTICAST_LOOP) { + (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | + (1<<IP_RETOPTS) | (1<<IP_TOS) | + (1<<IP_TTL) | (1<<IP_HDRINCL) | + (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | + (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | + (1<<IP_PASSSEC))) || + optname == IP_MULTICAST_TTL || + optname == IP_MULTICAST_LOOP) { if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -440,444 +446,444 @@ static int do_ip_setsockopt(struct sock *sk, int level, lock_sock(sk); switch (optname) { - case IP_OPTIONS: - { - struct ip_options * opt = NULL; - if (optlen > 40 || optlen < 0) - goto e_inval; - err = ip_options_get_from_user(&opt, optval, optlen); - if (err) - break; - if (inet->is_icsk) { - struct inet_connection_sock *icsk = inet_csk(sk); + case IP_OPTIONS: + { + struct ip_options * opt = NULL; + if (optlen > 40 || optlen < 0) + goto e_inval; + err = ip_options_get_from_user(&opt, optval, optlen); + if (err) + break; + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if (sk->sk_family == PF_INET || - (!((1 << sk->sk_state) & - (TCPF_LISTEN | TCPF_CLOSE)) && - inet->daddr != LOOPBACK4_IPV6)) { + if (sk->sk_family == PF_INET || + (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) && + inet->daddr != LOOPBACK4_IPV6)) { #endif - if (inet->opt) - icsk->icsk_ext_hdr_len -= inet->opt->optlen; - if (opt) - icsk->icsk_ext_hdr_len += opt->optlen; - icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + if (inet->opt) + icsk->icsk_ext_hdr_len -= inet->opt->optlen; + if (opt) + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - } -#endif } - opt = xchg(&inet->opt, opt); - kfree(opt); - break; +#endif } - case IP_PKTINFO: - if (val) - inet->cmsg_flags |= IP_CMSG_PKTINFO; - else - inet->cmsg_flags &= ~IP_CMSG_PKTINFO; - break; - case IP_RECVTTL: - if (val) - inet->cmsg_flags |= IP_CMSG_TTL; - else - inet->cmsg_flags &= ~IP_CMSG_TTL; - break; - case IP_RECVTOS: - if (val) - inet->cmsg_flags |= IP_CMSG_TOS; - else - inet->cmsg_flags &= ~IP_CMSG_TOS; - break; - case IP_RECVOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RECVOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; - break; - case IP_RETOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RETOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + opt = xchg(&inet->opt, opt); + kfree(opt); + break; + } + case IP_PKTINFO: + if (val) + inet->cmsg_flags |= IP_CMSG_PKTINFO; + else + inet->cmsg_flags &= ~IP_CMSG_PKTINFO; + break; + case IP_RECVTTL: + if (val) + inet->cmsg_flags |= IP_CMSG_TTL; + else + inet->cmsg_flags &= ~IP_CMSG_TTL; + break; + case IP_RECVTOS: + if (val) + inet->cmsg_flags |= IP_CMSG_TOS; + else + inet->cmsg_flags &= ~IP_CMSG_TOS; + break; + case IP_RECVOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RECVOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; + break; + case IP_RETOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RETOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + break; + case IP_PASSSEC: + if (val) + inet->cmsg_flags |= IP_CMSG_PASSSEC; + else + inet->cmsg_flags &= ~IP_CMSG_PASSSEC; + break; + case IP_TOS: /* This sets both TOS and Precedence */ + if (sk->sk_type == SOCK_STREAM) { + val &= ~3; + val |= inet->tos & 3; + } + if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && + !capable(CAP_NET_ADMIN)) { + err = -EPERM; break; - case IP_PASSSEC: - if (val) - inet->cmsg_flags |= IP_CMSG_PASSSEC; - else - inet->cmsg_flags &= ~IP_CMSG_PASSSEC; + } + if (inet->tos != val) { + inet->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } + break; + case IP_TTL: + if (optlen<1) + goto e_inval; + if (val != -1 && (val < 1 || val>255)) + goto e_inval; + inet->uc_ttl = val; + break; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; break; - case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~3; - val |= inet->tos & 3; - } - if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && - !capable(CAP_NET_ADMIN)) { - err = -EPERM; + } + inet->hdrincl = val ? 1 : 0; + break; + case IP_MTU_DISCOVER: + if (val<0 || val>3) + goto e_inval; + inet->pmtudisc = val; + break; + case IP_RECVERR: + inet->recverr = !!val; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + break; + case IP_MULTICAST_TTL: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (optlen<1) + goto e_inval; + if (val==-1) + val = 1; + if (val < 0 || val > 255) + goto e_inval; + inet->mc_ttl = val; + break; + case IP_MULTICAST_LOOP: + if (optlen<1) + goto e_inval; + inet->mc_loop = !!val; + break; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + struct net_device *dev = NULL; + + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + /* + * Check the arguments are allowable + */ + + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) break; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } - break; - case IP_TTL: - if (optlen<1) - goto e_inval; - if (val != -1 && (val < 1 || val>255)) - goto e_inval; - inet->uc_ttl = val; - break; - case IP_HDRINCL: - if (sk->sk_type != SOCK_RAW) { - err = -ENOPROTOOPT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + break; + } + + if (!mreq.imr_ifindex) { + if (mreq.imr_address.s_addr == INADDR_ANY) { + inet->mc_index = 0; + inet->mc_addr = 0; + err = 0; break; } - inet->hdrincl = val ? 1 : 0; - break; - case IP_MTU_DISCOVER: - if (val<0 || val>2) - goto e_inval; - inet->pmtudisc = val; - break; - case IP_RECVERR: - inet->recverr = !!val; - if (!val) - skb_queue_purge(&sk->sk_error_queue); - break; - case IP_MULTICAST_TTL: - if (sk->sk_type == SOCK_STREAM) - goto e_inval; - if (optlen<1) - goto e_inval; - if (val==-1) - val = 1; - if (val < 0 || val > 255) - goto e_inval; - inet->mc_ttl = val; - break; - case IP_MULTICAST_LOOP: - if (optlen<1) - goto e_inval; - inet->mc_loop = !!val; - break; - case IP_MULTICAST_IF: - { - struct ip_mreqn mreq; - struct net_device *dev = NULL; + dev = ip_dev_find(mreq.imr_address.s_addr); + if (dev) { + mreq.imr_ifindex = dev->ifindex; + dev_put(dev); + } + } else + dev = __dev_get_by_index(mreq.imr_ifindex); - if (sk->sk_type == SOCK_STREAM) - goto e_inval; - /* - * Check the arguments are allowable - */ - err = -EFAULT; - if (optlen >= sizeof(struct ip_mreqn)) { - if (copy_from_user(&mreq,optval,sizeof(mreq))) - break; - } else { - memset(&mreq, 0, sizeof(mreq)); - if (optlen >= sizeof(struct in_addr) && - copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) - break; - } + err = -EADDRNOTAVAIL; + if (!dev) + break; - if (!mreq.imr_ifindex) { - if (mreq.imr_address.s_addr == INADDR_ANY) { - inet->mc_index = 0; - inet->mc_addr = 0; - err = 0; - break; - } - dev = ip_dev_find(mreq.imr_address.s_addr); - if (dev) { - mreq.imr_ifindex = dev->ifindex; - dev_put(dev); - } - } else - dev = __dev_get_by_index(mreq.imr_ifindex); + err = -EINVAL; + if (sk->sk_bound_dev_if && + mreq.imr_ifindex != sk->sk_bound_dev_if) + break; + inet->mc_index = mreq.imr_ifindex; + inet->mc_addr = mreq.imr_address.s_addr; + err = 0; + break; + } - err = -EADDRNOTAVAIL; - if (!dev) - break; + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + { + struct ip_mreqn mreq; - err = -EINVAL; - if (sk->sk_bound_dev_if && - mreq.imr_ifindex != sk->sk_bound_dev_if) + if (optlen < sizeof(struct ip_mreq)) + goto e_inval; + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + break; + } - inet->mc_index = mreq.imr_ifindex; - inet->mc_addr = mreq.imr_address.s_addr; - err = 0; + if (optname == IP_ADD_MEMBERSHIP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case IP_MSFILTER: + { + extern int sysctl_igmp_max_msf; + struct ip_msfilter *msf; + + if (optlen < IP_MSFILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; break; } + msf = kmalloc(optlen, GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(msf, optval, optlen)) { + kfree(msf); + break; + } + /* numsrc >= (1G-4) overflow in 32 bits */ + if (msf->imsf_numsrc >= 0x3ffffffcU || + msf->imsf_numsrc > sysctl_igmp_max_msf) { + kfree(msf); + err = -ENOBUFS; + break; + } + if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { + kfree(msf); + err = -EINVAL; + break; + } + err = ip_mc_msfilter(sk, msf, 0); + kfree(msf); + break; + } + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + { + struct ip_mreq_source mreqs; + int omode, add; - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - { - struct ip_mreqn mreq; - - if (optlen < sizeof(struct ip_mreq)) - goto e_inval; + if (optlen != sizeof(struct ip_mreq_source)) + goto e_inval; + if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; - if (optlen >= sizeof(struct ip_mreqn)) { - if(copy_from_user(&mreq,optval,sizeof(mreq))) - break; - } else { - memset(&mreq, 0, sizeof(mreq)); - if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) - break; - } - - if (optname == IP_ADD_MEMBERSHIP) - err = ip_mc_join_group(sk, &mreq); - else - err = ip_mc_leave_group(sk, &mreq); break; } - case IP_MSFILTER: - { - extern int sysctl_igmp_max_msf; - struct ip_msfilter *msf; + if (optname == IP_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == IP_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { + struct ip_mreqn mreq; - if (optlen < IP_MSFILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - err = -ENOBUFS; - break; - } - msf = kmalloc(optlen, GFP_KERNEL); - if (msf == 0) { - err = -ENOBUFS; + mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; + mreq.imr_address.s_addr = mreqs.imr_interface; + mreq.imr_ifindex = 0; + err = ip_mc_join_group(sk, &mreq); + if (err && err != -EADDRINUSE) break; - } + omode = MCAST_INCLUDE; + add = 1; + } else /* IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, 0); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in *psin; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + err = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(greq))) + break; + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + goto e_inval; + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { err = -EFAULT; - if (copy_from_user(msf, optval, optlen)) { - kfree(msf); - break; - } - /* numsrc >= (1G-4) overflow in 32 bits */ - if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > sysctl_igmp_max_msf) { - kfree(msf); - err = -ENOBUFS; - break; - } - if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { - kfree(msf); - err = -EINVAL; - break; - } - err = ip_mc_msfilter(sk, msf, 0); - kfree(msf); break; } - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - { - struct ip_mreq_source mreqs; - int omode, add; - - if (optlen != sizeof(struct ip_mreq_source)) - goto e_inval; - if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { - err = -EFAULT; - break; - } - if (optname == IP_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == IP_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { - struct ip_mreqn mreq; - - mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; - mreq.imr_address.s_addr = mreqs.imr_interface; - mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); - if (err && err != -EADDRINUSE) - break; - omode = MCAST_INCLUDE; - add = 1; - } else /* IP_DROP_SOURCE_MEMBERSHIP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, 0); + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) { + err = -EADDRNOTAVAIL; break; } - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct group_req greq; - struct sockaddr_in *psin; + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in *)&greqs.gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; - if (optlen < sizeof(struct group_req)) - goto e_inval; - err = -EFAULT; - if(copy_from_user(&greq, optval, sizeof(greq))) - break; - psin = (struct sockaddr_in *)&greq.gr_group; - if (psin->sin_family != AF_INET) - goto e_inval; - memset(&mreq, 0, sizeof(mreq)); + psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_ifindex = greq.gr_interface; - - if (optname == MCAST_JOIN_GROUP) - err = ip_mc_join_group(sk, &mreq); - else - err = ip_mc_leave_group(sk, &mreq); + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs.gsr_interface; + err = ip_mc_join_group(sk, &mreq); + if (err && err != -EADDRINUSE) + break; + greqs.gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, + greqs.gsr_interface); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_igmp_max_msf; + struct sockaddr_in *psin; + struct ip_msfilter *msf = NULL; + struct group_filter *gsf = NULL; + int msize, i, ifindex; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; break; } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct group_source_req greqs; - struct ip_mreq_source mreqs; - struct sockaddr_in *psin; - int omode, add; - - if (optlen != sizeof(struct group_source_req)) - goto e_inval; - if (copy_from_user(&greqs, optval, sizeof(greqs))) { - err = -EFAULT; - break; - } - if (greqs.gsr_group.ss_family != AF_INET || - greqs.gsr_source.ss_family != AF_INET) { - err = -EADDRNOTAVAIL; - break; - } - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs.gsr_source; - mreqs.imr_sourceaddr = psin->sin_addr.s_addr; - mreqs.imr_interface = 0; /* use index for mc_source */ - - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct ip_mreqn mreq; - - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); - if (err && err != -EADDRINUSE) - break; - greqs.gsr_interface = mreq.imr_ifindex; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, - greqs.gsr_interface); + gsf = kmalloc(optlen,GFP_KERNEL); + if (gsf == 0) { + err = -ENOBUFS; break; } - case MCAST_MSFILTER: - { - extern int sysctl_igmp_max_msf; - struct sockaddr_in *psin; - struct ip_msfilter *msf = NULL; - struct group_filter *gsf = NULL; - int msize, i, ifindex; - - if (optlen < GROUP_FILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - err = -ENOBUFS; - break; - } - gsf = kmalloc(optlen,GFP_KERNEL); - if (gsf == 0) { - err = -ENOBUFS; - break; - } - err = -EFAULT; - if (copy_from_user(gsf, optval, optlen)) { - goto mc_msf_out; - } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sysctl_igmp_max_msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); - msf = kmalloc(msize,GFP_KERNEL); - if (msf == 0) { - err = -ENOBUFS; - goto mc_msf_out; - } - ifindex = gsf->gf_interface; - psin = (struct sockaddr_in *)&gsf->gf_group; - if (psin->sin_family != AF_INET) { - err = -EADDRNOTAVAIL; - goto mc_msf_out; - } - msf->imsf_multiaddr = psin->sin_addr.s_addr; - msf->imsf_interface = 0; - msf->imsf_fmode = gsf->gf_fmode; - msf->imsf_numsrc = gsf->gf_numsrc; + err = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + goto mc_msf_out; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); + msf = kmalloc(msize,GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + goto mc_msf_out; + } + ifindex = gsf->gf_interface; + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) { err = -EADDRNOTAVAIL; - for (i=0; i<gsf->gf_numsrc; ++i) { - psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - - if (psin->sin_family != AF_INET) - goto mc_msf_out; - msf->imsf_slist[i] = psin->sin_addr.s_addr; - } - kfree(gsf); - gsf = NULL; - - err = ip_mc_msfilter(sk, msf, ifindex); -mc_msf_out: - kfree(msf); - kfree(gsf); - break; + goto mc_msf_out; } - case IP_ROUTER_ALERT: - err = ip_ra_control(sk, val ? 1 : 0, NULL); - break; - - case IP_FREEBIND: - if (optlen<1) - goto e_inval; - inet->freebind = !!val; - break; + msf->imsf_multiaddr = psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = gsf->gf_fmode; + msf->imsf_numsrc = gsf->gf_numsrc; + err = -EADDRNOTAVAIL; + for (i=0; i<gsf->gf_numsrc; ++i) { + psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - case IP_IPSEC_POLICY: - case IP_XFRM_POLICY: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - break; - err = xfrm_user_policy(sk, optname, optval, optlen); + if (psin->sin_family != AF_INET) + goto mc_msf_out; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + kfree(gsf); + gsf = NULL; + + err = ip_mc_msfilter(sk, msf, ifindex); + mc_msf_out: + kfree(msf); + kfree(gsf); + break; + } + case IP_ROUTER_ALERT: + err = ip_ra_control(sk, val ? 1 : 0, NULL); + break; + + case IP_FREEBIND: + if (optlen<1) + goto e_inval; + inet->freebind = !!val; + break; + + case IP_IPSEC_POLICY: + case IP_XFRM_POLICY: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) break; + err = xfrm_user_policy(sk, optname, optval, optlen); + break; - default: - err = -ENOPROTOOPT; - break; + default: + err = -ENOPROTOOPT; + break; } release_sock(sk); return err; @@ -948,214 +954,213 @@ EXPORT_SYMBOL(compat_ip_setsockopt); */ static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { struct inet_sock *inet = inet_sk(sk); int val; int len; - if(level!=SOL_IP) + if (level != SOL_IP) return -EOPNOTSUPP; #ifdef CONFIG_IP_MROUTE - if(optname>=MRT_BASE && optname <=MRT_BASE+10) - { + if (optname >= MRT_BASE && optname <= MRT_BASE+10) { return ip_mroute_getsockopt(sk,optname,optval,optlen); } #endif - if(get_user(len,optlen)) + if (get_user(len,optlen)) return -EFAULT; - if(len < 0) + if (len < 0) return -EINVAL; lock_sock(sk); - switch(optname) { - case IP_OPTIONS: - { - unsigned char optbuf[sizeof(struct ip_options)+40]; - struct ip_options * opt = (struct ip_options*)optbuf; - opt->optlen = 0; - if (inet->opt) - memcpy(optbuf, inet->opt, - sizeof(struct ip_options)+ - inet->opt->optlen); - release_sock(sk); - - if (opt->optlen == 0) - return put_user(0, optlen); - - ip_options_undo(opt); - - len = min_t(unsigned int, len, opt->optlen); - if(put_user(len, optlen)) - return -EFAULT; - if(copy_to_user(optval, opt->__data, len)) - return -EFAULT; - return 0; - } - case IP_PKTINFO: - val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; - break; - case IP_RECVTTL: - val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; - break; - case IP_RECVTOS: - val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; - break; - case IP_RECVOPTS: - val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; - break; - case IP_RETOPTS: - val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; - break; - case IP_PASSSEC: - val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; - break; - case IP_TOS: - val = inet->tos; - break; - case IP_TTL: - val = (inet->uc_ttl == -1 ? - sysctl_ip_default_ttl : - inet->uc_ttl); - break; - case IP_HDRINCL: - val = inet->hdrincl; - break; - case IP_MTU_DISCOVER: - val = inet->pmtudisc; - break; - case IP_MTU: - { - struct dst_entry *dst; - val = 0; - dst = sk_dst_get(sk); - if (dst) { - val = dst_mtu(dst); - dst_release(dst); - } - if (!val) { - release_sock(sk); - return -ENOTCONN; - } - break; + switch (optname) { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct ip_options)+40]; + struct ip_options * opt = (struct ip_options*)optbuf; + opt->optlen = 0; + if (inet->opt) + memcpy(optbuf, inet->opt, + sizeof(struct ip_options)+ + inet->opt->optlen); + release_sock(sk); + + if (opt->optlen == 0) + return put_user(0, optlen); + + ip_options_undo(opt); + + len = min_t(unsigned int, len, opt->optlen); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, opt->__data, len)) + return -EFAULT; + return 0; + } + case IP_PKTINFO: + val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; + break; + case IP_RECVOPTS: + val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; + case IP_RETOPTS: + val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; + case IP_PASSSEC: + val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; + break; + case IP_TOS: + val = inet->tos; + break; + case IP_TTL: + val = (inet->uc_ttl == -1 ? + sysctl_ip_default_ttl : + inet->uc_ttl); + break; + case IP_HDRINCL: + val = inet->hdrincl; + break; + case IP_MTU_DISCOVER: + val = inet->pmtudisc; + break; + case IP_MTU: + { + struct dst_entry *dst; + val = 0; + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); } - case IP_RECVERR: - val = inet->recverr; - break; - case IP_MULTICAST_TTL: - val = inet->mc_ttl; - break; - case IP_MULTICAST_LOOP: - val = inet->mc_loop; - break; - case IP_MULTICAST_IF: - { - struct in_addr addr; - len = min_t(unsigned int, len, sizeof(struct in_addr)); - addr.s_addr = inet->mc_addr; + if (!val) { release_sock(sk); - - if(put_user(len, optlen)) - return -EFAULT; - if(copy_to_user(optval, &addr, len)) - return -EFAULT; - return 0; + return -ENOTCONN; } - case IP_MSFILTER: - { - struct ip_msfilter msf; - int err; + break; + } + case IP_RECVERR: + val = inet->recverr; + break; + case IP_MULTICAST_TTL: + val = inet->mc_ttl; + break; + case IP_MULTICAST_LOOP: + val = inet->mc_loop; + break; + case IP_MULTICAST_IF: + { + struct in_addr addr; + len = min_t(unsigned int, len, sizeof(struct in_addr)); + addr.s_addr = inet->mc_addr; + release_sock(sk); - if (len < IP_MSFILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; - } - if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; - } - err = ip_mc_msfget(sk, &msf, - (struct ip_msfilter __user *)optval, optlen); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &addr, len)) + return -EFAULT; + return 0; + } + case IP_MSFILTER: + { + struct ip_msfilter msf; + int err; + + if (len < IP_MSFILTER_SIZE(0)) { release_sock(sk); - return err; + return -EINVAL; } - case MCAST_MSFILTER: - { - struct group_filter gsf; - int err; - - if (len < GROUP_FILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; - } - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; - } - err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { release_sock(sk); - return err; + return -EFAULT; } - case IP_PKTOPTIONS: - { - struct msghdr msg; + err = ip_mc_msfget(sk, &msf, + (struct ip_msfilter __user *)optval, optlen); + release_sock(sk); + return err; + } + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + if (len < GROUP_FILTER_SIZE(0)) { release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_gsfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + case IP_PKTOPTIONS: + { + struct msghdr msg; + + release_sock(sk); - if (sk->sk_type != SOCK_STREAM) - return -ENOPROTOOPT; + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; - msg.msg_control = optval; - msg.msg_controllen = len; - msg.msg_flags = 0; + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; - if (inet->cmsg_flags & IP_CMSG_PKTINFO) { - struct in_pktinfo info; + if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + struct in_pktinfo info; - info.ipi_addr.s_addr = inet->rcv_saddr; - info.ipi_spec_dst.s_addr = inet->rcv_saddr; - info.ipi_ifindex = inet->mc_index; - put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); - } - if (inet->cmsg_flags & IP_CMSG_TTL) { - int hlim = inet->mc_ttl; - put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); - } - len -= msg.msg_controllen; - return put_user(len, optlen); + info.ipi_addr.s_addr = inet->rcv_saddr; + info.ipi_spec_dst.s_addr = inet->rcv_saddr; + info.ipi_ifindex = inet->mc_index; + put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } - case IP_FREEBIND: - val = inet->freebind; - break; - default: - release_sock(sk); - return -ENOPROTOOPT; + if (inet->cmsg_flags & IP_CMSG_TTL) { + int hlim = inet->mc_ttl; + put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IP_FREEBIND: + val = inet->freebind; + break; + default: + release_sock(sk); + return -ENOPROTOOPT; } release_sock(sk); if (len < sizeof(int) && len > 0 && val>=0 && val<255) { unsigned char ucval = (unsigned char)val; len = 1; - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval,&ucval,1)) + if (copy_to_user(optval,&ucval,1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval,&val,len)) + if (copy_to_user(optval,&val,len)) return -EFAULT; } return 0; } int ip_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) + int optname, char __user *optval, int __user *optlen) { int err; @@ -1169,7 +1174,7 @@ int ip_getsockopt(struct sock *sk, int level, ) { int len; - if(get_user(len,optlen)) + if (get_user(len,optlen)) return -EFAULT; lock_sock(sk); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index aa704b88f01..ab86137c71d 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -43,21 +43,15 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { - int err, plen, dlen; struct ipcomp_data *ipcd = x->data; - u8 *start, *scratch; - struct crypto_comp *tfm; - int cpu; - - plen = skb->len; - dlen = IPCOMP_SCRATCH_SIZE; - start = skb->data; + const int plen = skb->len; + int dlen = IPCOMP_SCRATCH_SIZE; + const u8 *start = skb->data; + const int cpu = get_cpu(); + u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); - cpu = get_cpu(); - scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - tfm = *per_cpu_ptr(ipcd->tfms, cpu); - - err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); if (err) goto out; @@ -72,7 +66,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb->truesize += dlen - plen; __skb_put(skb, dlen - plen); - memcpy(skb->data, scratch, dlen); + skb_copy_to_linear_data(skb, scratch, dlen); out: put_cpu(); return err; @@ -90,10 +84,10 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ - iph = skb->nh.iph; + iph = ip_hdr(skb); ipch = (void *)skb->data; iph->protocol = ipch->nexthdr; - skb->h.raw = skb->nh.raw + sizeof(*ipch); + skb->transport_header = skb->network_header + sizeof(*ipch); __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); @@ -103,23 +97,16 @@ out: static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { - int err, plen, dlen, ihlen; - struct iphdr *iph = skb->nh.iph; struct ipcomp_data *ipcd = x->data; - u8 *start, *scratch; - struct crypto_comp *tfm; - int cpu; + const int ihlen = ip_hdrlen(skb); + const int plen = skb->len - ihlen; + int dlen = IPCOMP_SCRATCH_SIZE; + u8 *start = skb->data + ihlen; + const int cpu = get_cpu(); + u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + int err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); - ihlen = iph->ihl * 4; - plen = skb->len - ihlen; - dlen = IPCOMP_SCRATCH_SIZE; - start = skb->data + ihlen; - - cpu = get_cpu(); - scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - tfm = *per_cpu_ptr(ipcd->tfms, cpu); - - err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); if (err) goto out; @@ -142,12 +129,11 @@ out: static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - struct iphdr *iph; struct ip_comp_hdr *ipch; struct ipcomp_data *ipcd = x->data; int hdr_len = 0; + struct iphdr *iph = ip_hdr(skb); - iph = skb->nh.iph; iph->tot_len = htons(skb->len); hdr_len = iph->ihl * 4; if ((skb->len - hdr_len) < ipcd->threshold) { @@ -159,7 +145,7 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; err = ipcomp_compress(x, skb); - iph = skb->nh.iph; + iph = ip_hdr(skb); if (err) { goto out_ok; @@ -188,8 +174,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (skb->h.icmph->type != ICMP_DEST_UNREACH || - skb->h.icmph->code != ICMP_FRAG_NEEDED) + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; spi = htonl(ntohs(ipch->cpi)); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index cf49de1a498..597c800b2fd 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -432,7 +432,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop; /* Basic sanity checks can be done without the lock. */ - rarp = (struct arphdr *)skb->h.raw; + rarp = (struct arphdr *)skb_transport_header(skb); /* If this test doesn't pass, it's not IP, or we should * ignore it anyway. @@ -455,7 +455,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop; /* OK, it is all there and looks valid, process... */ - rarp = (struct arphdr *)skb->h.raw; + rarp = (struct arphdr *)skb_transport_header(skb); rarp_ptr = (unsigned char *) (rarp + 1); /* One reply at a time, please. */ @@ -702,7 +702,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d memset(b, 0, sizeof(struct bootp_pkt)); /* Construct IP header */ - skb->nh.iph = h = &b->iph; + skb_reset_network_header(skb); + h = ip_hdr(skb); h->version = 4; h->ihl = 5; h->tot_len = htons(sizeof(struct bootp_pkt)); @@ -782,7 +783,7 @@ static void __init ic_do_bootp_ext(u8 *ext) u8 *c; printk("DHCP/BOOTP: Got extension %d:",*ext); - for(c=ext+2; c<ext+2+ext[1]; c++) + for (c=ext+2; c<ext+2+ext[1]; c++) printk(" %02x", *c); printk("\n"); #endif @@ -845,7 +846,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str sizeof(struct udphdr))) goto drop; - b = (struct bootp_pkt *) skb->nh.iph; + b = (struct bootp_pkt *)skb_network_header(skb); h = &b->iph; if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) @@ -883,7 +884,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str if (!pskb_may_pull(skb, skb->len)) goto drop; - b = (struct bootp_pkt *) skb->nh.iph; + b = (struct bootp_pkt *)skb_network_header(skb); h = &b->iph; /* One reply at a time, please. */ @@ -938,7 +939,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str if (opt[1] >= 4) memcpy(&server_id, opt + 2, 4); break; - }; + } } #ifdef IPCONFIG_DEBUG @@ -983,7 +984,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_myaddr = NONE; ic_servaddr = NONE; goto drop_unlock; - }; + } ic_dhcp_msgtype = mt; @@ -1094,7 +1095,7 @@ static int __init ic_dynamic(void) retries = CONF_SEND_RETRIES; get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); - for(;;) { + for (;;) { #ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3ec5ce0f549..ebd2f2d532f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -157,10 +157,10 @@ static struct ip_tunnel * ipip_tunnel_lookup(__be32 remote, __be32 local) return NULL; } -static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +static struct ip_tunnel **__ipip_bucket(struct ip_tunnel_parm *parms) { - __be32 remote = t->parms.iph.daddr; - __be32 local = t->parms.iph.saddr; + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; unsigned h = 0; int prio = 0; @@ -175,6 +175,10 @@ static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) return &tunnels[prio][h]; } +static inline struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +{ + return __ipip_bucket(&t->parms); +} static void ipip_tunnel_unlink(struct ip_tunnel *t) { @@ -206,19 +210,9 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c __be32 local = parms->iph.saddr; struct ip_tunnel *t, **tp, *nt; struct net_device *dev; - unsigned h = 0; - int prio = 0; char name[IFNAMSIZ]; - if (remote) { - prio |= 2; - h ^= HASH(remote); - } - if (local) { - prio |= 1; - h ^= HASH(local); - } - for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -280,8 +274,8 @@ static int ipip_err(struct sk_buff *skb, u32 info) ICMP in the real Internet is absolutely infeasible. */ struct iphdr *iph = (struct iphdr*)skb->data; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err; @@ -336,8 +330,8 @@ out: struct iphdr *iph = (struct iphdr*)dp; int hlen = iph->ihl<<2; struct iphdr *eiph; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int rel_type = 0; int rel_code = 0; __be32 rel_info = 0; @@ -354,7 +348,7 @@ out: default: return 0; case ICMP_PARAMETERPROB: - n = ntohl(skb->h.icmph->un.gateway) >> 24; + n = ntohl(icmp_hdr(skb)->un.gateway) >> 24; if (n < hlen) return 0; @@ -373,7 +367,7 @@ out: return 0; case ICMP_FRAG_NEEDED: /* And it is the only really necessary thing :-) */ - n = ntohs(skb->h.icmph->un.frag.mtu); + n = ntohs(icmp_hdr(skb)->un.frag.mtu); if (n < hlen+68) return 0; n -= hlen; @@ -405,7 +399,7 @@ out: dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, skb->data - (u8*)eiph); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); /* Try to guess incoming interface */ memset(&fl, 0, sizeof(fl)); @@ -461,9 +455,10 @@ out: #endif } -static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb) +static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, + struct sk_buff *skb) { - struct iphdr *inner_iph = skb->nh.iph; + struct iphdr *inner_iph = ip_hdr(skb); if (INET_ECN_is_ce(outer_iph->tos)) IP_ECN_set_ce(inner_iph); @@ -471,10 +466,8 @@ static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff static int ipip_rcv(struct sk_buff *skb) { - struct iphdr *iph; struct ip_tunnel *tunnel; - - iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); read_lock(&ipip_lock); if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { @@ -486,8 +479,8 @@ static int ipip_rcv(struct sk_buff *skb) secpath_reset(skb); - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; @@ -521,7 +514,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = skb->nh.iph; + struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; @@ -615,11 +608,12 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; - old_iph = skb->nh.iph; + old_iph = ip_hdr(skb); } - skb->h.raw = skb->nh.raw; - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); @@ -630,7 +624,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) * Push down and install the IPIP header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 601e3df6925..0ebae413ae8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -62,6 +62,7 @@ #include <linux/netfilter_ipv4.h> #include <net/ipip.h> #include <net/checksum.h> +#include <net/netlink.h> #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) #define CONFIG_IP_PIMSM 1 @@ -302,8 +303,8 @@ static void ipmr_destroy_unres(struct mfc_cache *c) atomic_dec(&cache_resolve_queue_len); - while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { - if (skb->nh.iph->version == 0) { + while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { + if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -479,7 +480,7 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); - if(c==NULL) + if (c==NULL) return NULL; c->mfc_un.res.minvif = MAXVIFS; return c; @@ -488,7 +489,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); - if(c==NULL) + if (c==NULL) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10*HZ; @@ -508,12 +509,13 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) * Play the pending entries through our router */ - while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { - if (skb->nh.iph->version == 0) { + while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { - nlh->nlmsg_len = skb->tail - (u8*)nlh; + nlh->nlmsg_len = (skb_tail_pointer(skb) - + (u8 *)nlh); } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -539,7 +541,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) { struct sk_buff *skb; - int ihl = pkt->nh.iph->ihl<<2; + const int ihl = ip_hdrlen(pkt); struct igmphdr *igmp; struct igmpmsg *msg; int ret; @@ -551,7 +553,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) #endif skb = alloc_skb(128, GFP_ATOMIC); - if(!skb) + if (!skb) return -ENOBUFS; #ifdef CONFIG_IP_PIMSM @@ -561,14 +563,17 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) And all this only to mangle msg->im_msgtype and to set msg->im_mbz to "mbz" :-) */ - msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); - skb->nh.raw = skb->h.raw = (u8*)msg; - memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + msg = (struct igmpmsg *)skb_network_header(skb); + memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = IGMPMSG_WHOLEPKT; msg->im_mbz = 0; msg->im_vif = reg_vif_num; - skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; - skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + + sizeof(struct iphdr)); } else #endif { @@ -577,10 +582,11 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) * Copy the IP header */ - skb->nh.iph = (struct iphdr *)skb_put(skb, ihl); - memcpy(skb->data,pkt->data,ihl); - skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ - msg = (struct igmpmsg*)skb->nh.iph; + skb->network_header = skb->tail; + skb_put(skb, ihl); + skb_copy_to_linear_data(skb, pkt->data, ihl); + ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ + msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; skb->dst = dst_clone(pkt->dst); @@ -592,8 +598,8 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) igmp->type = msg->im_msgtype = assert; igmp->code = 0; - skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ - skb->h.raw = skb->nh.raw; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + skb->transport_header = skb->network_header; } if (mroute_socket == NULL) { @@ -622,11 +628,12 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) { int err; struct mfc_cache *c; + const struct iphdr *iph = ip_hdr(skb); spin_lock_bh(&mfc_unres_lock); for (c=mfc_unres_queue; c; c=c->next) { - if (c->mfc_mcastgrp == skb->nh.iph->daddr && - c->mfc_origin == skb->nh.iph->saddr) + if (c->mfc_mcastgrp == iph->daddr && + c->mfc_origin == iph->saddr) break; } @@ -646,9 +653,9 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) /* * Fill in the new cache entry */ - c->mfc_parent=-1; - c->mfc_origin=skb->nh.iph->saddr; - c->mfc_mcastgrp=skb->nh.iph->daddr; + c->mfc_parent = -1; + c->mfc_origin = iph->saddr; + c->mfc_mcastgrp = iph->daddr; /* * Reflect first query at mrouted. @@ -734,7 +741,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) return 0; } - if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c=ipmr_cache_alloc(); @@ -788,7 +795,7 @@ static void mroute_clean_tables(struct sock *sk) /* * Shut down all active vif entries */ - for(i=0; i<maxvif; i++) { + for (i=0; i<maxvif; i++) { if (!(vif_table[i].flags&VIFF_STATIC)) vif_delete(i); } @@ -858,119 +865,117 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt struct vifctl vif; struct mfcctl mfc; - if(optname!=MRT_INIT) - { - if(sk!=mroute_socket && !capable(CAP_NET_ADMIN)) + if (optname != MRT_INIT) { + if (sk != mroute_socket && !capable(CAP_NET_ADMIN)) return -EACCES; } - switch(optname) - { - case MRT_INIT: - if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->num != IPPROTO_IGMP) - return -EOPNOTSUPP; - if(optlen!=sizeof(int)) - return -ENOPROTOOPT; - - rtnl_lock(); - if (mroute_socket) { - rtnl_unlock(); - return -EADDRINUSE; - } - - ret = ip_ra_control(sk, 1, mrtsock_destruct); - if (ret == 0) { - write_lock_bh(&mrt_lock); - mroute_socket=sk; - write_unlock_bh(&mrt_lock); + switch (optname) { + case MRT_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->num != IPPROTO_IGMP) + return -EOPNOTSUPP; + if (optlen!=sizeof(int)) + return -ENOPROTOOPT; - ipv4_devconf.mc_forwarding++; - } + rtnl_lock(); + if (mroute_socket) { rtnl_unlock(); - return ret; - case MRT_DONE: - if (sk!=mroute_socket) - return -EACCES; - return ip_ra_control(sk, 0, NULL); - case MRT_ADD_VIF: - case MRT_DEL_VIF: - if(optlen!=sizeof(vif)) - return -EINVAL; - if (copy_from_user(&vif,optval,sizeof(vif))) - return -EFAULT; - if(vif.vifc_vifi >= MAXVIFS) - return -ENFILE; - rtnl_lock(); - if (optname==MRT_ADD_VIF) { - ret = vif_add(&vif, sk==mroute_socket); - } else { - ret = vif_delete(vif.vifc_vifi); - } - rtnl_unlock(); - return ret; + return -EADDRINUSE; + } + + ret = ip_ra_control(sk, 1, mrtsock_destruct); + if (ret == 0) { + write_lock_bh(&mrt_lock); + mroute_socket=sk; + write_unlock_bh(&mrt_lock); + + ipv4_devconf.mc_forwarding++; + } + rtnl_unlock(); + return ret; + case MRT_DONE: + if (sk!=mroute_socket) + return -EACCES; + return ip_ra_control(sk, 0, NULL); + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if (optlen!=sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif,optval,sizeof(vif))) + return -EFAULT; + if (vif.vifc_vifi >= MAXVIFS) + return -ENFILE; + rtnl_lock(); + if (optname==MRT_ADD_VIF) { + ret = vif_add(&vif, sk==mroute_socket); + } else { + ret = vif_delete(vif.vifc_vifi); + } + rtnl_unlock(); + return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ - case MRT_ADD_MFC: - case MRT_DEL_MFC: - if(optlen!=sizeof(mfc)) - return -EINVAL; - if (copy_from_user(&mfc,optval, sizeof(mfc))) - return -EFAULT; - rtnl_lock(); - if (optname==MRT_DEL_MFC) - ret = ipmr_mfc_delete(&mfc); - else - ret = ipmr_mfc_add(&mfc, sk==mroute_socket); - rtnl_unlock(); - return ret; + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if (optlen!=sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname==MRT_DEL_MFC) + ret = ipmr_mfc_delete(&mfc); + else + ret = ipmr_mfc_add(&mfc, sk==mroute_socket); + rtnl_unlock(); + return ret; /* * Control PIM assert. */ - case MRT_ASSERT: - { - int v; - if(get_user(v,(int __user *)optval)) - return -EFAULT; - mroute_do_assert=(v)?1:0; - return 0; - } + case MRT_ASSERT: + { + int v; + if (get_user(v,(int __user *)optval)) + return -EFAULT; + mroute_do_assert=(v)?1:0; + return 0; + } #ifdef CONFIG_IP_PIMSM - case MRT_PIM: - { - int v, ret; - if(get_user(v,(int __user *)optval)) - return -EFAULT; - v = (v)?1:0; - rtnl_lock(); - ret = 0; - if (v != mroute_do_pim) { - mroute_do_pim = v; - mroute_do_assert = v; + case MRT_PIM: + { + int v, ret; + if (get_user(v,(int __user *)optval)) + return -EFAULT; + v = (v)?1:0; + rtnl_lock(); + ret = 0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; #ifdef CONFIG_IP_PIMSM_V2 - if (mroute_do_pim) - ret = inet_add_protocol(&pim_protocol, - IPPROTO_PIM); - else - ret = inet_del_protocol(&pim_protocol, - IPPROTO_PIM); - if (ret < 0) - ret = -EAGAIN; + if (mroute_do_pim) + ret = inet_add_protocol(&pim_protocol, + IPPROTO_PIM); + else + ret = inet_del_protocol(&pim_protocol, + IPPROTO_PIM); + if (ret < 0) + ret = -EAGAIN; #endif - } - rtnl_unlock(); - return ret; } + rtnl_unlock(); + return ret; + } #endif - /* - * Spurious command, or MRT_VERSION which you cannot - * set. - */ - default: - return -ENOPROTOOPT; + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; } } @@ -983,7 +988,7 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u int olr; int val; - if(optname!=MRT_VERSION && + if (optname!=MRT_VERSION && #ifdef CONFIG_IP_PIMSM optname!=MRT_PIM && #endif @@ -997,17 +1002,17 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u if (olr < 0) return -EINVAL; - if(put_user(olr,optlen)) + if (put_user(olr,optlen)) return -EFAULT; - if(optname==MRT_VERSION) + if (optname==MRT_VERSION) val=0x0305; #ifdef CONFIG_IP_PIMSM - else if(optname==MRT_PIM) + else if (optname==MRT_PIM) val=mroute_do_pim; #endif else val=mroute_do_assert; - if(copy_to_user(optval,&val,olr)) + if (copy_to_user(optval,&val,olr)) return -EFAULT; return 0; } @@ -1023,48 +1028,47 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) struct vif_device *vif; struct mfc_cache *c; - switch(cmd) - { - case SIOCGETVIFCNT: - if (copy_from_user(&vr,arg,sizeof(vr))) - return -EFAULT; - if(vr.vifi>=maxvif) - return -EINVAL; - read_lock(&mrt_lock); - vif=&vif_table[vr.vifi]; - if(VIF_EXISTS(vr.vifi)) { - vr.icount=vif->pkt_in; - vr.ocount=vif->pkt_out; - vr.ibytes=vif->bytes_in; - vr.obytes=vif->bytes_out; - read_unlock(&mrt_lock); - - if (copy_to_user(arg,&vr,sizeof(vr))) - return -EFAULT; - return 0; - } + switch (cmd) { + case SIOCGETVIFCNT: + if (copy_from_user(&vr,arg,sizeof(vr))) + return -EFAULT; + if (vr.vifi>=maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif=&vif_table[vr.vifi]; + if (VIF_EXISTS(vr.vifi)) { + vr.icount=vif->pkt_in; + vr.ocount=vif->pkt_out; + vr.ibytes=vif->bytes_in; + vr.obytes=vif->bytes_out; read_unlock(&mrt_lock); - return -EADDRNOTAVAIL; - case SIOCGETSGCNT: - if (copy_from_user(&sr,arg,sizeof(sr))) - return -EFAULT; - read_lock(&mrt_lock); - c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); - if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); - - if (copy_to_user(arg,&sr,sizeof(sr))) - return -EFAULT; - return 0; - } + if (copy_to_user(arg,&vr,sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr,arg,sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; read_unlock(&mrt_lock); - return -EADDRNOTAVAIL; - default: - return -ENOIOCTLCMD; + + if (copy_to_user(arg,&sr,sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; } } @@ -1076,7 +1080,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v=&vif_table[0]; - for(ct=0;ct<maxvif;ct++,v++) { + for (ct=0;ct<maxvif;ct++,v++) { if (v->dev==ptr) vif_delete(ct); } @@ -1096,11 +1100,17 @@ static struct notifier_block ip_mr_notifier={ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) { - struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + struct iphdr *iph; + struct iphdr *old_iph = ip_hdr(skb); + + skb_push(skb, sizeof(struct iphdr)); + skb->transport_header = skb->network_header; + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->version = 4; - iph->tos = skb->nh.iph->tos; - iph->ttl = skb->nh.iph->ttl; + iph->tos = old_iph->tos; + iph->ttl = old_iph->ttl; iph->frag_off = 0; iph->daddr = daddr; iph->saddr = saddr; @@ -1110,8 +1120,6 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) ip_select_ident(iph, skb->dst, NULL); ip_send_check(iph); - skb->h.ipiph = skb->nh.iph; - skb->nh.iph = iph; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); nf_reset(skb); } @@ -1134,7 +1142,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &vif_table[vifi]; struct net_device *dev; struct rtable *rt; @@ -1200,8 +1208,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) dst_release(skb->dst); skb->dst = &rt->u.dst; - iph = skb->nh.iph; - ip_decrease_ttl(iph); + ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. * What do we do with netfilter? -- RR */ @@ -1301,7 +1308,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local * Forward the frame */ for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { - if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) { + if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) @@ -1347,7 +1354,7 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) return 0; - } else if (skb->nh.iph->protocol == IPPROTO_IGMP){ + } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ /* IGMPv1 (and broken IGMPv2 implementations sort of Cisco IOS <= 11.2(8)) do not put router alert option to IGMP packets destined to routable @@ -1366,7 +1373,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* * No usable cache entry @@ -1426,14 +1433,15 @@ int pim_rcv_v1(struct sk_buff * skb) if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; - pim = (struct igmphdr*)skb->h.raw; + pim = igmp_hdr(skb); if (!mroute_do_pim || skb->len < sizeof(*pim) + sizeof(*encap) || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; - encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + encap = (struct iphdr *)(skb_transport_header(skb) + + sizeof(struct igmphdr)); /* Check that: a. packet is really destinted to a multicast group @@ -1455,9 +1463,9 @@ int pim_rcv_v1(struct sk_buff * skb) if (reg_dev == NULL) goto drop; - skb->mac.raw = skb->nh.raw; + skb->mac_header = skb->network_header; skb_pull(skb, (u8*)encap - skb->data); - skb->nh.iph = (struct iphdr *)skb->data; + skb_reset_network_header(skb); skb->dev = reg_dev; skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; @@ -1486,7 +1494,7 @@ static int pim_rcv(struct sk_buff * skb) if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; - pim = (struct pimreghdr*)skb->h.raw; + pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || (pim->flags&PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && @@ -1494,7 +1502,8 @@ static int pim_rcv(struct sk_buff * skb) goto drop; /* check if the inner packet is destined to mcast group */ - encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + encap = (struct iphdr *)(skb_transport_header(skb) + + sizeof(struct pimreghdr)); if (!MULTICAST(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + sizeof(*pim) > skb->len) @@ -1510,9 +1519,9 @@ static int pim_rcv(struct sk_buff * skb) if (reg_dev == NULL) goto drop; - skb->mac.raw = skb->nh.raw; + skb->mac_header = skb->network_header; skb_pull(skb, (u8*)encap - skb->data); - skb->nh.iph = (struct iphdr *)skb->data; + skb_reset_network_header(skb); skb->dev = reg_dev; skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; @@ -1537,7 +1546,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) int ct; struct rtnexthop *nhp; struct net_device *dev = vif_table[c->mfc_parent].dev; - u8 *b = skb->tail; + u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; if (dev) @@ -1557,12 +1566,12 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) } } mp_head->rta_type = RTA_MULTIPATH; - mp_head->rta_len = skb->tail - (u8*)mp_head; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; rtm->rtm_type = RTN_MULTICAST; return 1; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -EMSGSIZE; } @@ -1577,6 +1586,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) if (cache==NULL) { struct sk_buff *skb2; + struct iphdr *iph; struct net_device *dev; int vif; @@ -1596,11 +1606,13 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) return -ENOMEM; } - skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr)); - skb2->nh.iph->ihl = sizeof(struct iphdr)>>2; - skb2->nh.iph->saddr = rt->rt_src; - skb2->nh.iph->daddr = rt->rt_dst; - skb2->nh.iph->version = 0; + skb_push(skb2, sizeof(struct iphdr)); + skb_reset_network_header(skb2); + iph = ip_hdr(skb2); + iph->ihl = sizeof(struct iphdr) >> 2; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + iph->version = 0; err = ipmr_cache_unresolved(vif, skb2); read_unlock(&mrt_lock); return err; @@ -1625,7 +1637,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, loff_t pos) { for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { - if(!VIF_EXISTS(iter->ct)) + if (!VIF_EXISTS(iter->ct)) continue; if (pos-- == 0) return &vif_table[iter->ct]; @@ -1649,7 +1661,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ipmr_vif_seq_idx(iter, 0); while (++iter->ct < maxvif) { - if(!VIF_EXISTS(iter->ct)) + if (!VIF_EXISTS(iter->ct)) continue; return &vif_table[iter->ct]; } @@ -1680,7 +1692,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations ipmr_vif_seq_ops = { +static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, .next = ipmr_vif_seq_next, .stop = ipmr_vif_seq_stop, @@ -1732,14 +1744,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) it->cache = mfc_cache_array; read_lock(&mrt_lock); for (it->ct = 0; it->ct < MFC_LINES; it->ct++) - for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) + for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) if (pos-- == 0) return mfc; read_unlock(&mrt_lock); it->cache = &mfc_unres_queue; spin_lock_bh(&mfc_unres_lock); - for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) + for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) if (pos-- == 0) return mfc; spin_unlock_bh(&mfc_unres_lock); @@ -1829,9 +1841,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) mfc->mfc_un.res.wrong_if); if (it->cache != &mfc_unres_queue) { - for(n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++ ) { - if(VIF_EXISTS(n) + for (n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++ ) { + if (VIF_EXISTS(n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", @@ -1843,7 +1855,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations ipmr_mfc_seq_ops = { +static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = ipmr_mfc_seq_next, .stop = ipmr_mfc_seq_stop, diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 22e104c6a49..15ad5dd2d98 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -331,14 +331,14 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb, struct ip_vs_app *app) { int diff; - unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + const unsigned int tcp_offset = ip_hdrlen(*pskb); struct tcphdr *th; __u32 seq; if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) return 0; - th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + th = (struct tcphdr *)(skb_network_header(*pskb) + tcp_offset); /* * Remember seq number in case this pkt gets resized @@ -406,14 +406,14 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb, struct ip_vs_app *app) { int diff; - unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + const unsigned int tcp_offset = ip_hdrlen(*pskb); struct tcphdr *th; __u32 seq; if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) return 0; - th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + th = (struct tcphdr *)(skb_network_header(*pskb) + tcp_offset); /* * Remember seq number in case this pkt gets resized @@ -577,7 +577,6 @@ static const struct file_operations ip_vs_app_fops = { int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, char *o_buf, int o_len, char *n_buf, int n_len) { - struct iphdr *iph; int diff; int o_offset; int o_left; @@ -603,12 +602,11 @@ int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, skb_put(skb, diff); memmove(skb->data + o_offset + n_len, skb->data + o_offset + o_len, o_left); - memcpy(skb->data + o_offset, n_buf, n_len); + skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); } /* must update the iph total length here */ - iph = skb->nh.iph; - iph->tot_len = htons(skb->len); + ip_hdr(skb)->tot_len = htons(skb->len); LeaveFunction(9); return 0; diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 24d7b66eb6d..f005a2f929f 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -212,7 +212,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 ports[2]) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ @@ -381,7 +381,7 @@ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); struct ip_vs_dest *dest; __be16 _ports[2], *pptr; @@ -447,7 +447,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp) { __be16 _ports[2], *pptr; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); pptr = skb_header_pointer(skb, iph->ihl*4, sizeof(_ports), _ports); @@ -546,7 +546,7 @@ ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { skb = ip_defrag(skb, user); if (skb) - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); return skb; } @@ -557,9 +557,10 @@ ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset); + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { @@ -617,14 +618,14 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) *related = 1; /* reassemble IP fragments */ - if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); if (!skb) return NF_STOLEN; *pskb = skb; } - iph = skb->nh.iph; + iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) @@ -659,7 +660,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) return NF_ACCEPT; /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; @@ -680,8 +681,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) } /* Ensure the checksum is correct */ - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - ip_vs_checksum_complete(skb, ihl)) { + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", NIPQUAD(iph->saddr)); @@ -712,8 +712,7 @@ static inline int is_tcp_reset(const struct sk_buff *skb) { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; @@ -740,14 +739,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, if (skb->ipvs_property) return NF_ACCEPT; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely(iph->protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_out_icmp(pskb, &related); if (related) return verdict; skb = *pskb; - iph = skb->nh.iph; + iph = ip_hdr(skb); } pp = ip_vs_proto_get(iph->protocol); @@ -755,12 +754,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, return NF_ACCEPT; /* reassemble IP fragments */ - if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && + if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); if (!skb) return NF_STOLEN; - iph = skb->nh.iph; + iph = ip_hdr(skb); *pskb = skb; } @@ -810,8 +809,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp)) goto drop; skb = *pskb; - skb->nh.iph->saddr = cp->vaddr; - ip_send_check(skb->nh.iph); + ip_hdr(skb)->saddr = cp->vaddr; + ip_send_check(ip_hdr(skb)); /* For policy routing, packets originating from this * machine itself may be routed differently to packets @@ -861,7 +860,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) *related = 1; /* reassemble IP fragments */ - if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD); @@ -870,7 +869,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) *pskb = skb; } - iph = skb->nh.iph; + iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) @@ -905,7 +904,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) return NF_ACCEPT; /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; @@ -921,8 +920,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) verdict = NF_DROP; /* Ensure the checksum is correct */ - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - ip_vs_checksum_complete(skb, ihl)) { + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", NIPQUAD(iph->saddr)); @@ -966,19 +964,19 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, || skb->dev == &loopback_dev || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", skb->pkt_type, - skb->nh.iph->protocol, - NIPQUAD(skb->nh.iph->daddr)); + ip_hdr(skb)->protocol, + NIPQUAD(ip_hdr(skb)->daddr)); return NF_ACCEPT; } - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely(iph->protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum); if (related) return verdict; skb = *pskb; - iph = skb->nh.iph; + iph = ip_hdr(skb); } /* Protocol supported? */ @@ -1064,7 +1062,7 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, { int r; - if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP) + if (ip_hdr(*pskb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; return ip_vs_in_icmp(pskb, &r, hooknum); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index 502111fba87..dcf5d46aaa5 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -204,7 +204,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; struct ip_vs_dh_bucket *tbl; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index 847c47af040..344ddbbdc75 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -159,10 +159,10 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, return 0; if (cp->app_data == &ip_vs_ftp_pasv) { - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); data = (char *)th + (th->doff << 2); - data_limit = (*pskb)->tail; + data_limit = skb_tail_pointer(*pskb); if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING, @@ -262,14 +262,14 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Detecting whether it is passive */ - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); /* Since there may be OPTIONS in the TCP packet and the HLEN is the length of the header in 32-bit multiples, it is accurate to calculate data address by th+HLEN*4 */ data = data_start = (char *)th + (th->doff << 2); - data_limit = (*pskb)->tail; + data_limit = skb_tail_pointer(*pskb); while (data <= data_limit - 6) { if (strnicmp(data, "PASV\r\n", 6) == 0) { diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c801273cb88..052f4ed5917 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -521,7 +521,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_dest *dest; struct ip_vs_lblc_table *tbl; struct ip_vs_lblc_entry *en; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 23f9b9e73c8..6225acac7a3 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -775,7 +775,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_dest *dest; struct ip_vs_lblcr_table *tbl; struct ip_vs_lblcr_entry *en; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 8b0505b0931..a842676e1c6 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -52,15 +52,15 @@ ah_conn_in_get(const struct sk_buff *skb, if (likely(!inverse)) { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { @@ -89,15 +89,15 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 16a9ebee2fe..e65577a7700 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -76,16 +76,15 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_service *svc; struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } if (th->syn && - (svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol, - skb->nh.iph->daddr, th->dest))) { + (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, + ip_hdr(skb)->daddr, th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -127,7 +126,7 @@ tcp_snat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + const unsigned int tcphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) @@ -143,7 +142,7 @@ tcp_snat_handler(struct sk_buff **pskb, return 0; } - tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph = (void *)ip_hdr(*pskb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ @@ -175,7 +174,7 @@ tcp_dnat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + const unsigned int tcphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) @@ -194,7 +193,7 @@ tcp_dnat_handler(struct sk_buff **pskb, return 0; } - tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph = (void *)ip_hdr(*pskb) + tcphoff; tcph->dest = cp->dport; /* @@ -224,15 +223,15 @@ tcp_dnat_handler(struct sk_buff **pskb, static int tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) { - unsigned int tcphoff = skb->nh.iph->ihl*4; + const unsigned int tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, - skb->nh.iph->protocol, skb->csum)) { + ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); return 0; @@ -467,8 +466,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) return 0; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 03f0a414cfa..8ee5fe6a101 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -22,7 +22,7 @@ #include <linux/udp.h> #include <net/ip_vs.h> - +#include <net/ip.h> static struct ip_vs_conn * udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, @@ -56,7 +56,7 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp; __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, + pptr = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ports), _ports); if (pptr == NULL) return NULL; @@ -82,15 +82,15 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_service *svc; struct udphdr _udph, *uh; - uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + uh = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } - if ((svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol, - skb->nh.iph->daddr, uh->dest))) { + if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, + ip_hdr(skb)->daddr, uh->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -133,7 +133,7 @@ udp_snat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + const unsigned int udphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) @@ -151,7 +151,7 @@ udp_snat_handler(struct sk_buff **pskb, return 0; } - udph = (void *)(*pskb)->nh.iph + udphoff; + udph = (void *)ip_hdr(*pskb) + udphoff; udph->source = cp->vport; /* @@ -187,7 +187,7 @@ udp_dnat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + unsigned int udphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) @@ -206,7 +206,7 @@ udp_dnat_handler(struct sk_buff **pskb, return 0; } - udph = (void *)(*pskb)->nh.iph + udphoff; + udph = (void *)ip_hdr(*pskb) + udphoff; udph->dest = cp->dport; /* @@ -239,7 +239,7 @@ static int udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) { struct udphdr _udph, *uh; - unsigned int udphoff = skb->nh.iph->ihl*4; + const unsigned int udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) @@ -251,10 +251,10 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(skb->nh.iph->saddr, - skb->nh.iph->daddr, + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, skb->len - udphoff, - skb->nh.iph->protocol, + ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 338668f88fe..1b25b00ef1e 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -201,7 +201,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index e1f77bd7c9a..900ce29db38 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -156,7 +156,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); u8 tos = iph->tos; int mtu; struct flowi fl = { @@ -178,7 +178,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); @@ -193,7 +193,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_rt_put(rt); return NF_STOLEN; } - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); /* drop old route */ dst_release(skb->dst); @@ -226,7 +226,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rtable *rt; /* Route to the other host */ int mtu; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); EnterFunction(10); @@ -245,7 +245,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); @@ -266,8 +266,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) goto tx_error; - skb->nh.iph->daddr = cp->daddr; - ip_send_check(skb->nh.iph); + ip_hdr(skb)->daddr = cp->daddr; + ip_send_check(ip_hdr(skb)); IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); @@ -320,19 +320,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = skb->nh.iph; + struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; __be16 df = old_iph->frag_off; + sk_buff_data_t old_transport_header = skb->transport_header; struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ int mtu; EnterFunction(10); - if (skb->protocol != __constant_htons(ETH_P_IP)) { + if (skb->protocol != htons(ETH_P_IP)) { IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " "ETH_P_IP: %d, skb protocol: %d\n", - __constant_htons(ETH_P_IP), skb->protocol); + htons(ETH_P_IP), skb->protocol); goto tx_error; } @@ -350,9 +351,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb->dst) skb->dst->ops->update_pmtu(skb->dst, mtu); - df |= (old_iph->frag_off&__constant_htons(IP_DF)); + df |= (old_iph->frag_off & htons(IP_DF)); - if ((old_iph->frag_off&__constant_htons(IP_DF)) + if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -377,15 +378,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } kfree_skb(skb); skb = new_skb; - old_iph = skb->nh.iph; + old_iph = ip_hdr(skb); } - skb->h.raw = (void *) old_iph; + skb->transport_header = old_transport_header; /* fix old IP header checksum */ ip_send_check(old_iph); - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); /* drop old route */ @@ -395,7 +397,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * Push down and install the IPIP header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; @@ -435,7 +437,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); int mtu; EnterFunction(10); @@ -445,7 +447,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); @@ -460,7 +462,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_rt_put(rt); return NF_STOLEN; } - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); /* drop old route */ dst_release(skb->dst); @@ -514,12 +516,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) goto tx_error_icmp; /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index 574c735836f..b03c5ca2c82 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c @@ -100,7 +100,7 @@ static int drr_dev_event(struct notifier_block *this, spin_unlock_bh(&state_lock); break; - }; + } return NOTIFY_DONE; } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 6069a11514f..b44192924f9 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -10,7 +10,7 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type) { - struct iphdr *iph = (*pskb)->nh.iph; + const struct iphdr *iph = ip_hdr(*pskb); struct rtable *rt; struct flowi fl = {}; struct dst_entry *odst; @@ -142,7 +142,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info) struct ip_rt_info *rt_info = nf_info_reroute(info); if (info->hook == NF_IP_LOCAL_OUT) { - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); rt_info->tos = iph->tos; rt_info->daddr = iph->daddr; @@ -155,7 +155,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info) const struct ip_rt_info *rt_info = nf_info_reroute(info); if (info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = (*pskb)->nh.iph; + const struct iphdr *iph = ip_hdr(*pskb); if (!(iph->tos == rt_info->tos && iph->daddr == rt_info->daddr @@ -168,7 +168,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info) __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 601808c796e..46509fae9fd 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -30,188 +30,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -# connection tracking, helpers and protocols -config IP_NF_CT_ACCT - bool "Connection tracking flow accounting" - depends on IP_NF_CONNTRACK - help - If this option is enabled, the connection tracking code will - keep per-flow packet and byte counters. - - Those counters can be used for flow-based accounting or the - `connbytes' match. - - If unsure, say `N'. - -config IP_NF_CONNTRACK_MARK - bool 'Connection mark tracking support' - depends on IP_NF_CONNTRACK - help - This option enables support for connection marks, used by the - `CONNMARK' target and `connmark' match. Similar to the mark value - of packets, but this mark value is kept in the conntrack session - instead of the individual packets. - -config IP_NF_CONNTRACK_SECMARK - bool 'Connection tracking security mark support' - depends on IP_NF_CONNTRACK && NETWORK_SECMARK - help - This option enables security markings to be applied to - connections. Typically they are copied to connections from - packets using the CONNSECMARK target and copied back from - connections to packets with the same target, with the packets - being originally labeled via SECMARK. - - If unsure, say 'N'. - -config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events (EXPERIMENTAL)" - depends on EXPERIMENTAL && IP_NF_CONNTRACK - help - If this option is enabled, the connection tracking code will - provide a notifier chain that can be used by other kernel code - to get notified about changes in the connection tracking state. - - IF unsure, say `N'. - -config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface (EXPERIMENTAL)' - depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK - depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m - depends on IP_NF_NAT=n || IP_NF_NAT - help - This option enables support for a netlink-based userspace interface - - -config IP_NF_CT_PROTO_SCTP - tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - With this option enabled, the connection tracking code will - be able to do state tracking on SCTP connections. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. - -config IP_NF_FTP - tristate "FTP protocol support" - depends on IP_NF_CONNTRACK - help - Tracking FTP connections is problematic: special helpers are - required for tracking them, and doing masquerading and other forms - of Network Address Translation on them. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_IRC - tristate "IRC protocol support" - depends on IP_NF_CONNTRACK - ---help--- - There is a commonly-used extension to IRC called - Direct Client-to-Client Protocol (DCC). This enables users to send - files to each other, and also chat to each other without the need - of a server. DCC Sending is used anywhere you send files over IRC, - and DCC Chat is most commonly used by Eggdrop bots. If you are - using NAT, this extension will enable you to send files and initiate - chats. Note that you do NOT need this extension to get files or - have others initiate chats, or everything else in IRC. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_NETBIOS_NS - tristate "NetBIOS name service protocol support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - NetBIOS name service requests are sent as broadcast messages from an - unprivileged port and responded to with unicast messages to the - same port. This make them hard to firewall properly because connection - tracking doesn't deal with broadcasts. This helper tracks locally - originating NetBIOS name service requests and the corresponding - responses. It relies on correct IP address configuration, specifically - netmask and broadcast address. When properly configured, the output - of "ip address show" should look similar to this: - - $ ip -4 address show eth0 - 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000 - inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0 - - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_TFTP - tristate "TFTP protocol support" - depends on IP_NF_CONNTRACK - help - TFTP connection tracking helper, this is required depending - on how restrictive your ruleset is. - If you are using a tftp client behind -j SNAT or -j MASQUERADING - you will need this. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_AMANDA - tristate "Amanda backup protocol support" - depends on IP_NF_CONNTRACK - select TEXTSEARCH - select TEXTSEARCH_KMP - help - If you are running the Amanda backup package <http://www.amanda.org/> - on this machine or machines that will be MASQUERADED through this - machine, then you may want to enable this feature. This allows the - connection tracking and natting code to allow the sub-channels that - Amanda requires for communication of the backup data, messages and - index. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_PPTP - tristate 'PPTP protocol support' - depends on IP_NF_CONNTRACK - help - This module adds support for PPTP (Point to Point Tunnelling - Protocol, RFC2637) connection tracking and NAT. - - If you are running PPTP sessions over a stateful firewall or NAT - box, you may want to enable this feature. - - Please note that not all PPTP modes of operation are supported yet. - For more info, read top of the file - net/ipv4/netfilter/ip_conntrack_pptp.c - - If you want to compile it as a module, say M here and read - Documentation/modules.txt. If unsure, say `N'. - -config IP_NF_H323 - tristate 'H.323 protocol support (EXPERIMENTAL)' - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - H.323 is a VoIP signalling protocol from ITU-T. As one of the most - important VoIP protocols, it is widely used by voice hardware and - software including voice gateways, IP phones, Netmeeting, OpenPhone, - Gnomemeeting, etc. - - With this module you can support H.323 on a connection tracking/NAT - firewall. - - This module supports RAS, Fast Start, H.245 Tunnelling, Call - Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, - whiteboard, file transfer, etc. For more information, please - visit http://nath323.sourceforge.net/. - - If you want to compile it as a module, say 'M' here and read - Documentation/modules.txt. If unsure, say 'N'. - -config IP_NF_SIP - tristate "SIP protocol support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - SIP is an application-layer control protocol that can establish, - modify, and terminate multimedia sessions (conferences) such as - Internet telephony calls. With the ip_conntrack_sip and - the ip_nat_sip modules you can support the protocol on a connection - tracking/NATing firewall. - - To compile it as a module, choose M here. If unsure, say Y. - config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help @@ -361,17 +179,6 @@ config IP_NF_TARGET_ULOG To compile it as a module, choose M here. If unsure, say N. -# NAT + specific targets: ip_conntrack -config IP_NF_NAT - tristate "Full NAT" - depends on IP_NF_IPTABLES && IP_NF_CONNTRACK - help - The Full NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in iptables: see the man page for iptables(8). - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets: nf_conntrack config NF_NAT tristate "Full NAT" @@ -383,11 +190,6 @@ config NF_NAT To compile it as a module, choose M here. If unsure, say N. -config IP_NF_NAT_NEEDED - bool - depends on IP_NF_NAT - default y - config NF_NAT_NEEDED bool depends on NF_NAT @@ -395,7 +197,7 @@ config NF_NAT_NEEDED config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and @@ -407,7 +209,7 @@ config IP_NF_TARGET_MASQUERADE config IP_NF_TARGET_REDIRECT tristate "REDIRECT target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to @@ -418,7 +220,7 @@ config IP_NF_TARGET_REDIRECT config IP_NF_TARGET_NETMAP tristate "NETMAP target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. It maps the network address part, while keeping the host @@ -429,28 +231,13 @@ config IP_NF_TARGET_NETMAP config IP_NF_TARGET_SAME tristate "SAME target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help This option adds a `SAME' target, which works like the standard SNAT target, but attempts to give clients the same IP for all connections. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_NAT_SNMP_BASIC - tristate "Basic SNMP-ALG support (EXPERIMENTAL)" - depends on EXPERIMENTAL && IP_NF_NAT - ---help--- - - This module implements an Application Layer Gateway (ALG) for - SNMP payloads. In conjunction with NAT, it allows a network - management system to access multiple private networks with - conflicting addresses. It works by modifying IP addresses - inside SNMP payloads to match IP-layer NAT mapping. - - This is the "basic" form of SNMP-ALG, as described in RFC 2962 - - To compile it as a module, choose M here. If unsure, say N. - config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support (EXPERIMENTAL)" depends on EXPERIMENTAL && NF_NAT @@ -477,78 +264,37 @@ config NF_NAT_PROTO_GRE tristate depends on NF_NAT && NF_CT_PROTO_GRE -config IP_NF_NAT_FTP - tristate - depends on IP_NF_IPTABLES && IP_NF_CONNTRACK && IP_NF_NAT - default IP_NF_NAT && IP_NF_FTP - config NF_NAT_FTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_FTP -config IP_NF_NAT_IRC - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_IRC=y - default m if IP_NF_IRC=m - config NF_NAT_IRC tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_IRC -config IP_NF_NAT_TFTP - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_TFTP=y - default m if IP_NF_TFTP=m - config NF_NAT_TFTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP -config IP_NF_NAT_AMANDA - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_AMANDA=y - default m if IP_NF_AMANDA=m - config NF_NAT_AMANDA tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_AMANDA -config IP_NF_NAT_PPTP - tristate - depends on IP_NF_NAT!=n && IP_NF_PPTP!=n - default IP_NF_NAT if IP_NF_PPTP=y - default m if IP_NF_PPTP=m - config NF_NAT_PPTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_PPTP select NF_NAT_PROTO_GRE -config IP_NF_NAT_H323 - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_H323=y - default m if IP_NF_H323=m - config NF_NAT_H323 tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_H323 -config IP_NF_NAT_SIP - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_SIP=y - default m if IP_NF_SIP=m - config NF_NAT_SIP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT @@ -606,9 +352,8 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" depends on IP_NF_MANGLE && EXPERIMENTAL - depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 - select IP_NF_CONNTRACK_MARK if IP_NF_CONNTRACK - select NF_CONNTRACK_MARK if NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK_IPV4 + select NF_CONNTRACK_MARK help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 6625ec68180..409d273f6f8 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -2,8 +2,6 @@ # Makefile for the netfilter modules on top of IPv4. # -# objects for the standalone - connection tracking / NAT -ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o # objects for l3 independent conntrack nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) @@ -12,53 +10,14 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o endif endif -ip_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o -nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o -ifneq ($(CONFIG_NF_NAT),) +nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o -else -iptable_nat-objs := ip_nat_rule.o ip_nat_standalone.o -endif - -ip_conntrack_pptp-objs := ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o -ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o - -ip_conntrack_h323-objs := ip_conntrack_helper_h323.o ../../netfilter/nf_conntrack_h323_asn1.o -ip_nat_h323-objs := ip_nat_helper_h323.o # connection tracking -obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o -obj-$(CONFIG_IP_NF_NAT) += ip_nat.o obj-$(CONFIG_NF_NAT) += nf_nat.o -# conntrack netlink interface -obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o - - -# SCTP protocol connection tracking -obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o - -# connection tracking helpers -obj-$(CONFIG_IP_NF_H323) += ip_conntrack_h323.o -obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o -obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o -obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o -obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o -obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o -obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o -obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o - -# NAT helpers (ip_conntrack) -obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o -obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o -obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o -obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o -obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o -obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o -obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o - # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o @@ -78,7 +37,6 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o @@ -100,7 +58,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o -obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5170f5c75f9..cae41215e3c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -166,13 +166,9 @@ static inline int arp_packet_match(const struct arphdr *arphdr, return 0; } - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - unsigned long odev; - memcpy(&odev, outdev + i*sizeof(unsigned long), - sizeof(unsigned long)); - ret |= (odev - ^ ((const unsigned long *)arpinfo->outiface)[i]) - & ((const unsigned long *)arpinfo->outiface_mask)[i]; + for (i = 0, ret = 0; i < IFNAMSIZ; i++) { + ret |= (outdev[i] ^ arpinfo->outiface[i]) + & arpinfo->outiface_mask[i]; } if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { @@ -249,7 +245,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); - arp = (*pskb)->nh.arph; + arp = arp_hdr(*pskb); do { if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { struct arpt_entry_target *t; @@ -301,7 +297,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, t->data); /* Target might have changed stuff. */ - arp = (*pskb)->nh.arph; + arp = arp_hdr(*pskb); if (verdict == ARPT_CONTINUE) e = (void *)e + e->next_offset; diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 709db4d3f48..6298d404e7c 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -30,35 +30,35 @@ target(struct sk_buff **pskb, *pskb = nskb; } - arp = (*pskb)->nh.arph; - arpptr = (*pskb)->nh.raw + sizeof(*arp); + arp = arp_hdr(*pskb); + arpptr = skb_network_header(*pskb) + sizeof(*arp); pln = arp->ar_pln; hln = arp->ar_hln; /* We assume that pln and hln were checked in the match */ if (mangle->flags & ARPT_MANGLE_SDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || - (arpptr + hln > (**pskb).tail)) + (arpptr + hln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, mangle->src_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_SIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || - (arpptr + pln > (**pskb).tail)) + (arpptr + pln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, &mangle->u_s.src_ip, pln); } arpptr += pln; if (mangle->flags & ARPT_MANGLE_TDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || - (arpptr + hln > (**pskb).tail)) + (arpptr + hln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, mangle->tgt_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_TIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || - (arpptr + pln > (**pskb).tail)) + (arpptr + pln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, &mangle->u_t.tgt_ip, pln); } diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c deleted file mode 100644 index 4f561f52c83..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ /dev/null @@ -1,229 +0,0 @@ -/* Amanda extension for IP connection tracking, Version 0.2 - * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> - * based on HW's ip_conntrack_irc.c as well as other modules - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Module load syntax: - * insmod ip_conntrack_amanda.o [master_timeout=n] - * - * Where master_timeout is the timeout (in seconds) of the master - * connection (port 10080). This defaults to 5 minutes but if - * your clients take longer than 5 minutes to do their work - * before getting back to the Amanda server, you can increase - * this value. - * - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/textsearch.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> - -static unsigned int master_timeout = 300; -static char *ts_algo = "kmp"; - -MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); -MODULE_DESCRIPTION("Amanda connection tracking module"); -MODULE_LICENSE("GPL"); -module_param(master_timeout, uint, 0600); -MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); -module_param(ts_algo, charp, 0400); -MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); - -unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp); -EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); - -enum amanda_strings { - SEARCH_CONNECT, - SEARCH_NEWLINE, - SEARCH_DATA, - SEARCH_MESG, - SEARCH_INDEX, -}; - -static struct { - char *string; - size_t len; - struct ts_config *ts; -} search[] = { - [SEARCH_CONNECT] = { - .string = "CONNECT ", - .len = 8, - }, - [SEARCH_NEWLINE] = { - .string = "\n", - .len = 1, - }, - [SEARCH_DATA] = { - .string = "DATA ", - .len = 5, - }, - [SEARCH_MESG] = { - .string = "MESG ", - .len = 5, - }, - [SEARCH_INDEX] = { - .string = "INDEX ", - .len = 6, - }, -}; - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) -{ - struct ts_state ts; - struct ip_conntrack_expect *exp; - unsigned int dataoff, start, stop, off, i; - char pbuf[sizeof("65535")], *tmp; - u_int16_t port, len; - int ret = NF_ACCEPT; - typeof(ip_nat_amanda_hook) ip_nat_amanda; - - /* Only look at packets from the Amanda server */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* increase the UDP timeout of the master connection as replies from - * Amanda clients to the server can be quite delayed */ - ip_ct_refresh(ct, *pskb, master_timeout * HZ); - - /* No data? */ - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - if (dataoff >= (*pskb)->len) { - if (net_ratelimit()) - printk("amanda_help: skblen = %u\n", (*pskb)->len); - return NF_ACCEPT; - } - - memset(&ts, 0, sizeof(ts)); - start = skb_find_text(*pskb, dataoff, (*pskb)->len, - search[SEARCH_CONNECT].ts, &ts); - if (start == UINT_MAX) - goto out; - start += dataoff + search[SEARCH_CONNECT].len; - - memset(&ts, 0, sizeof(ts)); - stop = skb_find_text(*pskb, start, (*pskb)->len, - search[SEARCH_NEWLINE].ts, &ts); - if (stop == UINT_MAX) - goto out; - stop += start; - - for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { - memset(&ts, 0, sizeof(ts)); - off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); - if (off == UINT_MAX) - continue; - off += start + search[i].len; - - len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); - if (skb_copy_bits(*pskb, off, pbuf, len)) - break; - pbuf[len] = '\0'; - - port = simple_strtoul(pbuf, &tmp, 10); - len = tmp - pbuf; - if (port == 0 || len > 5) - break; - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) { - ret = NF_DROP; - goto out; - } - - exp->expectfn = NULL; - exp->flags = 0; - - exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->tuple.dst.u.tcp.port = htons(port); - - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.protonum = 0xFF; - exp->mask.dst.u.tcp.port = htons(0xFFFF); - - /* RCU read locked by nf_hook_slow */ - ip_nat_amanda = rcu_dereference(ip_nat_amanda_hook); - if (ip_nat_amanda) - ret = ip_nat_amanda(pskb, ctinfo, off - dataoff, - len, exp); - else if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - ip_conntrack_expect_put(exp); - } - -out: - return ret; -} - -static struct ip_conntrack_helper amanda_helper = { - .max_expected = 3, - .timeout = 180, - .me = THIS_MODULE, - .help = help, - .name = "amanda", - - .tuple = { .src = { .u = { .udp = {.port = __constant_htons(10080) } } }, - .dst = { .protonum = IPPROTO_UDP }, - }, - .mask = { .src = { .u = { 0xFFFF } }, - .dst = { .protonum = 0xFF }, - }, -}; - -static void __exit ip_conntrack_amanda_fini(void) -{ - int i; - - ip_conntrack_helper_unregister(&amanda_helper); - for (i = 0; i < ARRAY_SIZE(search); i++) - textsearch_destroy(search[i].ts); -} - -static int __init ip_conntrack_amanda_init(void) -{ - int ret, i; - - ret = -ENOMEM; - for (i = 0; i < ARRAY_SIZE(search); i++) { - search[i].ts = textsearch_prepare(ts_algo, search[i].string, - search[i].len, - GFP_KERNEL, TS_AUTOLOAD); - if (search[i].ts == NULL) - goto err; - } - ret = ip_conntrack_helper_register(&amanda_helper); - if (ret < 0) - goto err; - return 0; - -err: - for (; i >= 0; i--) { - if (search[i].ts) - textsearch_destroy(search[i].ts); - } - return ret; -} - -module_init(ip_conntrack_amanda_init); -module_exit(ip_conntrack_amanda_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c deleted file mode 100644 index 23b99ae2cc3..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ /dev/null @@ -1,1550 +0,0 @@ -/* Connection state tracking for netfilter. This is separated from, - but required by, the NAT layer; it can also be used by an iptables - extension. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> - * - new API and handling of conntrack/nat helpers - * - now capable of multiple expectations for one master - * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> - * - add usage/reference counts to ip_conntrack_expect - * - export ip_conntrack[_expect]_{find_get,put} functions - * */ - -#include <linux/types.h> -#include <linux/icmp.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/vmalloc.h> -#include <net/checksum.h> -#include <net/ip.h> -#include <linux/stddef.h> -#include <linux/sysctl.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/jhash.h> -#include <linux/err.h> -#include <linux/percpu.h> -#include <linux/moduleparam.h> -#include <linux/notifier.h> - -/* ip_conntrack_lock protects the main hash table, protocol/helper/expected - registrations, conntrack timers*/ -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> - -#define IP_CONNTRACK_VERSION "2.4" - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -DEFINE_RWLOCK(ip_conntrack_lock); - -/* ip_conntrack_standalone needs this */ -atomic_t ip_conntrack_count = ATOMIC_INIT(0); - -void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; -LIST_HEAD(ip_conntrack_expect_list); -struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly; -static LIST_HEAD(helpers); -unsigned int ip_conntrack_htable_size __read_mostly = 0; -int ip_conntrack_max __read_mostly; -struct list_head *ip_conntrack_hash __read_mostly; -static struct kmem_cache *ip_conntrack_cachep __read_mostly; -static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly; -struct ip_conntrack ip_conntrack_untracked; -unsigned int ip_ct_log_invalid __read_mostly; -static LIST_HEAD(unconfirmed); -static int ip_conntrack_vmalloc __read_mostly; - -static unsigned int ip_conntrack_next_id; -static unsigned int ip_conntrack_expect_next_id; -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain); -ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain); - -DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); - -/* deliver cached events and clear cache entry - must be called with locally - * disabled softirqs */ -static inline void -__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) -{ - DEBUGP("ecache: delivering events for %p\n", ecache->ct); - if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) - atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events, - ecache->ct); - ecache->events = 0; - ip_conntrack_put(ecache->ct); - ecache->ct = NULL; -} - -/* Deliver all cached events for a particular conntrack. This is called - * by code prior to async packet handling or freeing the skb */ -void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) -{ - struct ip_conntrack_ecache *ecache; - - local_bh_disable(); - ecache = &__get_cpu_var(ip_conntrack_ecache); - if (ecache->ct == ct) - __ip_ct_deliver_cached_events(ecache); - local_bh_enable(); -} - -void __ip_ct_event_cache_init(struct ip_conntrack *ct) -{ - struct ip_conntrack_ecache *ecache; - - /* take care of delivering potentially old events */ - ecache = &__get_cpu_var(ip_conntrack_ecache); - BUG_ON(ecache->ct == ct); - if (ecache->ct) - __ip_ct_deliver_cached_events(ecache); - /* initialize for this conntrack/packet */ - ecache->ct = ct; - nf_conntrack_get(&ct->ct_general); -} - -/* flush the event cache - touches other CPU's data and must not be called while - * packets are still passing through the code */ -static void ip_ct_event_cache_flush(void) -{ - struct ip_conntrack_ecache *ecache; - int cpu; - - for_each_possible_cpu(cpu) { - ecache = &per_cpu(ip_conntrack_ecache, cpu); - if (ecache->ct) - ip_conntrack_put(ecache->ct); - } -} -#else -static inline void ip_ct_event_cache_flush(void) {} -#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ - -DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); - -static int ip_conntrack_hash_rnd_initted; -static unsigned int ip_conntrack_hash_rnd; - -static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple, - unsigned int size, unsigned int rnd) -{ - return (jhash_3words((__force u32)tuple->src.ip, - ((__force u32)tuple->dst.ip ^ tuple->dst.protonum), - (tuple->src.u.all | (tuple->dst.u.all << 16)), - rnd) % size); -} - -static u_int32_t -hash_conntrack(const struct ip_conntrack_tuple *tuple) -{ - return __hash_conntrack(tuple, ip_conntrack_htable_size, - ip_conntrack_hash_rnd); -} - -int -ip_ct_get_tuple(const struct iphdr *iph, - const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_protocol *protocol) -{ - /* Never happen */ - if (iph->frag_off & htons(IP_OFFSET)) { - printk("ip_conntrack_core: Frag of proto %u.\n", - iph->protocol); - return 0; - } - - tuple->src.ip = iph->saddr; - tuple->dst.ip = iph->daddr; - tuple->dst.protonum = iph->protocol; - tuple->dst.dir = IP_CT_DIR_ORIGINAL; - - return protocol->pkt_to_tuple(skb, dataoff, tuple); -} - -int -ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig, - const struct ip_conntrack_protocol *protocol) -{ - inverse->src.ip = orig->dst.ip; - inverse->dst.ip = orig->src.ip; - inverse->dst.protonum = orig->dst.protonum; - inverse->dst.dir = !orig->dst.dir; - - return protocol->invert_tuple(inverse, orig); -} - - -/* ip_conntrack_expect helper functions */ -void ip_ct_unlink_expect(struct ip_conntrack_expect *exp) -{ - IP_NF_ASSERT(!timer_pending(&exp->timeout)); - list_del(&exp->list); - CONNTRACK_STAT_INC(expect_delete); - exp->master->expecting--; - ip_conntrack_expect_put(exp); -} - -static void expectation_timed_out(unsigned long ul_expect) -{ - struct ip_conntrack_expect *exp = (void *)ul_expect; - - write_lock_bh(&ip_conntrack_lock); - ip_ct_unlink_expect(exp); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_expect_put(exp); -} - -struct ip_conntrack_expect * -__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *i; - - list_for_each_entry(i, &ip_conntrack_expect_list, list) { - if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) - return i; - } - return NULL; -} - -/* Just find a expectation corresponding to a tuple. */ -struct ip_conntrack_expect * -ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *i; - - read_lock_bh(&ip_conntrack_lock); - i = __ip_conntrack_expect_find(tuple); - if (i) - atomic_inc(&i->use); - read_unlock_bh(&ip_conntrack_lock); - - return i; -} - -/* If an expectation for this connection is found, it gets delete from - * global list then returned. */ -static struct ip_conntrack_expect * -find_expectation(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *i; - - list_for_each_entry(i, &ip_conntrack_expect_list, list) { - /* If master is not in hash table yet (ie. packet hasn't left - this machine yet), how can other end know about expected? - Hence these are not the droids you are looking for (if - master ct never got confirmed, we'd hold a reference to it - and weird things would happen to future packets). */ - if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) - && is_confirmed(i->master)) { - if (i->flags & IP_CT_EXPECT_PERMANENT) { - atomic_inc(&i->use); - return i; - } else if (del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - return i; - } - } - } - return NULL; -} - -/* delete all expectations for this conntrack */ -void ip_ct_remove_expectations(struct ip_conntrack *ct) -{ - struct ip_conntrack_expect *i, *tmp; - - /* Optimization: most connection never expect any others. */ - if (ct->expecting == 0) - return; - - list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { - if (i->master == ct && del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - ip_conntrack_expect_put(i); - } - } -} - -static void -clean_from_lists(struct ip_conntrack *ct) -{ - DEBUGP("clean_from_lists(%p)\n", ct); - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list); - - /* Destroy all pending expectations */ - ip_ct_remove_expectations(ct); -} - -static void -destroy_conntrack(struct nf_conntrack *nfct) -{ - struct ip_conntrack *ct = (struct ip_conntrack *)nfct; - struct ip_conntrack_protocol *proto; - struct ip_conntrack_helper *helper; - typeof(ip_conntrack_destroyed) destroyed; - - DEBUGP("destroy_conntrack(%p)\n", ct); - IP_NF_ASSERT(atomic_read(&nfct->use) == 0); - IP_NF_ASSERT(!timer_pending(&ct->timeout)); - - ip_conntrack_event(IPCT_DESTROY, ct); - set_bit(IPS_DYING_BIT, &ct->status); - - helper = ct->helper; - if (helper && helper->destroy) - helper->destroy(ct); - - /* To make sure we don't get any weird locking issues here: - * destroy_conntrack() MUST NOT be called with a write lock - * to ip_conntrack_lock!!! -HW */ - rcu_read_lock(); - proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); - if (proto && proto->destroy) - proto->destroy(ct); - - destroyed = rcu_dereference(ip_conntrack_destroyed); - if (destroyed) - destroyed(ct); - - rcu_read_unlock(); - - write_lock_bh(&ip_conntrack_lock); - /* Expectations will have been removed in clean_from_lists, - * except TFTP can create an expectation on the first packet, - * before connection is in the list, so we need to clean here, - * too. */ - ip_ct_remove_expectations(ct); - - /* We overload first tuple to link into unconfirmed list. */ - if (!is_confirmed(ct)) { - BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - } - - CONNTRACK_STAT_INC(delete); - write_unlock_bh(&ip_conntrack_lock); - - if (ct->master) - ip_conntrack_put(ct->master); - - DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - ip_conntrack_free(ct); -} - -static void death_by_timeout(unsigned long ul_conntrack) -{ - struct ip_conntrack *ct = (void *)ul_conntrack; - - write_lock_bh(&ip_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. */ - CONNTRACK_STAT_INC(delete_list); - clean_from_lists(ct); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_put(ct); -} - -struct ip_conntrack_tuple_hash * -__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - struct ip_conntrack_tuple_hash *h; - unsigned int hash = hash_conntrack(tuple); - - list_for_each_entry(h, &ip_conntrack_hash[hash], list) { - if (tuplehash_to_ctrack(h) != ignored_conntrack && - ip_ct_tuple_equal(tuple, &h->tuple)) { - CONNTRACK_STAT_INC(found); - return h; - } - CONNTRACK_STAT_INC(searched); - } - - return NULL; -} - -/* Find a connection corresponding to a tuple. */ -struct ip_conntrack_tuple_hash * -ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - struct ip_conntrack_tuple_hash *h; - - read_lock_bh(&ip_conntrack_lock); - h = __ip_conntrack_find(tuple, ignored_conntrack); - if (h) - atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); - read_unlock_bh(&ip_conntrack_lock); - - return h; -} - -static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, - unsigned int hash, - unsigned int repl_hash) -{ - ct->id = ++ip_conntrack_next_id; - list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list, - &ip_conntrack_hash[hash]); - list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list, - &ip_conntrack_hash[repl_hash]); -} - -void ip_conntrack_hash_insert(struct ip_conntrack *ct) -{ - unsigned int hash, repl_hash; - - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - write_lock_bh(&ip_conntrack_lock); - __ip_conntrack_hash_insert(ct, hash, repl_hash); - write_unlock_bh(&ip_conntrack_lock); -} - -/* Confirm a connection given skb; places it in hash table */ -int -__ip_conntrack_confirm(struct sk_buff **pskb) -{ - unsigned int hash, repl_hash; - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - ct = ip_conntrack_get(*pskb, &ctinfo); - - /* ipt_REJECT uses ip_conntrack_attach to attach related - ICMP/TCP RST packets in other direction. Actual packet - which created connection will be IP_CT_NEW or for an - expected connection, IP_CT_RELATED. */ - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - /* We're not in hash table, and we refuse to set up related - connections for unconfirmed conns. But packet copies and - REJECT will give spurious warnings here. */ - /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ - - /* No external references means noone else could have - confirmed us. */ - IP_NF_ASSERT(!is_confirmed(ct)); - DEBUGP("Confirming conntrack %p\n", ct); - - write_lock_bh(&ip_conntrack_lock); - - /* See if there's one in the list already, including reverse: - NAT could have grabbed it without realizing, since we're - not in the hash. If there is, we lost race. */ - list_for_each_entry(h, &ip_conntrack_hash[hash], list) - if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple)) - goto out; - list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list) - if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple)) - goto out; - - /* Remove from unconfirmed list */ - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - - __ip_conntrack_hash_insert(ct, hash, repl_hash); - /* Timer relative to confirmation time, not original - setting time, otherwise we'd get timer wrap in - weird delay cases. */ - ct->timeout.expires += jiffies; - add_timer(&ct->timeout); - atomic_inc(&ct->ct_general.use); - set_bit(IPS_CONFIRMED_BIT, &ct->status); - CONNTRACK_STAT_INC(insert); - write_unlock_bh(&ip_conntrack_lock); - if (ct->helper) - ip_conntrack_event_cache(IPCT_HELPER, *pskb); -#ifdef CONFIG_IP_NF_NAT_NEEDED - if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || - test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) - ip_conntrack_event_cache(IPCT_NATINFO, *pskb); -#endif - ip_conntrack_event_cache(master_ct(ct) ? - IPCT_RELATED : IPCT_NEW, *pskb); - - return NF_ACCEPT; - -out: - CONNTRACK_STAT_INC(insert_failed); - write_unlock_bh(&ip_conntrack_lock); - return NF_DROP; -} - -/* Returns true if a connection correspondings to the tuple (required - for NAT). */ -int -ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - struct ip_conntrack_tuple_hash *h; - - read_lock_bh(&ip_conntrack_lock); - h = __ip_conntrack_find(tuple, ignored_conntrack); - read_unlock_bh(&ip_conntrack_lock); - - return h != NULL; -} - -/* There's a small race here where we may free a just-assured - connection. Too bad: we're in trouble anyway. */ -static int early_drop(struct list_head *chain) -{ - /* Traverse backwards: gives us oldest, which is roughly LRU */ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct = NULL, *tmp; - int dropped = 0; - - read_lock_bh(&ip_conntrack_lock); - list_for_each_entry_reverse(h, chain, list) { - tmp = tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) { - ct = tmp; - atomic_inc(&ct->ct_general.use); - break; - } - } - read_unlock_bh(&ip_conntrack_lock); - - if (!ct) - return dropped; - - if (del_timer(&ct->timeout)) { - death_by_timeout((unsigned long)ct); - dropped = 1; - CONNTRACK_STAT_INC_ATOMIC(early_drop); - } - ip_conntrack_put(ct); - return dropped; -} - -static struct ip_conntrack_helper * -__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_helper *h; - - list_for_each_entry(h, &helpers, list) { - if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask)) - return h; - } - return NULL; -} - -struct ip_conntrack_helper * -ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_helper *helper; - - /* need ip_conntrack_lock to assure that helper exists until - * try_module_get() is called */ - read_lock_bh(&ip_conntrack_lock); - - helper = __ip_conntrack_helper_find(tuple); - if (helper) { - /* need to increase module usage count to assure helper will - * not go away while the caller is e.g. busy putting a - * conntrack in the hash that uses the helper */ - if (!try_module_get(helper->me)) - helper = NULL; - } - - read_unlock_bh(&ip_conntrack_lock); - - return helper; -} - -void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) -{ - module_put(helper->me); -} - -struct ip_conntrack_protocol * -__ip_conntrack_proto_find(u_int8_t protocol) -{ - return ip_ct_protos[protocol]; -} - -/* this is guaranteed to always return a valid protocol helper, since - * it falls back to generic_protocol */ -struct ip_conntrack_protocol * -ip_conntrack_proto_find_get(u_int8_t protocol) -{ - struct ip_conntrack_protocol *p; - - rcu_read_lock(); - p = __ip_conntrack_proto_find(protocol); - if (p) { - if (!try_module_get(p->me)) - p = &ip_conntrack_generic_protocol; - } - rcu_read_unlock(); - - return p; -} - -void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) -{ - module_put(p->me); -} - -struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, - struct ip_conntrack_tuple *repl) -{ - struct ip_conntrack *conntrack; - - if (!ip_conntrack_hash_rnd_initted) { - get_random_bytes(&ip_conntrack_hash_rnd, 4); - ip_conntrack_hash_rnd_initted = 1; - } - - /* We don't want any race condition at early drop stage */ - atomic_inc(&ip_conntrack_count); - - if (ip_conntrack_max - && atomic_read(&ip_conntrack_count) > ip_conntrack_max) { - unsigned int hash = hash_conntrack(orig); - /* Try dropping from this hash chain. */ - if (!early_drop(&ip_conntrack_hash[hash])) { - atomic_dec(&ip_conntrack_count); - if (net_ratelimit()) - printk(KERN_WARNING - "ip_conntrack: table full, dropping" - " packet.\n"); - return ERR_PTR(-ENOMEM); - } - } - - conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC); - if (!conntrack) { - DEBUGP("Can't allocate conntrack.\n"); - atomic_dec(&ip_conntrack_count); - return ERR_PTR(-ENOMEM); - } - - atomic_set(&conntrack->ct_general.use, 1); - conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; - /* Don't set timer yet: wait for confirmation */ - init_timer(&conntrack->timeout); - conntrack->timeout.data = (unsigned long)conntrack; - conntrack->timeout.function = death_by_timeout; - - return conntrack; -} - -void -ip_conntrack_free(struct ip_conntrack *conntrack) -{ - atomic_dec(&ip_conntrack_count); - kmem_cache_free(ip_conntrack_cachep, conntrack); -} - -/* Allocate a new conntrack: we return -ENOMEM if classification - * failed due to stress. Otherwise it really is unclassifiable */ -static struct ip_conntrack_tuple_hash * -init_conntrack(struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) -{ - struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - struct ip_conntrack_expect *exp; - - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - - conntrack = ip_conntrack_alloc(tuple, &repl_tuple); - if (conntrack == NULL || IS_ERR(conntrack)) - return (struct ip_conntrack_tuple_hash *)conntrack; - - if (!protocol->new(conntrack, skb)) { - ip_conntrack_free(conntrack); - return NULL; - } - - write_lock_bh(&ip_conntrack_lock); - exp = find_expectation(tuple); - - if (exp) { - DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", - conntrack, exp); - /* Welcome, Mr. Bond. We've been expecting you... */ - __set_bit(IPS_EXPECTED_BIT, &conntrack->status); - conntrack->master = exp->master; -#ifdef CONFIG_IP_NF_CONNTRACK_MARK - conntrack->mark = exp->master->mark; -#endif -#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ - defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) - /* this is ugly, but there is no other place where to put it */ - conntrack->nat.masq_index = exp->master->nat.masq_index; -#endif -#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK - conntrack->secmark = exp->master->secmark; -#endif - nf_conntrack_get(&conntrack->master->ct_general); - CONNTRACK_STAT_INC(expect_new); - } else { - conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); - - CONNTRACK_STAT_INC(new); - } - - /* Overload tuple linked list to put us in unconfirmed list. */ - list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - - write_unlock_bh(&ip_conntrack_lock); - - if (exp) { - if (exp->expectfn) - exp->expectfn(conntrack, exp); - ip_conntrack_expect_put(exp); - } - - return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; -} - -/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ -static inline struct ip_conntrack * -resolve_normal_ct(struct sk_buff *skb, - struct ip_conntrack_protocol *proto, - int *set_reply, - unsigned int hooknum, - enum ip_conntrack_info *ctinfo) -{ - struct ip_conntrack_tuple tuple; - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct; - - IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); - - if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, - &tuple,proto)) - return NULL; - - /* look for tuple match */ - h = ip_conntrack_find_get(&tuple, NULL); - if (!h) { - h = init_conntrack(&tuple, proto, skb); - if (!h) - return NULL; - if (IS_ERR(h)) - return (void *)h; - } - ct = tuplehash_to_ctrack(h); - - /* It exists; we have (non-exclusive) reference. */ - if (DIRECTION(h) == IP_CT_DIR_REPLY) { - *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; - /* Please set reply bit if this packet OK */ - *set_reply = 1; - } else { - /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - DEBUGP("ip_conntrack_in: normal packet for %p\n", - ct); - *ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - DEBUGP("ip_conntrack_in: related packet for %p\n", - ct); - *ctinfo = IP_CT_RELATED; - } else { - DEBUGP("ip_conntrack_in: new packet for %p\n", - ct); - *ctinfo = IP_CT_NEW; - } - *set_reply = 0; - } - skb->nfct = &ct->ct_general; - skb->nfctinfo = *ctinfo; - return ct; -} - -/* Netfilter hook itself. */ -unsigned int ip_conntrack_in(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - struct ip_conntrack_protocol *proto; - int set_reply = 0; - int ret; - - /* Previously seen (loopback or untracked)? Ignore. */ - if ((*pskb)->nfct) { - CONNTRACK_STAT_INC_ATOMIC(ignore); - return NF_ACCEPT; - } - - /* Never happen */ - if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { - if (net_ratelimit()) { - printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n", - (*pskb)->nh.iph->protocol, hooknum); - } - return NF_DROP; - } - -/* Doesn't cover locally-generated broadcast, so not worth it. */ -#if 0 - /* Ignore broadcast: no `connection'. */ - if ((*pskb)->pkt_type == PACKET_BROADCAST) { - printk("Broadcast packet!\n"); - return NF_ACCEPT; - } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) - == htonl(0x000000FF)) { - printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), - (*pskb)->sk, (*pskb)->pkt_type); - } -#endif - - /* rcu_read_lock()ed by nf_hook_slow */ - proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); - - /* It may be an special packet, error, unclean... - * inverse of the return code tells to the netfilter - * core what to do with the packet. */ - if (proto->error != NULL - && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { - CONNTRACK_STAT_INC_ATOMIC(error); - CONNTRACK_STAT_INC_ATOMIC(invalid); - return -ret; - } - - if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { - /* Not valid part of a connection */ - CONNTRACK_STAT_INC_ATOMIC(invalid); - return NF_ACCEPT; - } - - if (IS_ERR(ct)) { - /* Too stressed to deal. */ - CONNTRACK_STAT_INC_ATOMIC(drop); - return NF_DROP; - } - - IP_NF_ASSERT((*pskb)->nfct); - - ret = proto->packet(ct, *pskb, ctinfo); - if (ret < 0) { - /* Invalid: inverse of the return code tells - * the netfilter core what to do*/ - nf_conntrack_put((*pskb)->nfct); - (*pskb)->nfct = NULL; - CONNTRACK_STAT_INC_ATOMIC(invalid); - return -ret; - } - - if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - ip_conntrack_event_cache(IPCT_STATUS, *pskb); - - return ret; -} - -int invert_tuplepr(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig) -{ - struct ip_conntrack_protocol *proto; - int ret; - - rcu_read_lock(); - proto = __ip_conntrack_proto_find(orig->dst.protonum); - ret = ip_ct_invert_tuple(inverse, orig, proto); - rcu_read_unlock(); - - return ret; -} - -/* Would two expected things clash? */ -static inline int expect_clash(const struct ip_conntrack_expect *a, - const struct ip_conntrack_expect *b) -{ - /* Part covered by intersection of masks must be unequal, - otherwise they clash */ - struct ip_conntrack_tuple intersect_mask - = { { a->mask.src.ip & b->mask.src.ip, - { a->mask.src.u.all & b->mask.src.u.all } }, - { a->mask.dst.ip & b->mask.dst.ip, - { a->mask.dst.u.all & b->mask.dst.u.all }, - a->mask.dst.protonum & b->mask.dst.protonum } }; - - return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); -} - -static inline int expect_matches(const struct ip_conntrack_expect *a, - const struct ip_conntrack_expect *b) -{ - return a->master == b->master - && ip_ct_tuple_equal(&a->tuple, &b->tuple) - && ip_ct_tuple_equal(&a->mask, &b->mask); -} - -/* Generally a bad idea to call this: could have matched already. */ -void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) -{ - struct ip_conntrack_expect *i; - - write_lock_bh(&ip_conntrack_lock); - /* choose the the oldest expectation to evict */ - list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { - if (expect_matches(i, exp) && del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_expect_put(i); - return; - } - } - write_unlock_bh(&ip_conntrack_lock); -} - -/* We don't increase the master conntrack refcount for non-fulfilled - * conntracks. During the conntrack destruction, the expectations are - * always killed before the conntrack itself */ -struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) -{ - struct ip_conntrack_expect *new; - - new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); - if (!new) { - DEBUGP("expect_related: OOM allocating expect\n"); - return NULL; - } - new->master = me; - atomic_set(&new->use, 1); - return new; -} - -void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) -{ - if (atomic_dec_and_test(&exp->use)) - kmem_cache_free(ip_conntrack_expect_cachep, exp); -} - -static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) -{ - atomic_inc(&exp->use); - exp->master->expecting++; - list_add(&exp->list, &ip_conntrack_expect_list); - - init_timer(&exp->timeout); - exp->timeout.data = (unsigned long)exp; - exp->timeout.function = expectation_timed_out; - exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; - add_timer(&exp->timeout); - - exp->id = ++ip_conntrack_expect_next_id; - atomic_inc(&exp->use); - CONNTRACK_STAT_INC(expect_create); -} - -/* Race with expectations being used means we could have none to find; OK. */ -static void evict_oldest_expect(struct ip_conntrack *master) -{ - struct ip_conntrack_expect *i; - - list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { - if (i->master == master) { - if (del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - ip_conntrack_expect_put(i); - } - break; - } - } -} - -static inline int refresh_timer(struct ip_conntrack_expect *i) -{ - if (!del_timer(&i->timeout)) - return 0; - - i->timeout.expires = jiffies + i->master->helper->timeout*HZ; - add_timer(&i->timeout); - return 1; -} - -int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) -{ - struct ip_conntrack_expect *i; - int ret; - - DEBUGP("ip_conntrack_expect_related %p\n", related_to); - DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); - - write_lock_bh(&ip_conntrack_lock); - list_for_each_entry(i, &ip_conntrack_expect_list, list) { - if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { - ret = 0; - goto out; - } - } else if (expect_clash(i, expect)) { - ret = -EBUSY; - goto out; - } - } - - /* Will be over limit? */ - if (expect->master->helper->max_expected && - expect->master->expecting >= expect->master->helper->max_expected) - evict_oldest_expect(expect->master); - - ip_conntrack_expect_insert(expect); - ip_conntrack_expect_event(IPEXP_NEW, expect); - ret = 0; -out: - write_unlock_bh(&ip_conntrack_lock); - return ret; -} - -/* Alter reply tuple (maybe alter helper). This is for NAT, and is - implicitly racy: see __ip_conntrack_confirm */ -void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, - const struct ip_conntrack_tuple *newreply) -{ - write_lock_bh(&ip_conntrack_lock); - /* Should be unconfirmed, so not in hash table yet */ - IP_NF_ASSERT(!is_confirmed(conntrack)); - - DEBUGP("Altering reply tuple of %p to ", conntrack); - DUMP_TUPLE(newreply); - - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = __ip_conntrack_helper_find(newreply); - write_unlock_bh(&ip_conntrack_lock); -} - -int ip_conntrack_helper_register(struct ip_conntrack_helper *me) -{ - BUG_ON(me->timeout == 0); - write_lock_bh(&ip_conntrack_lock); - list_add(&me->list, &helpers); - write_unlock_bh(&ip_conntrack_lock); - - return 0; -} - -struct ip_conntrack_helper * -__ip_conntrack_helper_find_byname(const char *name) -{ - struct ip_conntrack_helper *h; - - list_for_each_entry(h, &helpers, list) { - if (!strcmp(h->name, name)) - return h; - } - - return NULL; -} - -static inline void unhelp(struct ip_conntrack_tuple_hash *i, - const struct ip_conntrack_helper *me) -{ - if (tuplehash_to_ctrack(i)->helper == me) { - ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); - tuplehash_to_ctrack(i)->helper = NULL; - } -} - -void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) -{ - unsigned int i; - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_expect *exp, *tmp; - - /* Need write lock here, to delete helper. */ - write_lock_bh(&ip_conntrack_lock); - list_del(&me->list); - - /* Get rid of expectations */ - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { - if (exp->master->helper == me && del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - /* Get rid of expecteds, set helpers to NULL. */ - list_for_each_entry(h, &unconfirmed, list) - unhelp(h, me); - for (i = 0; i < ip_conntrack_htable_size; i++) { - list_for_each_entry(h, &ip_conntrack_hash[i], list) - unhelp(h, me); - } - write_unlock_bh(&ip_conntrack_lock); - - /* Someone could be still looking at the helper in a bh. */ - synchronize_net(); -} - -/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ -void __ip_ct_refresh_acct(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - unsigned long extra_jiffies, - int do_acct) -{ - int event = 0; - - IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); - IP_NF_ASSERT(skb); - - write_lock_bh(&ip_conntrack_lock); - - /* Only update if this is not a fixed timeout */ - if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { - write_unlock_bh(&ip_conntrack_lock); - return; - } - - /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) { - ct->timeout.expires = extra_jiffies; - event = IPCT_REFRESH; - } else { - /* Need del_timer for race avoidance (may already be dying). */ - if (del_timer(&ct->timeout)) { - ct->timeout.expires = jiffies + extra_jiffies; - add_timer(&ct->timeout); - event = IPCT_REFRESH; - } - } - -#ifdef CONFIG_IP_NF_CT_ACCT - if (do_acct) { - ct->counters[CTINFO2DIR(ctinfo)].packets++; - ct->counters[CTINFO2DIR(ctinfo)].bytes += - ntohs(skb->nh.iph->tot_len); - if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) - || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) - event |= IPCT_COUNTER_FILLING; - } -#endif - - write_unlock_bh(&ip_conntrack_lock); - - /* must be unlocked when calling event cache */ - if (event) - ip_conntrack_event_cache(event, skb); -} - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be - * in ip_conntrack_core, since we don't want the protocols to autoload - * or depend on ctnetlink */ -int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple) -{ - NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16), - &tuple->src.u.tcp.port); - NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16), - &tuple->dst.u.tcp.port); - return 0; - -nfattr_failure: - return -1; -} - -int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], - struct ip_conntrack_tuple *t) -{ - if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) - return -EINVAL; - - t->src.u.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); - t->dst.u.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); - - return 0; -} -#endif - -/* Returns new sk_buff, or NULL */ -struct sk_buff * -ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - skb_orphan(skb); - - local_bh_disable(); - skb = ip_defrag(skb, user); - local_bh_enable(); - - if (skb) - ip_send_check(skb->nh.iph); - return skb; -} - -/* Used by ipt_REJECT. */ -static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - /* This ICMP is in reverse direction to the packet which caused it */ - ct = ip_conntrack_get(skb, &ctinfo); - - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) - ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; - else - ctinfo = IP_CT_RELATED; - - /* Attach to new skbuff, and increment count */ - nskb->nfct = &ct->ct_general; - nskb->nfctinfo = ctinfo; - nf_conntrack_get(nskb->nfct); -} - -/* Bring out ya dead! */ -static struct ip_conntrack * -get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), - void *data, unsigned int *bucket) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct; - - write_lock_bh(&ip_conntrack_lock); - for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { - list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) { - ct = tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } - } - list_for_each_entry(h, &unconfirmed, list) { - ct = tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - write_unlock_bh(&ip_conntrack_lock); - return NULL; - -found: - atomic_inc(&ct->ct_general.use); - write_unlock_bh(&ip_conntrack_lock); - return ct; -} - -void -ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) -{ - struct ip_conntrack *ct; - unsigned int bucket = 0; - - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ - if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); - /* ... else the timer will get him soon. */ - - ip_conntrack_put(ct); - } -} - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. */ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - struct inet_sock *inet = inet_sk(sk); - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - - IP_CT_TUPLE_U_BLANK(&tuple); - tuple.src.ip = inet->rcv_saddr; - tuple.src.u.tcp.port = inet->sport; - tuple.dst.ip = inet->daddr; - tuple.dst.u.tcp.port = inet->dport; - tuple.dst.protonum = IPPROTO_TCP; - - /* We only do TCP at the moment: is there a better way? */ - if (strcmp(sk->sk_prot->name, "TCP")) { - DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = ip_conntrack_find_get(&tuple, NULL); - if (h) { - struct sockaddr_in sin; - struct ip_conntrack *ct = tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", - NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); - ip_conntrack_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", - NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), - NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = &getorigdst, -}; - -static int kill_all(struct ip_conntrack *i, void *data) -{ - return 1; -} - -void ip_conntrack_flush(void) -{ - ip_ct_iterate_cleanup(kill_all, NULL); -} - -static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) -{ - if (vmalloced) - vfree(hash); - else - free_pages((unsigned long)hash, - get_order(sizeof(struct list_head) * size)); -} - -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) -{ - rcu_assign_pointer(ip_ct_attach, NULL); - - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... */ - synchronize_net(); - - ip_ct_event_cache_flush(); - i_see_dead_people: - ip_conntrack_flush(); - if (atomic_read(&ip_conntrack_count) != 0) { - schedule(); - goto i_see_dead_people; - } - /* wait until all references to ip_conntrack_untracked are dropped */ - while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) - schedule(); - - kmem_cache_destroy(ip_conntrack_cachep); - kmem_cache_destroy(ip_conntrack_expect_cachep); - free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, - ip_conntrack_htable_size); - nf_unregister_sockopt(&so_getorigdst); -} - -static struct list_head *alloc_hashtable(int size, int *vmalloced) -{ - struct list_head *hash; - unsigned int i; - - *vmalloced = 0; - hash = (void*)__get_free_pages(GFP_KERNEL, - get_order(sizeof(struct list_head) - * size)); - if (!hash) { - *vmalloced = 1; - printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); - hash = vmalloc(sizeof(struct list_head) * size); - } - - if (hash) - for (i = 0; i < size; i++) - INIT_LIST_HEAD(&hash[i]); - - return hash; -} - -static int set_hashsize(const char *val, struct kernel_param *kp) -{ - int i, bucket, hashsize, vmalloced; - int old_vmalloced, old_size; - int rnd; - struct list_head *hash, *old_hash; - struct ip_conntrack_tuple_hash *h; - - /* On boot, we can set this without any fancy locking. */ - if (!ip_conntrack_htable_size) - return param_set_int(val, kp); - - hashsize = simple_strtol(val, NULL, 0); - if (!hashsize) - return -EINVAL; - - hash = alloc_hashtable(hashsize, &vmalloced); - if (!hash) - return -ENOMEM; - - /* We have to rehash for the new table anyway, so we also can - * use a new random seed */ - get_random_bytes(&rnd, 4); - - write_lock_bh(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) { - while (!list_empty(&ip_conntrack_hash[i])) { - h = list_entry(ip_conntrack_hash[i].next, - struct ip_conntrack_tuple_hash, list); - list_del(&h->list); - bucket = __hash_conntrack(&h->tuple, hashsize, rnd); - list_add_tail(&h->list, &hash[bucket]); - } - } - old_size = ip_conntrack_htable_size; - old_vmalloced = ip_conntrack_vmalloc; - old_hash = ip_conntrack_hash; - - ip_conntrack_htable_size = hashsize; - ip_conntrack_vmalloc = vmalloced; - ip_conntrack_hash = hash; - ip_conntrack_hash_rnd = rnd; - write_unlock_bh(&ip_conntrack_lock); - - free_conntrack_hash(old_hash, old_vmalloced, old_size); - return 0; -} - -module_param_call(hashsize, set_hashsize, param_get_uint, - &ip_conntrack_htable_size, 0600); - -int __init ip_conntrack_init(void) -{ - unsigned int i; - int ret; - - /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB - * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ - if (!ip_conntrack_htable_size) { - ip_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) - / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) - ip_conntrack_htable_size = 8192; - if (ip_conntrack_htable_size < 16) - ip_conntrack_htable_size = 16; - } - ip_conntrack_max = 8 * ip_conntrack_htable_size; - - printk("ip_conntrack version %s (%u buckets, %d max)" - " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, - ip_conntrack_htable_size, ip_conntrack_max, - sizeof(struct ip_conntrack)); - - ret = nf_register_sockopt(&so_getorigdst); - if (ret != 0) { - printk(KERN_ERR "Unable to register netfilter socket option\n"); - return ret; - } - - ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, - &ip_conntrack_vmalloc); - if (!ip_conntrack_hash) { - printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); - goto err_unreg_sockopt; - } - - ip_conntrack_cachep = kmem_cache_create("ip_conntrack", - sizeof(struct ip_conntrack), 0, - 0, NULL, NULL); - if (!ip_conntrack_cachep) { - printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); - goto err_free_hash; - } - - ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", - sizeof(struct ip_conntrack_expect), - 0, 0, NULL, NULL); - if (!ip_conntrack_expect_cachep) { - printk(KERN_ERR "Unable to create ip_expect slab cache\n"); - goto err_free_conntrack_slab; - } - - /* Don't NEED lock here, but good form anyway. */ - write_lock_bh(&ip_conntrack_lock); - for (i = 0; i < MAX_IP_CT_PROTO; i++) - rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol); - /* Sew in builtin protocols. */ - rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp); - rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp); - rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp); - write_unlock_bh(&ip_conntrack_lock); - - /* For use by ipt_REJECT */ - rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach); - - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ - atomic_set(&ip_conntrack_untracked.ct_general.use, 1); - /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); - - return ret; - -err_free_conntrack_slab: - kmem_cache_destroy(ip_conntrack_cachep); -err_free_hash: - free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, - ip_conntrack_htable_size); -err_unreg_sockopt: - nf_unregister_sockopt(&so_getorigdst); - - return -ENOMEM; -} diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c deleted file mode 100644 index 1faa68ab943..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ /dev/null @@ -1,520 +0,0 @@ -/* FTP extension for IP connection tracking. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/ctype.h> -#include <net/checksum.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> -#include <linux/moduleparam.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("ftp connection tracking helper"); - -/* This is slow, but it's simple. --RR */ -static char *ftp_buffer; -static DEFINE_SPINLOCK(ip_ftp_lock); - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -module_param_array(ports, ushort, &ports_c, 0400); - -static int loose; -module_param(loose, bool, 0600); - -unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - enum ip_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp, - u32 *seq); -EXPORT_SYMBOL_GPL(ip_nat_ftp_hook); - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -static int try_rfc959(const char *, size_t, u_int32_t [], char); -static int try_eprt(const char *, size_t, u_int32_t [], char); -static int try_epsv_response(const char *, size_t, u_int32_t [], char); - -static const struct ftp_search { - const char *pattern; - size_t plen; - char skip; - char term; - enum ip_ct_ftp_type ftptype; - int (*getnum)(const char *, size_t, u_int32_t[], char); -} search[IP_CT_DIR_MAX][2] = { - [IP_CT_DIR_ORIGINAL] = { - { - .pattern = "PORT", - .plen = sizeof("PORT") - 1, - .skip = ' ', - .term = '\r', - .ftptype = IP_CT_FTP_PORT, - .getnum = try_rfc959, - }, - { - .pattern = "EPRT", - .plen = sizeof("EPRT") - 1, - .skip = ' ', - .term = '\r', - .ftptype = IP_CT_FTP_EPRT, - .getnum = try_eprt, - }, - }, - [IP_CT_DIR_REPLY] = { - { - .pattern = "227 ", - .plen = sizeof("227 ") - 1, - .skip = '(', - .term = ')', - .ftptype = IP_CT_FTP_PASV, - .getnum = try_rfc959, - }, - { - .pattern = "229 ", - .plen = sizeof("229 ") - 1, - .skip = '(', - .term = ')', - .ftptype = IP_CT_FTP_EPSV, - .getnum = try_epsv_response, - }, - }, -}; - -static int try_number(const char *data, size_t dlen, u_int32_t array[], - int array_size, char sep, char term) -{ - u_int32_t i, len; - - memset(array, 0, sizeof(array[0])*array_size); - - /* Keep data pointing at next char. */ - for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { - if (*data >= '0' && *data <= '9') { - array[i] = array[i]*10 + *data - '0'; - } - else if (*data == sep) - i++; - else { - /* Unexpected character; true if it's the - terminator and we're finished. */ - if (*data == term && i == array_size - 1) - return len; - - DEBUGP("Char %u (got %u nums) `%u' unexpected\n", - len, i, *data); - return 0; - } - } - DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); - - return 0; -} - -/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ -static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6], - char term) -{ - return try_number(data, dlen, array, 6, ',', term); -} - -/* Grab port: number up to delimiter */ -static int get_port(const char *data, int start, size_t dlen, char delim, - u_int32_t array[2]) -{ - u_int16_t port = 0; - int i; - - for (i = start; i < dlen; i++) { - /* Finished? */ - if (data[i] == delim) { - if (port == 0) - break; - array[0] = port >> 8; - array[1] = port; - return i + 1; - } - else if (data[i] >= '0' && data[i] <= '9') - port = port*10 + data[i] - '0'; - else /* Some other crap */ - break; - } - return 0; -} - -/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */ -static int try_eprt(const char *data, size_t dlen, u_int32_t array[6], - char term) -{ - char delim; - int length; - - /* First character is delimiter, then "1" for IPv4, then - delimiter again. */ - if (dlen <= 3) return 0; - delim = data[0]; - if (isdigit(delim) || delim < 33 || delim > 126 - || data[1] != '1' || data[2] != delim) - return 0; - - DEBUGP("EPRT: Got |1|!\n"); - /* Now we have IP address. */ - length = try_number(data + 3, dlen - 3, array, 4, '.', delim); - if (length == 0) - return 0; - - DEBUGP("EPRT: Got IP address!\n"); - /* Start offset includes initial "|1|", and trailing delimiter */ - return get_port(data, 3 + length + 1, dlen, delim, array+4); -} - -/* Returns 0, or length of numbers: |||6446| */ -static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6], - char term) -{ - char delim; - - /* Three delimiters. */ - if (dlen <= 3) return 0; - delim = data[0]; - if (isdigit(delim) || delim < 33 || delim > 126 - || data[1] != delim || data[2] != delim) - return 0; - - return get_port(data, 3, dlen, delim, array+4); -} - -/* Return 1 for match, 0 for accept, -1 for partial. */ -static int find_pattern(const char *data, size_t dlen, - const char *pattern, size_t plen, - char skip, char term, - unsigned int *numoff, - unsigned int *numlen, - u_int32_t array[6], - int (*getnum)(const char *, size_t, u_int32_t[], char)) -{ - size_t i; - - DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); - if (dlen == 0) - return 0; - - if (dlen <= plen) { - /* Short packet: try for partial? */ - if (strnicmp(data, pattern, dlen) == 0) - return -1; - else return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { -#if 0 - size_t i; - - DEBUGP("ftp: string mismatch\n"); - for (i = 0; i < plen; i++) { - DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", - i, data[i], data[i], - pattern[i], pattern[i]); - } -#endif - return 0; - } - - DEBUGP("Pattern matches!\n"); - /* Now we've found the constant string, try to skip - to the 'skip' character */ - for (i = plen; data[i] != skip; i++) - if (i == dlen - 1) return -1; - - /* Skip over the last character */ - i++; - - DEBUGP("Skipped up to `%c'!\n", skip); - - *numoff = i; - *numlen = getnum(data + i, dlen - i, array, term); - if (!*numlen) - return -1; - - DEBUGP("Match succeeded!\n"); - return 1; -} - -/* Look up to see if we're just after a \n. */ -static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) -{ - unsigned int i; - - for (i = 0; i < info->seq_aft_nl_num[dir]; i++) - if (info->seq_aft_nl[dir][i] == seq) - return 1; - return 0; -} - -/* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, - struct sk_buff *skb) -{ - unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; - - /* Look for oldest: if we find exact match, we're done. */ - for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { - if (info->seq_aft_nl[dir][i] == nl_seq) - return; - - if (oldest == info->seq_aft_nl_num[dir] - || before(info->seq_aft_nl[dir][i], oldest)) - oldest = i; - } - - if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { - info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); - } else if (oldest != NUM_SEQ_TO_REMEMBER) { - info->seq_aft_nl[dir][oldest] = nl_seq; - ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); - } -} - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dataoff, datalen; - struct tcphdr _tcph, *th; - char *fb_ptr; - int ret; - u32 seq, array[6] = { 0 }; - int dir = CTINFO2DIR(ctinfo); - unsigned int matchlen, matchoff; - struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info; - struct ip_conntrack_expect *exp; - unsigned int i; - int found = 0, ends_in_nl; - typeof(ip_nat_ftp_hook) ip_nat_ftp; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); - return NF_ACCEPT; - } - - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return NF_ACCEPT; - - dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; - /* No data? */ - if (dataoff >= (*pskb)->len) { - DEBUGP("ftp: pskblen = %u\n", (*pskb)->len); - return NF_ACCEPT; - } - datalen = (*pskb)->len - dataoff; - - spin_lock_bh(&ip_ftp_lock); - fb_ptr = skb_header_pointer(*pskb, dataoff, - (*pskb)->len - dataoff, ftp_buffer); - BUG_ON(fb_ptr == NULL); - - ends_in_nl = (fb_ptr[datalen - 1] == '\n'); - seq = ntohl(th->seq) + datalen; - - /* Look up to see if we're just after a \n. */ - if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { - /* Now if this ends in \n, update ftp info. */ - DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", - ct_ftp_info->seq_aft_nl[0][dir] - old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl); - ret = NF_ACCEPT; - goto out_update_nl; - } - - /* Initialize IP array to expected address (it's not mentioned - in EPSV responses) */ - array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF; - array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF; - array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; - array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; - - for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { - found = find_pattern(fb_ptr, (*pskb)->len - dataoff, - search[dir][i].pattern, - search[dir][i].plen, - search[dir][i].skip, - search[dir][i].term, - &matchoff, &matchlen, - array, - search[dir][i].getnum); - if (found) break; - } - if (found == -1) { - /* We don't usually drop packets. After all, this is - connection tracking, not packet filtering. - However, it is necessary for accurate tracking in - this case. */ - if (net_ratelimit()) - printk("conntrack_ftp: partial %s %u+%u\n", - search[dir][i].pattern, - ntohl(th->seq), datalen); - ret = NF_DROP; - goto out; - } else if (found == 0) { /* No match */ - ret = NF_ACCEPT; - goto out_update_nl; - } - - DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n", - fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); - - /* Allocate expectation which will be inserted */ - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) { - ret = NF_DROP; - goto out; - } - - /* We refer to the reverse direction ("!dir") tuples here, - * because we're expecting something in the other direction. - * Doesn't matter unless NAT is happening. */ - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - - if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]) - != ct->tuplehash[dir].tuple.src.ip) { - /* Enrico Scholz's passive FTP to partially RNAT'd ftp - server: it really wants us to connect to a - different IP address. Simply don't record it for - NAT. */ - DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", - array[0], array[1], array[2], array[3], - NIPQUAD(ct->tuplehash[dir].tuple.src.ip)); - - /* Thanks to Cristiano Lincoln Mattos - <lincoln@cesar.org.br> for reporting this potential - problem (DMZ machines opening holes to internal - networks, or the packet filter itself). */ - if (!loose) { - ret = NF_ACCEPT; - goto out_put_expect; - } - exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) - | (array[2] << 8) | array[3]); - } - - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]); - exp->tuple.src.u.tcp.port = 0; /* Don't care. */ - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask = ((struct ip_conntrack_tuple) - { { htonl(0xFFFFFFFF), { 0 } }, - { htonl(0xFFFFFFFF), { .tcp = { htons(0xFFFF) } }, 0xFF }}); - - exp->expectfn = NULL; - exp->flags = 0; - - /* Now, NAT might want to mangle the packet, and register the - * (possibly changed) expectation itself. */ - ip_nat_ftp = rcu_dereference(ip_nat_ftp_hook); - if (ip_nat_ftp) - ret = ip_nat_ftp(pskb, ctinfo, search[dir][i].ftptype, - matchoff, matchlen, exp, &seq); - else { - /* Can't expect this? Best to drop packet now. */ - if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - else - ret = NF_ACCEPT; - } - -out_put_expect: - ip_conntrack_expect_put(exp); - -out_update_nl: - /* Now if this ends in \n, update ftp info. Seq may have been - * adjusted by NAT code. */ - if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info,dir, *pskb); - out: - spin_unlock_bh(&ip_ftp_lock); - return ret; -} - -static struct ip_conntrack_helper ftp[MAX_PORTS]; -static char ftp_names[MAX_PORTS][sizeof("ftp-65535")]; - -/* Not __exit: called from init() */ -static void ip_conntrack_ftp_fini(void) -{ - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&ftp[i]); - } - - kfree(ftp_buffer); -} - -static int __init ip_conntrack_ftp_init(void) -{ - int i, ret; - char *tmpname; - - ftp_buffer = kmalloc(65536, GFP_KERNEL); - if (!ftp_buffer) - return -ENOMEM; - - if (ports_c == 0) - ports[ports_c++] = FTP_PORT; - - for (i = 0; i < ports_c; i++) { - ftp[i].tuple.src.u.tcp.port = htons(ports[i]); - ftp[i].tuple.dst.protonum = IPPROTO_TCP; - ftp[i].mask.src.u.tcp.port = htons(0xFFFF); - ftp[i].mask.dst.protonum = 0xFF; - ftp[i].max_expected = 1; - ftp[i].timeout = 5 * 60; /* 5 minutes */ - ftp[i].me = THIS_MODULE; - ftp[i].help = help; - - tmpname = &ftp_names[i][0]; - if (ports[i] == FTP_PORT) - sprintf(tmpname, "ftp"); - else - sprintf(tmpname, "ftp-%d", ports[i]); - ftp[i].name = tmpname; - - DEBUGP("ip_ct_ftp: registering helper for port %d\n", - ports[i]); - ret = ip_conntrack_helper_register(&ftp[i]); - - if (ret) { - ip_conntrack_ftp_fini(); - return ret; - } - } - return 0; -} - -module_init(ip_conntrack_ftp_init); -module_exit(ip_conntrack_ftp_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c deleted file mode 100644 index 53eb365ccc7..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ /dev/null @@ -1,1841 +0,0 @@ -/* - * H.323 connection tracking helper - * - * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> - * - * This source code is licensed under General Public License version 2. - * - * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * - * For more information, please see http://nath323.sourceforge.net/ - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> -#include <linux/netfilter_ipv4/ip_conntrack_h323.h> -#include <linux/moduleparam.h> -#include <linux/ctype.h> -#include <linux/inet.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -/* Parameters */ -static unsigned int default_rrq_ttl = 300; -module_param(default_rrq_ttl, uint, 0600); -MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ"); - -static int gkrouted_only = 1; -module_param(gkrouted_only, int, 0600); -MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); - -static int callforward_filter = 1; -module_param(callforward_filter, bool, 0600); -MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " - "if both endpoints are on different sides " - "(determined by routing information)"); - -/* Hooks for NAT */ -int (*set_h245_addr_hook) (struct sk_buff ** pskb, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - __be32 ip, u_int16_t port); -int (*set_h225_addr_hook) (struct sk_buff ** pskb, - unsigned char **data, int dataoff, - TransportAddress * addr, - __be32 ip, u_int16_t port); -int (*set_sig_addr_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count); -int (*set_ras_addr_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count); -int (*nat_rtp_rtcp_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - u_int16_t port, u_int16_t rtp_port, - struct ip_conntrack_expect * rtp_exp, - struct ip_conntrack_expect * rtcp_exp); -int (*nat_t120_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect * exp); -int (*nat_h245_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect * exp); -int (*nat_callforwarding_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect * exp); -int (*nat_q931_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, TransportAddress * addr, int idx, - u_int16_t port, struct ip_conntrack_expect * exp); - - -static DEFINE_SPINLOCK(ip_h323_lock); -static char *h323_buffer; - -/****************************************************************************/ -static int get_tpkt_data(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int *datalen, int *dataoff) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - struct tcphdr _tcph, *th; - int tcpdatalen; - int tcpdataoff; - unsigned char *tpkt; - int tpktlen; - int tpktoff; - - /* Get TCP header */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - /* Get TCP data offset */ - tcpdataoff = (*pskb)->nh.iph->ihl * 4 + th->doff * 4; - - /* Get TCP data length */ - tcpdatalen = (*pskb)->len - tcpdataoff; - if (tcpdatalen <= 0) /* No TCP data */ - goto clear_out; - - if (*data == NULL) { /* first TPKT */ - /* Get first TPKT pointer */ - tpkt = skb_header_pointer(*pskb, tcpdataoff, tcpdatalen, - h323_buffer); - BUG_ON(tpkt == NULL); - - /* Validate TPKT identifier */ - if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { - /* Netmeeting sends TPKT header and data separately */ - if (info->tpkt_len[dir] > 0) { - DEBUGP("ip_ct_h323: previous packet " - "indicated separate TPKT data of %hu " - "bytes\n", info->tpkt_len[dir]); - if (info->tpkt_len[dir] <= tcpdatalen) { - /* Yes, there was a TPKT header - * received */ - *data = tpkt; - *datalen = info->tpkt_len[dir]; - *dataoff = 0; - goto out; - } - - /* Fragmented TPKT */ - if (net_ratelimit()) - printk("ip_ct_h323: " - "fragmented TPKT\n"); - goto clear_out; - } - - /* It is not even a TPKT */ - return 0; - } - tpktoff = 0; - } else { /* Next TPKT */ - tpktoff = *dataoff + *datalen; - tcpdatalen -= tpktoff; - if (tcpdatalen <= 4) /* No more TPKT */ - goto clear_out; - tpkt = *data + *datalen; - - /* Validate TPKT identifier */ - if (tpkt[0] != 0x03 || tpkt[1] != 0) - goto clear_out; - } - - /* Validate TPKT length */ - tpktlen = tpkt[2] * 256 + tpkt[3]; - if (tpktlen < 4) - goto clear_out; - if (tpktlen > tcpdatalen) { - if (tcpdatalen == 4) { /* Separate TPKT header */ - /* Netmeeting sends TPKT header and data separately */ - DEBUGP("ip_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", - tpktlen - 4); - info->tpkt_len[dir] = tpktlen - 4; - return 0; - } - - if (net_ratelimit()) - printk("ip_ct_h323: incomplete TPKT (fragmented?)\n"); - goto clear_out; - } - - /* This is the encapsulated data */ - *data = tpkt + 4; - *datalen = tpktlen - 4; - *dataoff = tpktoff + 4; - - out: - /* Clear TPKT length */ - info->tpkt_len[dir] = 0; - return 1; - - clear_out: - info->tpkt_len[dir] = 0; - return 0; -} - -/****************************************************************************/ -static int get_h245_addr(unsigned char *data, H245_TransportAddress * addr, - __be32 * ip, u_int16_t * port) -{ - unsigned char *p; - - if (addr->choice != eH245_TransportAddress_unicastAddress || - addr->unicastAddress.choice != eUnicastAddress_iPAddress) - return 0; - - p = data + addr->unicastAddress.iPAddress.network; - *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3])); - *port = (p[4] << 8) | (p[5]); - - return 1; -} - -/****************************************************************************/ -static int expect_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - u_int16_t rtp_port; - struct ip_conntrack_expect *rtp_exp; - struct ip_conntrack_expect *rtcp_exp; - typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; - - /* Read RTP or RTCP address */ - if (!get_h245_addr(*data, addr, &ip, &port) || - ip != ct->tuplehash[dir].tuple.src.ip || port == 0) - return 0; - - /* RTP port is even */ - rtp_port = port & (~1); - - /* Create expect for RTP */ - if ((rtp_exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - rtp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - rtp_exp->tuple.src.u.udp.port = 0; - rtp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - rtp_exp->tuple.dst.u.udp.port = htons(rtp_port); - rtp_exp->tuple.dst.protonum = IPPROTO_UDP; - rtp_exp->mask.src.ip = htonl(0xFFFFFFFF); - rtp_exp->mask.src.u.udp.port = 0; - rtp_exp->mask.dst.ip = htonl(0xFFFFFFFF); - rtp_exp->mask.dst.u.udp.port = htons(0xFFFF); - rtp_exp->mask.dst.protonum = 0xFF; - rtp_exp->flags = 0; - - /* Create expect for RTCP */ - if ((rtcp_exp = ip_conntrack_expect_alloc(ct)) == NULL) { - ip_conntrack_expect_put(rtp_exp); - return -1; - } - rtcp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - rtcp_exp->tuple.src.u.udp.port = 0; - rtcp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - rtcp_exp->tuple.dst.u.udp.port = htons(rtp_port + 1); - rtcp_exp->tuple.dst.protonum = IPPROTO_UDP; - rtcp_exp->mask.src.ip = htonl(0xFFFFFFFF); - rtcp_exp->mask.src.u.udp.port = 0; - rtcp_exp->mask.dst.ip = htonl(0xFFFFFFFF); - rtcp_exp->mask.dst.u.udp.port = htons(0xFFFF); - rtcp_exp->mask.dst.protonum = 0xFF; - rtcp_exp->flags = 0; - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook))) { - /* NAT needed */ - ret = nat_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - addr, port, rtp_port, rtp_exp, rtcp_exp); - } else { /* Conntrack only */ - rtp_exp->expectfn = NULL; - rtcp_exp->expectfn = NULL; - - if (ip_conntrack_expect_related(rtp_exp) == 0) { - if (ip_conntrack_expect_related(rtcp_exp) == 0) { - DEBUGP("ip_ct_h323: expect RTP " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtp_exp->tuple.src.ip), - ntohs(rtp_exp->tuple.src.u.udp.port), - NIPQUAD(rtp_exp->tuple.dst.ip), - ntohs(rtp_exp->tuple.dst.u.udp.port)); - DEBUGP("ip_ct_h323: expect RTCP " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtcp_exp->tuple.src.ip), - ntohs(rtcp_exp->tuple.src.u.udp.port), - NIPQUAD(rtcp_exp->tuple.dst.ip), - ntohs(rtcp_exp->tuple.dst.u.udp.port)); - } else { - ip_conntrack_unexpect_related(rtp_exp); - ret = -1; - } - } else - ret = -1; - } - - ip_conntrack_expect_put(rtp_exp); - ip_conntrack_expect_put(rtcp_exp); - - return ret; -} - -/****************************************************************************/ -static int expect_t120(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - typeof(nat_t120_hook) nat_t120; - - /* Read T.120 address */ - if (!get_h245_addr(*data, addr, &ip, &port) || - ip != ct->tuplehash[dir].tuple.src.ip || port == 0) - return 0; - - /* Create expect for T.120 connections */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple channels */ - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_t120 = rcu_dereference(nat_t120_hook))) { - /* NAT needed */ - ret = nat_t120(pskb, ct, ctinfo, data, dataoff, addr, - port, exp); - } else { /* Conntrack only */ - exp->expectfn = NULL; - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_h323: expect T.120 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_h245_channel(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H2250LogicalChannelParameters * channel) -{ - int ret; - - if (channel->options & eH2250LogicalChannelParameters_mediaChannel) { - /* RTP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &channel->mediaChannel); - if (ret < 0) - return -1; - } - - if (channel-> - options & eH2250LogicalChannelParameters_mediaControlChannel) { - /* RTCP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &channel->mediaControlChannel); - if (ret < 0) - return -1; - } - - return 0; -} - -/****************************************************************************/ -static int process_olc(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - OpenLogicalChannel * olc) -{ - int ret; - - DEBUGP("ip_ct_h323: OpenLogicalChannel\n"); - - if (olc->forwardLogicalChannelParameters.multiplexParameters.choice == - eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) - { - ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff, - &olc-> - forwardLogicalChannelParameters. - multiplexParameters. - h2250LogicalChannelParameters); - if (ret < 0) - return -1; - } - - if ((olc->options & - eOpenLogicalChannel_reverseLogicalChannelParameters) && - (olc->reverseLogicalChannelParameters.options & - eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters) - && (olc->reverseLogicalChannelParameters.multiplexParameters. - choice == - eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) - { - ret = - process_h245_channel(pskb, ct, ctinfo, data, dataoff, - &olc-> - reverseLogicalChannelParameters. - multiplexParameters. - h2250LogicalChannelParameters); - if (ret < 0) - return -1; - } - - if ((olc->options & eOpenLogicalChannel_separateStack) && - olc->forwardLogicalChannelParameters.dataType.choice == - eDataType_data && - olc->forwardLogicalChannelParameters.dataType.data.application. - choice == eDataApplicationCapability_application_t120 && - olc->forwardLogicalChannelParameters.dataType.data.application. - t120.choice == eDataProtocolCapability_separateLANStack && - olc->separateStack.networkAddress.choice == - eNetworkAccessParameters_networkAddress_localAreaAddress) { - ret = expect_t120(pskb, ct, ctinfo, data, dataoff, - &olc->separateStack.networkAddress. - localAreaAddress); - if (ret < 0) - return -1; - } - - return 0; -} - -/****************************************************************************/ -static int process_olca(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - OpenLogicalChannelAck * olca) -{ - H2250LogicalChannelAckParameters *ack; - int ret; - - DEBUGP("ip_ct_h323: OpenLogicalChannelAck\n"); - - if ((olca->options & - eOpenLogicalChannelAck_reverseLogicalChannelParameters) && - (olca->reverseLogicalChannelParameters.options & - eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters) - && (olca->reverseLogicalChannelParameters.multiplexParameters. - choice == - eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) - { - ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff, - &olca-> - reverseLogicalChannelParameters. - multiplexParameters. - h2250LogicalChannelParameters); - if (ret < 0) - return -1; - } - - if ((olca->options & - eOpenLogicalChannelAck_forwardMultiplexAckParameters) && - (olca->forwardMultiplexAckParameters.choice == - eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters)) - { - ack = &olca->forwardMultiplexAckParameters. - h2250LogicalChannelAckParameters; - if (ack->options & - eH2250LogicalChannelAckParameters_mediaChannel) { - /* RTP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &ack->mediaChannel); - if (ret < 0) - return -1; - } - - if (ack->options & - eH2250LogicalChannelAckParameters_mediaControlChannel) { - /* RTCP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &ack->mediaControlChannel); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_h245(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - MultimediaSystemControlMessage * mscm) -{ - switch (mscm->choice) { - case eMultimediaSystemControlMessage_request: - if (mscm->request.choice == - eRequestMessage_openLogicalChannel) { - return process_olc(pskb, ct, ctinfo, data, dataoff, - &mscm->request.openLogicalChannel); - } - DEBUGP("ip_ct_h323: H.245 Request %d\n", - mscm->request.choice); - break; - case eMultimediaSystemControlMessage_response: - if (mscm->response.choice == - eResponseMessage_openLogicalChannelAck) { - return process_olca(pskb, ct, ctinfo, data, dataoff, - &mscm->response. - openLogicalChannelAck); - } - DEBUGP("ip_ct_h323: H.245 Response %d\n", - mscm->response.choice); - break; - default: - DEBUGP("ip_ct_h323: H.245 signal %d\n", mscm->choice); - break; - } - - return 0; -} - -/****************************************************************************/ -static int h245_help(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - static MultimediaSystemControlMessage mscm; - unsigned char *data = NULL; - int datalen; - int dataoff; - int ret; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { - return NF_ACCEPT; - } - DEBUGP("ip_ct_h245: skblen = %u\n", (*pskb)->len); - - spin_lock_bh(&ip_h323_lock); - - /* Process each TPKT */ - while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) { - DEBUGP("ip_ct_h245: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), datalen); - - /* Decode H.245 signal */ - ret = DecodeMultimediaSystemControlMessage(data, datalen, - &mscm); - if (ret < 0) { - if (net_ratelimit()) - printk("ip_ct_h245: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); - /* We don't drop when decoding error */ - break; - } - - /* Process H.245 signal */ - if (process_h245(pskb, ct, ctinfo, &data, dataoff, &mscm) < 0) - goto drop; - } - - spin_unlock_bh(&ip_h323_lock); - return NF_ACCEPT; - - drop: - spin_unlock_bh(&ip_h323_lock); - if (net_ratelimit()) - printk("ip_ct_h245: packet dropped\n"); - return NF_DROP; -} - -/****************************************************************************/ -static struct ip_conntrack_helper ip_conntrack_helper_h245 = { - .name = "H.245", - .me = THIS_MODULE, - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */ , - .timeout = 240, - .tuple = {.dst = {.protonum = IPPROTO_TCP}}, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}}, - .help = h245_help -}; - -/****************************************************************************/ -void ip_conntrack_h245_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - write_lock_bh(&ip_conntrack_lock); - new->helper = &ip_conntrack_helper_h245; - write_unlock_bh(&ip_conntrack_lock); -} - -/****************************************************************************/ -int get_h225_addr(unsigned char *data, TransportAddress * addr, - __be32 * ip, u_int16_t * port) -{ - unsigned char *p; - - if (addr->choice != eTransportAddress_ipAddress) - return 0; - - p = data + addr->ipAddress.ip; - *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3])); - *port = (p[4] << 8) | (p[5]); - - return 1; -} - -/****************************************************************************/ -static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - typeof(nat_h245_hook) nat_h245; - - /* Read h245Address */ - if (!get_h225_addr(*data, addr, &ip, &port) || - ip != ct->tuplehash[dir].tuple.src.ip || port == 0) - return 0; - - /* Create expect for h245 connection */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = 0; - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_h245 = rcu_dereference(nat_h245_hook))) { - /* NAT needed */ - ret = nat_h245(pskb, ct, ctinfo, data, dataoff, addr, - port, exp); - } else { /* Conntrack only */ - exp->expectfn = ip_conntrack_h245_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_q931: expect H.245 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/* Forwarding declaration */ -void ip_conntrack_q931_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this); - -/****************************************************************************/ -static int expect_callforwarding(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - typeof(nat_callforwarding_hook) nat_callforwarding; - - /* Read alternativeAddress */ - if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) - return 0; - - /* If the calling party is on the same side of the forward-to party, - * we don't need to track the second call */ - if (callforward_filter) { - struct rtable *rt1, *rt2; - struct flowi fl1 = { - .fl4_dst = ip, - }; - struct flowi fl2 = { - .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, - }; - - if (ip_route_output_key(&rt1, &fl1) == 0) { - if (ip_route_output_key(&rt2, &fl2) == 0) { - if (rt1->rt_gateway == rt2->rt_gateway && - rt1->u.dst.dev == rt2->u.dst.dev) - ret = 1; - dst_release(&rt2->u.dst); - } - dst_release(&rt1->u.dst); - } - if (ret) { - DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); - return 0; - } - } - - /* Create expect for the second call leg */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = 0; - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_callforwarding = rcu_dereference(nat_callforwarding_hook))) { - /* Need NAT */ - ret = nat_callforwarding(pskb, ct, ctinfo, data, dataoff, - addr, port, exp); - } else { /* Conntrack only */ - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_q931: expect Call Forwarding " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Setup_UUIE * setup) -{ - int dir = CTINFO2DIR(ctinfo); - int ret; - int i; - __be32 ip; - u_int16_t port; - typeof(set_h225_addr_hook) set_h225_addr; - - DEBUGP("ip_ct_q931: Setup\n"); - - if (setup->options & eSetup_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &setup->h245Address); - if (ret < 0) - return -1; - } - - set_h225_addr = rcu_dereference(set_h225_addr_hook); - - if ((setup->options & eSetup_UUIE_destCallSignalAddress) && - (set_h225_addr) && - get_h225_addr(*data, &setup->destCallSignalAddress, &ip, &port) && - ip != ct->tuplehash[!dir].tuple.src.ip) { - DEBUGP("ip_ct_q931: set destCallSignalAddress " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.src.ip), - ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); - ret = set_h225_addr(pskb, data, dataoff, - &setup->destCallSignalAddress, - ct->tuplehash[!dir].tuple.src.ip, - ntohs(ct->tuplehash[!dir].tuple.src. - u.tcp.port)); - if (ret < 0) - return -1; - } - - if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && - (set_h225_addr) && - get_h225_addr(*data, &setup->sourceCallSignalAddress, &ip, &port) - && ip != ct->tuplehash[!dir].tuple.dst.ip) { - DEBUGP("ip_ct_q931: set sourceCallSignalAddress " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), - ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); - ret = set_h225_addr(pskb, data, dataoff, - &setup->sourceCallSignalAddress, - ct->tuplehash[!dir].tuple.dst.ip, - ntohs(ct->tuplehash[!dir].tuple.dst. - u.tcp.port)); - if (ret < 0) - return -1; - } - - if (setup->options & eSetup_UUIE_fastStart) { - for (i = 0; i < setup->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &setup->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_callproceeding(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - CallProceeding_UUIE * callproc) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: CallProceeding\n"); - - if (callproc->options & eCallProceeding_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &callproc->h245Address); - if (ret < 0) - return -1; - } - - if (callproc->options & eCallProceeding_UUIE_fastStart) { - for (i = 0; i < callproc->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &callproc->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_connect(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Connect_UUIE * connect) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Connect\n"); - - if (connect->options & eConnect_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &connect->h245Address); - if (ret < 0) - return -1; - } - - if (connect->options & eConnect_UUIE_fastStart) { - for (i = 0; i < connect->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &connect->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_alerting(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Alerting_UUIE * alert) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Alerting\n"); - - if (alert->options & eAlerting_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &alert->h245Address); - if (ret < 0) - return -1; - } - - if (alert->options & eAlerting_UUIE_fastStart) { - for (i = 0; i < alert->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &alert->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_information(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Information_UUIE * info) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Information\n"); - - if (info->options & eInformation_UUIE_fastStart) { - for (i = 0; i < info->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &info->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Facility_UUIE * facility) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Facility\n"); - - if (facility->reason.choice == eFacilityReason_callForwarded) { - if (facility->options & eFacility_UUIE_alternativeAddress) - return expect_callforwarding(pskb, ct, ctinfo, data, - dataoff, - &facility-> - alternativeAddress); - return 0; - } - - if (facility->options & eFacility_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &facility->h245Address); - if (ret < 0) - return -1; - } - - if (facility->options & eFacility_UUIE_fastStart) { - for (i = 0; i < facility->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &facility->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_progress(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Progress_UUIE * progress) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Progress\n"); - - if (progress->options & eProgress_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &progress->h245Address); - if (ret < 0) - return -1; - } - - if (progress->options & eProgress_UUIE_fastStart) { - for (i = 0; i < progress->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &progress->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_q931(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, Q931 * q931) -{ - H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu; - int i; - int ret = 0; - - switch (pdu->h323_message_body.choice) { - case eH323_UU_PDU_h323_message_body_setup: - ret = process_setup(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.setup); - break; - case eH323_UU_PDU_h323_message_body_callProceeding: - ret = process_callproceeding(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body. - callProceeding); - break; - case eH323_UU_PDU_h323_message_body_connect: - ret = process_connect(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.connect); - break; - case eH323_UU_PDU_h323_message_body_alerting: - ret = process_alerting(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.alerting); - break; - case eH323_UU_PDU_h323_message_body_information: - ret = process_information(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body. - information); - break; - case eH323_UU_PDU_h323_message_body_facility: - ret = process_facility(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.facility); - break; - case eH323_UU_PDU_h323_message_body_progress: - ret = process_progress(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.progress); - break; - default: - DEBUGP("ip_ct_q931: Q.931 signal %d\n", - pdu->h323_message_body.choice); - break; - } - - if (ret < 0) - return -1; - - if (pdu->options & eH323_UU_PDU_h245Control) { - for (i = 0; i < pdu->h245Control.count; i++) { - ret = process_h245(pskb, ct, ctinfo, data, dataoff, - &pdu->h245Control.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int q931_help(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - static Q931 q931; - unsigned char *data = NULL; - int datalen; - int dataoff; - int ret; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { - return NF_ACCEPT; - } - DEBUGP("ip_ct_q931: skblen = %u\n", (*pskb)->len); - - spin_lock_bh(&ip_h323_lock); - - /* Process each TPKT */ - while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) { - DEBUGP("ip_ct_q931: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), datalen); - - /* Decode Q.931 signal */ - ret = DecodeQ931(data, datalen, &q931); - if (ret < 0) { - if (net_ratelimit()) - printk("ip_ct_q931: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); - /* We don't drop when decoding error */ - break; - } - - /* Process Q.931 signal */ - if (process_q931(pskb, ct, ctinfo, &data, dataoff, &q931) < 0) - goto drop; - } - - spin_unlock_bh(&ip_h323_lock); - return NF_ACCEPT; - - drop: - spin_unlock_bh(&ip_h323_lock); - if (net_ratelimit()) - printk("ip_ct_q931: packet dropped\n"); - return NF_DROP; -} - -/****************************************************************************/ -static struct ip_conntrack_helper ip_conntrack_helper_q931 = { - .name = "Q.931", - .me = THIS_MODULE, - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4 /* T.120 and H.245 */ , - .timeout = 240, - .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(Q931_PORT)}}}, - .dst = {.protonum = IPPROTO_TCP}}, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}}, - .help = q931_help -}; - -/****************************************************************************/ -void ip_conntrack_q931_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - write_lock_bh(&ip_conntrack_lock); - new->helper = &ip_conntrack_helper_q931; - write_unlock_bh(&ip_conntrack_lock); -} - -/****************************************************************************/ -static unsigned char *get_udp_data(struct sk_buff **pskb, int *datalen) -{ - struct udphdr _uh, *uh; - int dataoff; - - uh = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, sizeof(_uh), - &_uh); - if (uh == NULL) - return NULL; - dataoff = (*pskb)->nh.iph->ihl * 4 + sizeof(_uh); - if (dataoff >= (*pskb)->len) - return NULL; - *datalen = (*pskb)->len - dataoff; - return skb_header_pointer(*pskb, dataoff, *datalen, h323_buffer); -} - -/****************************************************************************/ -static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct, - __be32 ip, u_int16_t port) -{ - struct ip_conntrack_expect *exp; - struct ip_conntrack_tuple tuple; - - tuple.src.ip = 0; - tuple.src.u.tcp.port = 0; - tuple.dst.ip = ip; - tuple.dst.u.tcp.port = htons(port); - tuple.dst.protonum = IPPROTO_TCP; - - exp = __ip_conntrack_expect_find(&tuple); - if (exp && exp->master == ct) - return exp; - return NULL; -} - -/****************************************************************************/ -static int set_expect_timeout(struct ip_conntrack_expect *exp, - unsigned timeout) -{ - if (!exp || !del_timer(&exp->timeout)) - return 0; - - exp->timeout.expires = jiffies + timeout * HZ; - add_timer(&exp->timeout); - - return 1; -} - -/****************************************************************************/ -static int expect_q931(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - int i; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp; - typeof(nat_q931_hook) nat_q931; - - /* Look for the first related address */ - for (i = 0; i < count; i++) { - if (get_h225_addr(*data, &addr[i], &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && port != 0) - break; - } - - if (i >= count) /* Not found */ - return 0; - - /* Create expect for Q.931 */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = gkrouted_only ? /* only accept calls from GK? */ - ct->tuplehash[!dir].tuple.src.ip : 0; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = gkrouted_only ? htonl(0xFFFFFFFF) : 0; - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple calls */ - - nat_q931 = rcu_dereference(nat_q931_hook); - if (nat_q931) { /* Need NAT */ - ret = nat_q931(pskb, ct, ctinfo, data, addr, i, port, exp); - } else { /* Conntrack only */ - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect Q.931 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - - /* Save port for looking up expect in processing RCF */ - info->sig_port[dir] = port; - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_grq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, GatekeeperRequest * grq) -{ - typeof(set_ras_addr_hook) set_ras_addr; - - DEBUGP("ip_ct_ras: GRQ\n"); - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) /* NATed */ - return set_ras_addr(pskb, ct, ctinfo, data, - &grq->rasAddress, 1); - return 0; -} - -/* Declare before using */ -static void ip_conntrack_ras_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this); - -/****************************************************************************/ -static int process_gcf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, GatekeeperConfirm * gcf) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp; - - DEBUGP("ip_ct_ras: GCF\n"); - - if (!get_h225_addr(*data, &gcf->rasAddress, &ip, &port)) - return 0; - - /* Registration port is the same as discovery port */ - if (ip == ct->tuplehash[dir].tuple.src.ip && - port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) - return 0; - - /* Avoid RAS expectation loops. A GCF is never expected. */ - if (test_bit(IPS_EXPECTED_BIT, &ct->status)) - return 0; - - /* Need new expect */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_UDP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = 0; - exp->expectfn = ip_conntrack_ras_expect; - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect RAS " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_rrq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, RegistrationRequest * rrq) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int ret; - typeof(set_ras_addr_hook) set_ras_addr; - - DEBUGP("ip_ct_ras: RRQ\n"); - - ret = expect_q931(pskb, ct, ctinfo, data, - rrq->callSignalAddress.item, - rrq->callSignalAddress.count); - if (ret < 0) - return -1; - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) { - ret = set_ras_addr(pskb, ct, ctinfo, data, - rrq->rasAddress.item, - rrq->rasAddress.count); - if (ret < 0) - return -1; - } - - if (rrq->options & eRegistrationRequest_timeToLive) { - DEBUGP("ip_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive); - info->timeout = rrq->timeToLive; - } else - info->timeout = default_rrq_ttl; - - return 0; -} - -/****************************************************************************/ -static int process_rcf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, RegistrationConfirm * rcf) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int ret; - struct ip_conntrack_expect *exp; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: RCF\n"); - - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) { - ret = set_sig_addr(pskb, ct, ctinfo, data, - rcf->callSignalAddress.item, - rcf->callSignalAddress.count); - if (ret < 0) - return -1; - } - - if (rcf->options & eRegistrationConfirm_timeToLive) { - DEBUGP("ip_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive); - info->timeout = rcf->timeToLive; - } - - if (info->timeout > 0) { - DEBUGP - ("ip_ct_ras: set RAS connection timeout to %u seconds\n", - info->timeout); - ip_ct_refresh(ct, *pskb, info->timeout * HZ); - - /* Set expect timeout */ - read_lock_bh(&ip_conntrack_lock); - exp = find_expect(ct, ct->tuplehash[dir].tuple.dst.ip, - info->sig_port[!dir]); - if (exp) { - DEBUGP("ip_ct_ras: set Q.931 expect " - "(%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu) " - "timeout to %u seconds\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port), - info->timeout); - set_expect_timeout(exp, info->timeout); - } - read_unlock_bh(&ip_conntrack_lock); - } - - return 0; -} - -/****************************************************************************/ -static int process_urq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, UnregistrationRequest * urq) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int ret; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: URQ\n"); - - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) { - ret = set_sig_addr(pskb, ct, ctinfo, data, - urq->callSignalAddress.item, - urq->callSignalAddress.count); - if (ret < 0) - return -1; - } - - /* Clear old expect */ - ip_ct_remove_expectations(ct); - info->sig_port[dir] = 0; - info->sig_port[!dir] = 0; - - /* Give it 30 seconds for UCF or URJ */ - ip_ct_refresh(ct, *pskb, 30 * HZ); - - return 0; -} - -/****************************************************************************/ -static int process_arq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, AdmissionRequest * arq) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - __be32 ip; - u_int16_t port; - typeof(set_h225_addr_hook) set_h225_addr; - - DEBUGP("ip_ct_ras: ARQ\n"); - - set_h225_addr = rcu_dereference(set_h225_addr_hook); - if ((arq->options & eAdmissionRequest_destCallSignalAddress) && - get_h225_addr(*data, &arq->destCallSignalAddress, &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && - port == info->sig_port[dir] && set_h225_addr) { - /* Answering ARQ */ - return set_h225_addr(pskb, data, 0, - &arq->destCallSignalAddress, - ct->tuplehash[!dir].tuple.dst.ip, - info->sig_port[!dir]); - } - - if ((arq->options & eAdmissionRequest_srcCallSignalAddress) && - get_h225_addr(*data, &arq->srcCallSignalAddress, &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && set_h225_addr) { - /* Calling ARQ */ - return set_h225_addr(pskb, data, 0, - &arq->srcCallSignalAddress, - ct->tuplehash[!dir].tuple.dst.ip, - port); - } - - return 0; -} - -/****************************************************************************/ -static int process_acf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, AdmissionConfirm * acf) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: ACF\n"); - - if (!get_h225_addr(*data, &acf->destCallSignalAddress, &ip, &port)) - return 0; - - if (ip == ct->tuplehash[dir].tuple.dst.ip) { /* Answering ACF */ - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) - return set_sig_addr(pskb, ct, ctinfo, data, - &acf->destCallSignalAddress, 1); - return 0; - } - - /* Need new expect */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect Q.931 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_lrq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, LocationRequest * lrq) -{ - typeof(set_ras_addr_hook) set_ras_addr; - - DEBUGP("ip_ct_ras: LRQ\n"); - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) - return set_ras_addr(pskb, ct, ctinfo, data, - &lrq->replyAddress, 1); - return 0; -} - -/****************************************************************************/ -static int process_lcf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, LocationConfirm * lcf) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - - DEBUGP("ip_ct_ras: LCF\n"); - - if (!get_h225_addr(*data, &lcf->callSignalAddress, &ip, &port)) - return 0; - - /* Need new expect for call signal */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect Q.931 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - - ip_conntrack_expect_put(exp); - - /* Ignore rasAddress */ - - return ret; -} - -/****************************************************************************/ -static int process_irr(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, InfoRequestResponse * irr) -{ - int ret; - typeof(set_ras_addr_hook) set_ras_addr; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: IRR\n"); - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) { - ret = set_ras_addr(pskb, ct, ctinfo, data, - &irr->rasAddress, 1); - if (ret < 0) - return -1; - } - - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) { - ret = set_sig_addr(pskb, ct, ctinfo, data, - irr->callSignalAddress.item, - irr->callSignalAddress.count); - if (ret < 0) - return -1; - } - - return 0; -} - -/****************************************************************************/ -static int process_ras(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, RasMessage * ras) -{ - switch (ras->choice) { - case eRasMessage_gatekeeperRequest: - return process_grq(pskb, ct, ctinfo, data, - &ras->gatekeeperRequest); - case eRasMessage_gatekeeperConfirm: - return process_gcf(pskb, ct, ctinfo, data, - &ras->gatekeeperConfirm); - case eRasMessage_registrationRequest: - return process_rrq(pskb, ct, ctinfo, data, - &ras->registrationRequest); - case eRasMessage_registrationConfirm: - return process_rcf(pskb, ct, ctinfo, data, - &ras->registrationConfirm); - case eRasMessage_unregistrationRequest: - return process_urq(pskb, ct, ctinfo, data, - &ras->unregistrationRequest); - case eRasMessage_admissionRequest: - return process_arq(pskb, ct, ctinfo, data, - &ras->admissionRequest); - case eRasMessage_admissionConfirm: - return process_acf(pskb, ct, ctinfo, data, - &ras->admissionConfirm); - case eRasMessage_locationRequest: - return process_lrq(pskb, ct, ctinfo, data, - &ras->locationRequest); - case eRasMessage_locationConfirm: - return process_lcf(pskb, ct, ctinfo, data, - &ras->locationConfirm); - case eRasMessage_infoRequestResponse: - return process_irr(pskb, ct, ctinfo, data, - &ras->infoRequestResponse); - default: - DEBUGP("ip_ct_ras: RAS message %d\n", ras->choice); - break; - } - - return 0; -} - -/****************************************************************************/ -static int ras_help(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - static RasMessage ras; - unsigned char *data; - int datalen = 0; - int ret; - - DEBUGP("ip_ct_ras: skblen = %u\n", (*pskb)->len); - - spin_lock_bh(&ip_h323_lock); - - /* Get UDP data */ - data = get_udp_data(pskb, &datalen); - if (data == NULL) - goto accept; - DEBUGP("ip_ct_ras: RAS message %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), datalen); - - /* Decode RAS message */ - ret = DecodeRasMessage(data, datalen, &ras); - if (ret < 0) { - if (net_ratelimit()) - printk("ip_ct_ras: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); - goto accept; - } - - /* Process RAS message */ - if (process_ras(pskb, ct, ctinfo, &data, &ras) < 0) - goto drop; - - accept: - spin_unlock_bh(&ip_h323_lock); - return NF_ACCEPT; - - drop: - spin_unlock_bh(&ip_h323_lock); - if (net_ratelimit()) - printk("ip_ct_ras: packet dropped\n"); - return NF_DROP; -} - -/****************************************************************************/ -static struct ip_conntrack_helper ip_conntrack_helper_ras = { - .name = "RAS", - .me = THIS_MODULE, - .max_expected = 32, - .timeout = 240, - .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(RAS_PORT)}}}, - .dst = {.protonum = IPPROTO_UDP}}, - .mask = {.src = {.u = {0xFFFE}}, - .dst = {.protonum = 0xFF}}, - .help = ras_help, -}; - -/****************************************************************************/ -static void ip_conntrack_ras_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - write_lock_bh(&ip_conntrack_lock); - new->helper = &ip_conntrack_helper_ras; - write_unlock_bh(&ip_conntrack_lock); -} - -/****************************************************************************/ -/* Not __exit - called from init() */ -static void fini(void) -{ - ip_conntrack_helper_unregister(&ip_conntrack_helper_ras); - ip_conntrack_helper_unregister(&ip_conntrack_helper_q931); - kfree(h323_buffer); - DEBUGP("ip_ct_h323: fini\n"); -} - -/****************************************************************************/ -static int __init init(void) -{ - int ret; - - h323_buffer = kmalloc(65536, GFP_KERNEL); - if (!h323_buffer) - return -ENOMEM; - if ((ret = ip_conntrack_helper_register(&ip_conntrack_helper_q931)) || - (ret = ip_conntrack_helper_register(&ip_conntrack_helper_ras))) { - fini(); - return ret; - } - DEBUGP("ip_ct_h323: init success\n"); - return 0; -} - -/****************************************************************************/ -module_init(init); -module_exit(fini); - -EXPORT_SYMBOL_GPL(get_h225_addr); -EXPORT_SYMBOL_GPL(ip_conntrack_h245_expect); -EXPORT_SYMBOL_GPL(ip_conntrack_q931_expect); -EXPORT_SYMBOL_GPL(set_h245_addr_hook); -EXPORT_SYMBOL_GPL(set_h225_addr_hook); -EXPORT_SYMBOL_GPL(set_sig_addr_hook); -EXPORT_SYMBOL_GPL(set_ras_addr_hook); -EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); -EXPORT_SYMBOL_GPL(nat_t120_hook); -EXPORT_SYMBOL_GPL(nat_h245_hook); -EXPORT_SYMBOL_GPL(nat_callforwarding_hook); -EXPORT_SYMBOL_GPL(nat_q931_hook); - -MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); -MODULE_DESCRIPTION("H.323 connection tracking helper"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c deleted file mode 100644 index 2b760c5cf70..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ /dev/null @@ -1,684 +0,0 @@ -/* - * ip_conntrack_pptp.c - Version 3.0 - * - * Connection tracking support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * Limitations: - * - We blindly assume that control connections are always - * established in PNS->PAC direction. This is a violation - * of RFFC2673 - * - We can only support one single call within each session - * - * TODO: - * - testing of incoming PPTP calls - * - * Changes: - * 2002-02-05 - Version 1.3 - * - Call ip_conntrack_unexpect_related() from - * pptp_destroy_siblings() to destroy expectations in case - * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen - * (Philip Craig <philipc@snapgear.com>) - * - Add Version information at module loadtime - * 2002-02-10 - Version 1.6 - * - move to C99 style initializers - * - remove second expectation if first arrives - * 2004-10-22 - Version 2.0 - * - merge Mandrake's 2.6.x port with recent 2.6.x API changes - * - fix lots of linear skb assumptions from Mandrake's port - * 2005-06-10 - Version 2.1 - * - use ip_conntrack_expect_free() instead of kfree() on the - * expect's (which are from the slab for quite some time) - * 2005-06-10 - Version 3.0 - * - port helper to post-2.6.11 API changes, - * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/) - * 2005-07-30 - Version 3.1 - * - port helper to 2.6.13 API changes - * - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <net/checksum.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> -#include <linux/netfilter_ipv4/ip_conntrack_pptp.h> - -#define IP_CT_PPTP_VERSION "3.1" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); - -static DEFINE_SPINLOCK(ip_pptp_lock); - -int -(*ip_nat_pptp_hook_outbound)(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -int -(*ip_nat_pptp_hook_inbound)(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -void -(*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig, - struct ip_conntrack_expect *expect_reply); - -void -(*ip_nat_pptp_hook_expectfn)(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp); - -#if 0 -/* PptpControlMessageType names */ -const char *pptp_msg_name[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" -}; -EXPORT_SYMBOL(pptp_msg_name); -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) -#endif - -#define SECS *HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS - -#define PPTP_GRE_TIMEOUT (10 MINS) -#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS) - -static void pptp_expectfn(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp) -{ - typeof(ip_nat_pptp_hook_expectfn) ip_nat_pptp_expectfn; - - DEBUGP("increasing timeouts\n"); - - /* increase timeout of GRE data channel conntrack entry */ - ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; - ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT; - - /* Can you see how rusty this code is, compared with the pre-2.6.11 - * one? That's what happened to my shiny newnat of 2002 ;( -HW */ - - rcu_read_lock(); - ip_nat_pptp_expectfn = rcu_dereference(ip_nat_pptp_hook_expectfn); - if (!ip_nat_pptp_expectfn) { - struct ip_conntrack_tuple inv_t; - struct ip_conntrack_expect *exp_other; - - /* obviously this tuple inversion only works until you do NAT */ - invert_tuplepr(&inv_t, &exp->tuple); - DEBUGP("trying to unexpect other dir: "); - DUMP_TUPLE(&inv_t); - - exp_other = ip_conntrack_expect_find_get(&inv_t); - if (exp_other) { - /* delete other expectation. */ - DEBUGP("found\n"); - ip_conntrack_unexpect_related(exp_other); - ip_conntrack_expect_put(exp_other); - } else { - DEBUGP("not found\n"); - } - } else { - /* we need more than simple inversion */ - ip_nat_pptp_expectfn(ct, exp); - } - rcu_read_unlock(); -} - -static int destroy_sibling_or_exp(const struct ip_conntrack_tuple *t) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_expect *exp; - - DEBUGP("trying to timeout ct or exp for tuple "); - DUMP_TUPLE(t); - - h = ip_conntrack_find_get(t, NULL); - if (h) { - struct ip_conntrack *sibling = tuplehash_to_ctrack(h); - DEBUGP("setting timeout of conntrack %p to 0\n", sibling); - sibling->proto.gre.timeout = 0; - sibling->proto.gre.stream_timeout = 0; - if (del_timer(&sibling->timeout)) - sibling->timeout.function((unsigned long)sibling); - ip_conntrack_put(sibling); - return 1; - } else { - exp = ip_conntrack_expect_find_get(t); - if (exp) { - DEBUGP("unexpect_related of expect %p\n", exp); - ip_conntrack_unexpect_related(exp); - ip_conntrack_expect_put(exp); - return 1; - } - } - - return 0; -} - - -/* timeout GRE data connections */ -static void pptp_destroy_siblings(struct ip_conntrack *ct) -{ - struct ip_conntrack_tuple t; - - ip_ct_gre_keymap_destroy(ct); - /* Since ct->sibling_list has literally rusted away in 2.6.11, - * we now need another way to find out about our sibling - * contrack and expects... -HW */ - - /* try original (pns->pac) tuple */ - memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t)); - t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = ct->help.ct_pptp_info.pns_call_id; - t.dst.u.gre.key = ct->help.ct_pptp_info.pac_call_id; - - if (!destroy_sibling_or_exp(&t)) - DEBUGP("failed to timeout original pns->pac ct/exp\n"); - - /* try reply (pac->pns) tuple */ - memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t)); - t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = ct->help.ct_pptp_info.pac_call_id; - t.dst.u.gre.key = ct->help.ct_pptp_info.pns_call_id; - - if (!destroy_sibling_or_exp(&t)) - DEBUGP("failed to timeout reply pac->pns ct/exp\n"); -} - -/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ -static inline int -exp_gre(struct ip_conntrack *ct, - __be16 callid, - __be16 peer_callid) -{ - struct ip_conntrack_expect *exp_orig, *exp_reply; - int ret = 1; - typeof(ip_nat_pptp_hook_exp_gre) ip_nat_pptp_exp_gre; - - exp_orig = ip_conntrack_expect_alloc(ct); - if (exp_orig == NULL) - goto out; - - exp_reply = ip_conntrack_expect_alloc(ct); - if (exp_reply == NULL) - goto out_put_orig; - - /* original direction, PNS->PAC */ - exp_orig->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - exp_orig->tuple.src.u.gre.key = peer_callid; - exp_orig->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - exp_orig->tuple.dst.u.gre.key = callid; - exp_orig->tuple.dst.protonum = IPPROTO_GRE; - - exp_orig->mask.src.ip = htonl(0xffffffff); - exp_orig->mask.src.u.all = 0; - exp_orig->mask.dst.u.gre.key = htons(0xffff); - exp_orig->mask.dst.ip = htonl(0xffffffff); - exp_orig->mask.dst.protonum = 0xff; - - exp_orig->master = ct; - exp_orig->expectfn = pptp_expectfn; - exp_orig->flags = 0; - - /* both expectations are identical apart from tuple */ - memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); - - /* reply direction, PAC->PNS */ - exp_reply->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - exp_reply->tuple.src.u.gre.key = callid; - exp_reply->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - exp_reply->tuple.dst.u.gre.key = peer_callid; - exp_reply->tuple.dst.protonum = IPPROTO_GRE; - - ip_nat_pptp_exp_gre = rcu_dereference(ip_nat_pptp_hook_exp_gre); - if (ip_nat_pptp_exp_gre) - ip_nat_pptp_exp_gre(exp_orig, exp_reply); - if (ip_conntrack_expect_related(exp_orig) != 0) - goto out_put_both; - if (ip_conntrack_expect_related(exp_reply) != 0) - goto out_unexpect_orig; - - /* Add GRE keymap entries */ - if (ip_ct_gre_keymap_add(ct, &exp_orig->tuple, 0) != 0) - goto out_unexpect_both; - if (ip_ct_gre_keymap_add(ct, &exp_reply->tuple, 1) != 0) { - ip_ct_gre_keymap_destroy(ct); - goto out_unexpect_both; - } - ret = 0; - -out_put_both: - ip_conntrack_expect_put(exp_reply); -out_put_orig: - ip_conntrack_expect_put(exp_orig); -out: - return ret; - -out_unexpect_both: - ip_conntrack_unexpect_related(exp_reply); -out_unexpect_orig: - ip_conntrack_unexpect_related(exp_orig); - goto out_put_both; -} - -static inline int -pptp_inbound_pkt(struct sk_buff **pskb, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq, - unsigned int reqlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg; - __be16 cid = 0, pcid = 0; - typeof(ip_nat_pptp_hook_inbound) ip_nat_pptp_inbound; - - msg = ntohs(ctlh->messageType); - DEBUGP("inbound control message %s\n", pptp_msg_name[msg]); - - switch (msg) { - case PPTP_START_SESSION_REPLY: - /* server confirms new control session */ - if (info->sstate < PPTP_SESSION_REQUESTED) - goto invalid; - if (pptpReq->srep.resultCode == PPTP_START_OK) - info->sstate = PPTP_SESSION_CONFIRMED; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_STOP_SESSION_REPLY: - /* server confirms end of control session */ - if (info->sstate > PPTP_SESSION_STOPREQ) - goto invalid; - if (pptpReq->strep.resultCode == PPTP_STOP_OK) - info->sstate = PPTP_SESSION_NONE; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_OUT_CALL_REPLY: - /* server accepted call, we now expect GRE frames */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - if (info->cstate != PPTP_CALL_OUT_REQ && - info->cstate != PPTP_CALL_OUT_CONF) - goto invalid; - - cid = pptpReq->ocack.callID; - pcid = pptpReq->ocack.peersCallID; - if (info->pns_call_id != pcid) - goto invalid; - DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], - ntohs(cid), ntohs(pcid)); - - if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { - info->cstate = PPTP_CALL_OUT_CONF; - info->pac_call_id = cid; - exp_gre(ct, cid, pcid); - } else - info->cstate = PPTP_CALL_NONE; - break; - - case PPTP_IN_CALL_REQUEST: - /* server tells us about incoming call request */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - - cid = pptpReq->icreq.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); - info->cstate = PPTP_CALL_IN_REQ; - info->pac_call_id = cid; - break; - - case PPTP_IN_CALL_CONNECT: - /* server tells us about incoming call established */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - if (info->cstate != PPTP_CALL_IN_REP && - info->cstate != PPTP_CALL_IN_CONF) - goto invalid; - - pcid = pptpReq->iccon.peersCallID; - cid = info->pac_call_id; - - if (info->pns_call_id != pcid) - goto invalid; - - DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); - info->cstate = PPTP_CALL_IN_CONF; - - /* we expect a GRE connection from PAC to PNS */ - exp_gre(ct, cid, pcid); - break; - - case PPTP_CALL_DISCONNECT_NOTIFY: - /* server confirms disconnect */ - cid = pptpReq->disc.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); - info->cstate = PPTP_CALL_NONE; - - /* untrack this call id, unexpect GRE packets */ - pptp_destroy_siblings(ct); - break; - - case PPTP_WAN_ERROR_NOTIFY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - goto invalid; - } - - ip_nat_pptp_inbound = rcu_dereference(ip_nat_pptp_hook_inbound); - if (ip_nat_pptp_inbound) - return ip_nat_pptp_inbound(pskb, ct, ctinfo, ctlh, pptpReq); - return NF_ACCEPT; - -invalid: - DEBUGP("invalid %s: type=%d cid=%u pcid=%u " - "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], - msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, - ntohs(info->pns_call_id), ntohs(info->pac_call_id)); - return NF_ACCEPT; -} - -static inline int -pptp_outbound_pkt(struct sk_buff **pskb, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq, - unsigned int reqlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg; - __be16 cid = 0, pcid = 0; - typeof(ip_nat_pptp_hook_outbound) ip_nat_pptp_outbound; - - msg = ntohs(ctlh->messageType); - DEBUGP("outbound control message %s\n", pptp_msg_name[msg]); - - switch (msg) { - case PPTP_START_SESSION_REQUEST: - /* client requests for new control session */ - if (info->sstate != PPTP_SESSION_NONE) - goto invalid; - info->sstate = PPTP_SESSION_REQUESTED; - break; - case PPTP_STOP_SESSION_REQUEST: - /* client requests end of control session */ - info->sstate = PPTP_SESSION_STOPREQ; - break; - - case PPTP_OUT_CALL_REQUEST: - /* client initiating connection to server */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - info->cstate = PPTP_CALL_OUT_REQ; - /* track PNS call id */ - cid = pptpReq->ocreq.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); - info->pns_call_id = cid; - break; - case PPTP_IN_CALL_REPLY: - /* client answers incoming call */ - if (info->cstate != PPTP_CALL_IN_REQ && - info->cstate != PPTP_CALL_IN_REP) - goto invalid; - - cid = pptpReq->icack.callID; - pcid = pptpReq->icack.peersCallID; - if (info->pac_call_id != pcid) - goto invalid; - DEBUGP("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], - ntohs(cid), ntohs(pcid)); - - if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { - /* part two of the three-way handshake */ - info->cstate = PPTP_CALL_IN_REP; - info->pns_call_id = cid; - } else - info->cstate = PPTP_CALL_NONE; - break; - - case PPTP_CALL_CLEAR_REQUEST: - /* client requests hangup of call */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - /* FUTURE: iterate over all calls and check if - * call ID is valid. We don't do this without newnat, - * because we only know about last call */ - info->cstate = PPTP_CALL_CLEAR_REQ; - break; - case PPTP_SET_LINK_INFO: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - goto invalid; - } - - ip_nat_pptp_outbound = rcu_dereference(ip_nat_pptp_hook_outbound); - if (ip_nat_pptp_outbound) - return ip_nat_pptp_outbound(pskb, ct, ctinfo, ctlh, pptpReq); - return NF_ACCEPT; - -invalid: - DEBUGP("invalid %s: type=%d cid=%u pcid=%u " - "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], - msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, - ntohs(info->pns_call_id), ntohs(info->pac_call_id)); - return NF_ACCEPT; -} - -static const unsigned int pptp_msg_size[] = { - [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest), - [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply), - [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest), - [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply), - [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest), - [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply), - [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest), - [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply), - [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected), - [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest), - [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify), - [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify), - [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo), -}; - -/* track caller id inside control connection, call expect_related */ -static int -conntrack_pptp_help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) - -{ - int dir = CTINFO2DIR(ctinfo); - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - struct tcphdr _tcph, *tcph; - struct pptp_pkt_hdr _pptph, *pptph; - struct PptpControlHeader _ctlh, *ctlh; - union pptp_ctrl_union _pptpReq, *pptpReq; - unsigned int tcplen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4; - unsigned int datalen, reqlen, nexthdr_off; - int oldsstate, oldcstate; - int ret; - u_int16_t msg; - - /* don't do any tracking before tcp handshake complete */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ctinfo = %u, skipping\n", ctinfo); - return NF_ACCEPT; - } - - nexthdr_off = (*pskb)->nh.iph->ihl*4; - tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph); - BUG_ON(!tcph); - nexthdr_off += tcph->doff * 4; - datalen = tcplen - tcph->doff * 4; - - pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph); - if (!pptph) { - DEBUGP("no full PPTP header, can't track\n"); - return NF_ACCEPT; - } - nexthdr_off += sizeof(_pptph); - datalen -= sizeof(_pptph); - - /* if it's not a control message we can't do anything with it */ - if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a control packet\n"); - return NF_ACCEPT; - } - - ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh); - if (!ctlh) - return NF_ACCEPT; - nexthdr_off += sizeof(_ctlh); - datalen -= sizeof(_ctlh); - - reqlen = datalen; - msg = ntohs(ctlh->messageType); - if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg]) - return NF_ACCEPT; - if (reqlen > sizeof(*pptpReq)) - reqlen = sizeof(*pptpReq); - - pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq); - if (!pptpReq) - return NF_ACCEPT; - - oldsstate = info->sstate; - oldcstate = info->cstate; - - spin_lock_bh(&ip_pptp_lock); - - /* FIXME: We just blindly assume that the control connection is always - * established from PNS->PAC. However, RFC makes no guarantee */ - if (dir == IP_CT_DIR_ORIGINAL) - /* client -> server (PNS -> PAC) */ - ret = pptp_outbound_pkt(pskb, ctlh, pptpReq, reqlen, ct, - ctinfo); - else - /* server -> client (PAC -> PNS) */ - ret = pptp_inbound_pkt(pskb, ctlh, pptpReq, reqlen, ct, - ctinfo); - DEBUGP("sstate: %d->%d, cstate: %d->%d\n", - oldsstate, info->sstate, oldcstate, info->cstate); - spin_unlock_bh(&ip_pptp_lock); - - return ret; -} - -/* control protocol helper */ -static struct ip_conntrack_helper pptp = { - .list = { NULL, NULL }, - .name = "pptp", - .me = THIS_MODULE, - .max_expected = 2, - .timeout = 5 * 60, - .tuple = { .src = { .ip = 0, - .u = { .tcp = { .port = - __constant_htons(PPTP_CONTROL_PORT) } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = IPPROTO_TCP - } - }, - .mask = { .src = { .ip = 0, - .u = { .tcp = { .port = __constant_htons(0xffff) } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = 0xff - } - }, - .help = conntrack_pptp_help, - .destroy = pptp_destroy_siblings, -}; - -extern void ip_ct_proto_gre_fini(void); -extern int __init ip_ct_proto_gre_init(void); - -/* ip_conntrack_pptp initialization */ -static int __init ip_conntrack_helper_pptp_init(void) -{ - int retcode; - - retcode = ip_ct_proto_gre_init(); - if (retcode < 0) - return retcode; - - DEBUGP(" registering helper\n"); - if ((retcode = ip_conntrack_helper_register(&pptp))) { - printk(KERN_ERR "Unable to register conntrack application " - "helper for pptp: %d\n", retcode); - ip_ct_proto_gre_fini(); - return retcode; - } - - printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION); - return 0; -} - -static void __exit ip_conntrack_helper_pptp_fini(void) -{ - ip_conntrack_helper_unregister(&pptp); - ip_ct_proto_gre_fini(); - printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION); -} - -module_init(ip_conntrack_helper_pptp_init); -module_exit(ip_conntrack_helper_pptp_fini); - -EXPORT_SYMBOL(ip_nat_pptp_hook_outbound); -EXPORT_SYMBOL(ip_nat_pptp_hook_inbound); -EXPORT_SYMBOL(ip_nat_pptp_hook_exp_gre); -EXPORT_SYMBOL(ip_nat_pptp_hook_expectfn); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c deleted file mode 100644 index 053e591f407..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ /dev/null @@ -1,314 +0,0 @@ -/* IRC extension for IP connection tracking, Version 1.21 - * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> - * based on RR's ip_conntrack_ftp.c - * - * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - ** - * Module load syntax: - * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS> - * max_dcc_channels=n dcc_timeout=secs - * - * please give the ports of all IRC servers You wish to connect to. - * If You don't specify ports, the default will be port 6667. - * With max_dcc_channels you can define the maximum number of not - * yet answered DCC channels per IRC session (default 8). - * With dcc_timeout you can specify how long the system waits for - * an expected DCC channel (default 300 seconds). - * - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <net/checksum.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_irc.h> -#include <linux/moduleparam.h> - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -static unsigned int max_dcc_channels = 8; -static unsigned int dcc_timeout = 300; -/* This is slow, but it's simple. --RR */ -static char *irc_buffer; -static DEFINE_SPINLOCK(irc_buffer_lock); - -unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp); -EXPORT_SYMBOL_GPL(ip_nat_irc_hook); - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); -MODULE_LICENSE("GPL"); -module_param_array(ports, ushort, &ports_c, 0400); -MODULE_PARM_DESC(ports, "port numbers of IRC servers"); -module_param(max_dcc_channels, uint, 0400); -MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); -module_param(dcc_timeout, uint, 0400); -MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); - -static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; -#define MINMATCHLEN 5 - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \ - __FILE__, __FUNCTION__ , ## args) -#else -#define DEBUGP(format, args...) -#endif - -static int parse_dcc(char *data, char *data_end, u_int32_t *ip, - u_int16_t *port, char **ad_beg_p, char **ad_end_p) -/* tries to get the ip_addr and port out of a dcc command - return value: -1 on failure, 0 on success - data pointer to first byte of DCC command data - data_end pointer to last byte of dcc command data - ip returns parsed ip of dcc command - port returns parsed port of dcc command - ad_beg_p returns pointer to first byte of addr data - ad_end_p returns pointer to last byte of addr data */ -{ - - /* at least 12: "AAAAAAAA P\1\n" */ - while (*data++ != ' ') - if (data > data_end - 12) - return -1; - - *ad_beg_p = data; - *ip = simple_strtoul(data, &data, 10); - - /* skip blanks between ip and port */ - while (*data == ' ') { - if (data >= data_end) - return -1; - data++; - } - - *port = simple_strtoul(data, &data, 10); - *ad_end_p = data; - - return 0; -} - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) -{ - unsigned int dataoff; - struct tcphdr _tcph, *th; - char *data, *data_limit, *ib_ptr; - int dir = CTINFO2DIR(ctinfo); - struct ip_conntrack_expect *exp; - u32 seq; - u_int32_t dcc_ip; - u_int16_t dcc_port; - int i, ret = NF_ACCEPT; - char *addr_beg_p, *addr_end_p; - typeof(ip_nat_irc_hook) ip_nat_irc; - - DEBUGP("entered\n"); - - /* If packet is coming from IRC server */ - if (dir == IP_CT_DIR_REPLY) - return NF_ACCEPT; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { - DEBUGP("Conntrackinfo = %u\n", ctinfo); - return NF_ACCEPT; - } - - /* Not a full tcp header? */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return NF_ACCEPT; - - /* No data? */ - dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; - if (dataoff >= (*pskb)->len) - return NF_ACCEPT; - - spin_lock_bh(&irc_buffer_lock); - ib_ptr = skb_header_pointer(*pskb, dataoff, - (*pskb)->len - dataoff, irc_buffer); - BUG_ON(ib_ptr == NULL); - - data = ib_ptr; - data_limit = ib_ptr + (*pskb)->len - dataoff; - - /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 - * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ - while (data < (data_limit - (19 + MINMATCHLEN))) { - if (memcmp(data, "\1DCC ", 5)) { - data++; - continue; - } - - data += 5; - /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ - - DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest)); - - for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { - if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { - /* no match */ - continue; - } - - DEBUGP("DCC %s detected\n", dccprotos[i]); - data += strlen(dccprotos[i]); - /* we have at least - * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid - * data left (== 14/13 bytes) */ - if (parse_dcc((char *)data, data_limit, &dcc_ip, - &dcc_port, &addr_beg_p, &addr_end_p)) { - /* unable to parse */ - DEBUGP("unable to parse dcc command\n"); - continue; - } - DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n", - HIPQUAD(dcc_ip), dcc_port); - - /* dcc_ip can be the internal OR external (NAT'ed) IP - * Tiago Sousa <mirage@kaotik.org> */ - if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip) - && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) { - if (net_ratelimit()) - printk(KERN_WARNING - "Forged DCC command from " - "%u.%u.%u.%u: %u.%u.%u.%u:%u\n", - NIPQUAD(ct->tuplehash[dir].tuple.src.ip), - HIPQUAD(dcc_ip), dcc_port); - - continue; - } - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) { - ret = NF_DROP; - goto out; - } - - /* save position of address in dcc string, - * necessary for NAT */ - DEBUGP("tcph->seq = %u\n", th->seq); - seq = ntohl(th->seq) + (addr_beg_p - ib_ptr); - - /* We refer to the reverse direction ("!dir") - * tuples here, because we're expecting - * something in the other * direction. - * Doesn't matter unless NAT is happening. */ - exp->tuple = ((struct ip_conntrack_tuple) - { { 0, { 0 } }, - { ct->tuplehash[!dir].tuple.dst.ip, - { .tcp = { htons(dcc_port) } }, - IPPROTO_TCP }}); - exp->mask = ((struct ip_conntrack_tuple) - { { 0, { 0 } }, - { htonl(0xFFFFFFFF), - { .tcp = { htons(0xFFFF) } }, 0xFF }}); - exp->expectfn = NULL; - exp->flags = 0; - ip_nat_irc = rcu_dereference(ip_nat_irc_hook); - if (ip_nat_irc) - ret = ip_nat_irc(pskb, ctinfo, - addr_beg_p - ib_ptr, - addr_end_p - addr_beg_p, - exp); - else if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - ip_conntrack_expect_put(exp); - goto out; - } /* for .. NUM_DCCPROTO */ - } /* while data < ... */ - - out: - spin_unlock_bh(&irc_buffer_lock); - return ret; -} - -static struct ip_conntrack_helper irc_helpers[MAX_PORTS]; -static char irc_names[MAX_PORTS][sizeof("irc-65535")]; - -static void ip_conntrack_irc_fini(void); - -static int __init ip_conntrack_irc_init(void) -{ - int i, ret; - struct ip_conntrack_helper *hlpr; - char *tmpname; - - if (max_dcc_channels < 1) { - printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n"); - return -EBUSY; - } - - irc_buffer = kmalloc(65536, GFP_KERNEL); - if (!irc_buffer) - return -ENOMEM; - - /* If no port given, default to standard irc port */ - if (ports_c == 0) - ports[ports_c++] = IRC_PORT; - - for (i = 0; i < ports_c; i++) { - hlpr = &irc_helpers[i]; - hlpr->tuple.src.u.tcp.port = htons(ports[i]); - hlpr->tuple.dst.protonum = IPPROTO_TCP; - hlpr->mask.src.u.tcp.port = htons(0xFFFF); - hlpr->mask.dst.protonum = 0xFF; - hlpr->max_expected = max_dcc_channels; - hlpr->timeout = dcc_timeout; - hlpr->me = THIS_MODULE; - hlpr->help = help; - - tmpname = &irc_names[i][0]; - if (ports[i] == IRC_PORT) - sprintf(tmpname, "irc"); - else - sprintf(tmpname, "irc-%d", i); - hlpr->name = tmpname; - - DEBUGP("port #%d: %d\n", i, ports[i]); - - ret = ip_conntrack_helper_register(hlpr); - - if (ret) { - printk("ip_conntrack_irc: ERROR registering port %d\n", - ports[i]); - ip_conntrack_irc_fini(); - return -EBUSY; - } - } - return 0; -} - -/* This function is intentionally _NOT_ defined as __exit, because - * it is needed by the init function */ -static void ip_conntrack_irc_fini(void) -{ - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("unregistering port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&irc_helpers[i]); - } - kfree(irc_buffer); -} - -module_init(ip_conntrack_irc_init); -module_exit(ip_conntrack_irc_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c deleted file mode 100644 index cc6dd49c9da..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * NetBIOS name service broadcast connection tracking helper - * - * (c) 2005 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -/* - * This helper tracks locally originating NetBIOS name service - * requests by issuing permanent expectations (valid until - * timing out) matching all reply connections from the - * destination network. The only NetBIOS specific thing is - * actually the port number. - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_addr.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#define NMBD_PORT 137 - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); -MODULE_LICENSE("GPL"); - -static unsigned int timeout = 3; -module_param(timeout, uint, 0400); -MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) -{ - struct ip_conntrack_expect *exp; - struct iphdr *iph = (*pskb)->nh.iph; - struct rtable *rt = (struct rtable *)(*pskb)->dst; - struct in_device *in_dev; - __be32 mask = 0; - - /* we're only interested in locally generated packets */ - if ((*pskb)->sk == NULL) - goto out; - if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) - goto out; - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - goto out; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); - if (in_dev != NULL) { - for_primary_ifa(in_dev) { - if (ifa->ifa_broadcast == iph->daddr) { - mask = ifa->ifa_mask; - break; - } - } endfor_ifa(in_dev); - } - rcu_read_unlock(); - - if (mask == 0) - goto out; - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) - goto out; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->tuple.src.u.udp.port = htons(NMBD_PORT); - - exp->mask.src.ip = mask; - exp->mask.src.u.udp.port = htons(0xFFFF); - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.udp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - - exp->expectfn = NULL; - exp->flags = IP_CT_EXPECT_PERMANENT; - - ip_conntrack_expect_related(exp); - ip_conntrack_expect_put(exp); - - ip_ct_refresh(ct, *pskb, timeout * HZ); -out: - return NF_ACCEPT; -} - -static struct ip_conntrack_helper helper = { - .name = "netbios-ns", - .tuple = { - .src = { - .u = { - .udp = { - .port = __constant_htons(NMBD_PORT), - } - } - }, - .dst = { - .protonum = IPPROTO_UDP, - }, - }, - .mask = { - .src = { - .u = { - .udp = { - .port = __constant_htons(0xFFFF), - } - } - }, - .dst = { - .protonum = 0xFF, - }, - }, - .max_expected = 1, - .me = THIS_MODULE, - .help = help, -}; - -static int __init ip_conntrack_netbios_ns_init(void) -{ - helper.timeout = timeout; - return ip_conntrack_helper_register(&helper); -} - -static void __exit ip_conntrack_netbios_ns_fini(void) -{ - ip_conntrack_helper_unregister(&helper); -} - -module_init(ip_conntrack_netbios_ns_init); -module_exit(ip_conntrack_netbios_ns_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c deleted file mode 100644 index 9228b76ccd9..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ /dev/null @@ -1,1577 +0,0 @@ -/* Connection tracking via netlink socket. Allows for user space - * protocol helpers and general trouble making from userspace. - * - * (C) 2001 by Jay Schulist <jschlst@samba.org> - * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> - * (C) 2003 by Patrick Mchardy <kaber@trash.net> - * (C) 2005-2006 by Pablo Neira Ayuso <pablo@eurodev.net> - * - * I've reworked this stuff to use attributes instead of conntrack - * structures. 5.44 am. I need more tea. --pablo 05/07/11. - * - * Initial connection tracking via netlink development funded and - * generally made possible by Network Robots, Inc. (www.networkrobots.com) - * - * Further development of this code funded by Astaro AG (http://www.astaro.com) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <linux/netlink.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/notifier.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -MODULE_LICENSE("GPL"); - -static char __initdata version[] = "0.90"; - -static inline int -ctnetlink_dump_tuples_proto(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *proto) -{ - int ret = 0; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); - - NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); - - if (likely(proto->tuple_to_nfattr)) - ret = proto->tuple_to_nfattr(skb, tuple); - - NFA_NEST_END(skb, nest_parms); - - return ret; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple) -{ - struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); - - NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(__be32), &tuple->src.ip); - NFA_PUT(skb, CTA_IP_V4_DST, sizeof(__be32), &tuple->dst.ip); - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_tuples(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple) -{ - int ret; - struct ip_conntrack_protocol *proto; - - ret = ctnetlink_dump_tuples_ip(skb, tuple); - if (unlikely(ret < 0)) - return ret; - - proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, tuple, proto); - ip_conntrack_proto_put(proto); - - return ret; -} - -static inline int -ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 status = htonl((u_int32_t) ct->status); - NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - long timeout_l = ct->timeout.expires - jiffies; - __be32 timeout; - - if (timeout_l < 0) - timeout = 0; - else - timeout = htonl(timeout_l / HZ); - - NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - - struct nfattr *nest_proto; - int ret; - - if (!proto->to_nfattr) { - ip_conntrack_proto_put(proto); - return 0; - } - - nest_proto = NFA_NEST(skb, CTA_PROTOINFO); - - ret = proto->to_nfattr(skb, nest_proto, ct); - - ip_conntrack_proto_put(proto); - - NFA_NEST_END(skb, nest_proto); - - return ret; - -nfattr_failure: - ip_conntrack_proto_put(proto); - return -1; -} - -static inline int -ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - struct nfattr *nest_helper; - - if (!ct->helper) - return 0; - - nest_helper = NFA_NEST(skb, CTA_HELP); - NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name); - - if (ct->helper->to_nfattr) - ct->helper->to_nfattr(skb, ct); - - NFA_NEST_END(skb, nest_helper); - - return 0; - -nfattr_failure: - return -1; -} - -#ifdef CONFIG_IP_NF_CT_ACCT -static inline int -ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, - enum ip_conntrack_dir dir) -{ - enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; - struct nfattr *nest_count = NFA_NEST(skb, type); - __be32 tmp; - - tmp = htonl(ct->counters[dir].packets); - NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(__be32), &tmp); - - tmp = htonl(ct->counters[dir].bytes); - NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(__be32), &tmp); - - NFA_NEST_END(skb, nest_count); - - return 0; - -nfattr_failure: - return -1; -} -#else -#define ctnetlink_dump_counters(a, b, c) (0) -#endif - -#ifdef CONFIG_IP_NF_CONNTRACK_MARK -static inline int -ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 mark = htonl(ct->mark); - - NFA_PUT(skb, CTA_MARK, sizeof(__be32), &mark); - return 0; - -nfattr_failure: - return -1; -} -#else -#define ctnetlink_dump_mark(a, b) (0) -#endif - -static inline int -ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 id = htonl(ct->id); - NFA_PUT(skb, CTA_ID, sizeof(__be32), &id); - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 use = htonl(atomic_read(&ct->ct_general.use)); - - NFA_PUT(skb, CTA_USE, sizeof(__be32), &use); - return 0; - -nfattr_failure: - return -1; -} - -#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) - -static int -ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, - int event, int nowait, - const struct ip_conntrack *ct) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct nfattr *nest_parms; - unsigned char *b; - - b = skb->tail; - - event |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0) - goto nfattr_failure; - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -nlmsg_failure: -nfattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -static int ctnetlink_conntrack_event(struct notifier_block *this, - unsigned long events, void *ptr) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct nfattr *nest_parms; - struct ip_conntrack *ct = (struct ip_conntrack *)ptr; - struct sk_buff *skb; - unsigned int type; - unsigned char *b; - unsigned int flags = 0, group; - - /* ignore our fake conntrack entry */ - if (ct == &ip_conntrack_untracked) - return NOTIFY_DONE; - - if (events & IPCT_DESTROY) { - type = IPCTNL_MSG_CT_DELETE; - group = NFNLGRP_CONNTRACK_DESTROY; - } else if (events & (IPCT_NEW | IPCT_RELATED)) { - type = IPCTNL_MSG_CT_NEW; - flags = NLM_F_CREATE|NLM_F_EXCL; - group = NFNLGRP_CONNTRACK_NEW; - } else if (events & (IPCT_STATUS | IPCT_PROTOINFO)) { - type = IPCTNL_MSG_CT_NEW; - group = NFNLGRP_CONNTRACK_UPDATE; - } else - return NOTIFY_DONE; - - if (!nfnetlink_has_listeners(group)) - return NOTIFY_DONE; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb) - return NOTIFY_DONE; - - b = skb->tail; - - type |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = flags; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - if (events & IPCT_DESTROY) { - if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) - goto nfattr_failure; - } else { - if (ctnetlink_dump_status(skb, ct) < 0) - goto nfattr_failure; - - if (ctnetlink_dump_timeout(skb, ct) < 0) - goto nfattr_failure; - - if (events & IPCT_PROTOINFO - && ctnetlink_dump_protoinfo(skb, ct) < 0) - goto nfattr_failure; - - if ((events & IPCT_HELPER || ct->helper) - && ctnetlink_dump_helpinfo(skb, ct) < 0) - goto nfattr_failure; - -#ifdef CONFIG_IP_NF_CONNTRACK_MARK - if ((events & IPCT_MARK || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) - goto nfattr_failure; -#endif - - if (events & IPCT_COUNTER_FILLING && - (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)) - goto nfattr_failure; - } - - nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, group, 0); - return NOTIFY_DONE; - -nlmsg_failure: -nfattr_failure: - kfree_skb(skb); - return NOTIFY_DONE; -} -#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ - -static int ctnetlink_done(struct netlink_callback *cb) -{ - if (cb->args[1]) - ip_conntrack_put((struct ip_conntrack *)cb->args[1]); - return 0; -} - -static int -ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ip_conntrack *ct, *last; - struct ip_conntrack_tuple_hash *h; - struct list_head *i; - - read_lock_bh(&ip_conntrack_lock); - last = (struct ip_conntrack *)cb->args[1]; - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { -restart: - list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { - h = (struct ip_conntrack_tuple_hash *) i; - if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; - ct = tuplehash_to_ctrack(h); - if (cb->args[1]) { - if (ct != last) - continue; - cb->args[1] = 0; - } - if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - IPCTNL_MSG_CT_NEW, - 1, ct) < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; - goto out; - } -#ifdef CONFIG_NF_CT_ACCT - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == - IPCTNL_MSG_CT_GET_CTRZERO) - memset(&ct->counters, 0, sizeof(ct->counters)); -#endif - } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; - } - } -out: - read_unlock_bh(&ip_conntrack_lock); - if (last) - ip_conntrack_put(last); - - return skb->len; -} - -static const size_t cta_min_ip[CTA_IP_MAX] = { - [CTA_IP_V4_SRC-1] = sizeof(__be32), - [CTA_IP_V4_DST-1] = sizeof(__be32), -}; - -static inline int -ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) -{ - struct nfattr *tb[CTA_IP_MAX]; - - nfattr_parse_nested(tb, CTA_IP_MAX, attr); - - if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) - return -EINVAL; - - if (!tb[CTA_IP_V4_SRC-1]) - return -EINVAL; - tuple->src.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); - - if (!tb[CTA_IP_V4_DST-1]) - return -EINVAL; - tuple->dst.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]); - - return 0; -} - -static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int8_t), - [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), - [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), - [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), -}; - -static inline int -ctnetlink_parse_tuple_proto(struct nfattr *attr, - struct ip_conntrack_tuple *tuple) -{ - struct nfattr *tb[CTA_PROTO_MAX]; - struct ip_conntrack_protocol *proto; - int ret = 0; - - nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) - return -EINVAL; - - if (!tb[CTA_PROTO_NUM-1]) - return -EINVAL; - tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); - - proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - - if (likely(proto->nfattr_to_tuple)) - ret = proto->nfattr_to_tuple(tb, tuple); - - ip_conntrack_proto_put(proto); - - return ret; -} - -static inline int -ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, - enum ctattr_tuple type) -{ - struct nfattr *tb[CTA_TUPLE_MAX]; - int err; - - memset(tuple, 0, sizeof(*tuple)); - - nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); - - if (!tb[CTA_TUPLE_IP-1]) - return -EINVAL; - - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); - if (err < 0) - return err; - - if (!tb[CTA_TUPLE_PROTO-1]) - return -EINVAL; - - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); - if (err < 0) - return err; - - /* orig and expect tuples get DIR_ORIGINAL */ - if (type == CTA_TUPLE_REPLY) - tuple->dst.dir = IP_CT_DIR_REPLY; - else - tuple->dst.dir = IP_CT_DIR_ORIGINAL; - - return 0; -} - -#ifdef CONFIG_IP_NF_NAT_NEEDED -static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { - [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), - [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), -}; - -static int ctnetlink_parse_nat_proto(struct nfattr *attr, - const struct ip_conntrack *ct, - struct ip_nat_range *range) -{ - struct nfattr *tb[CTA_PROTONAT_MAX]; - struct ip_nat_protocol *npt; - - nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - return -EINVAL; - - npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - - if (!npt->nfattr_to_range) { - ip_nat_proto_put(npt); - return 0; - } - - /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ - if (npt->nfattr_to_range(tb, range) > 0) - range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - - ip_nat_proto_put(npt); - - return 0; -} - -static const size_t cta_min_nat[CTA_NAT_MAX] = { - [CTA_NAT_MINIP-1] = sizeof(__be32), - [CTA_NAT_MAXIP-1] = sizeof(__be32), -}; - -static inline int -ctnetlink_parse_nat(struct nfattr *nat, - const struct ip_conntrack *ct, struct ip_nat_range *range) -{ - struct nfattr *tb[CTA_NAT_MAX]; - int err; - - memset(range, 0, sizeof(*range)); - - nfattr_parse_nested(tb, CTA_NAT_MAX, nat); - - if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) - return -EINVAL; - - if (tb[CTA_NAT_MINIP-1]) - range->min_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MINIP-1]); - - if (!tb[CTA_NAT_MAXIP-1]) - range->max_ip = range->min_ip; - else - range->max_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); - - if (range->min_ip) - range->flags |= IP_NAT_RANGE_MAP_IPS; - - if (!tb[CTA_NAT_PROTO-1]) - return 0; - - err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); - if (err < 0) - return err; - - return 0; -} -#endif - -static inline int -ctnetlink_parse_help(struct nfattr *attr, char **helper_name) -{ - struct nfattr *tb[CTA_HELP_MAX]; - - nfattr_parse_nested(tb, CTA_HELP_MAX, attr); - - if (!tb[CTA_HELP_NAME-1]) - return -EINVAL; - - *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); - - return 0; -} - -static const size_t cta_min[CTA_MAX] = { - [CTA_STATUS-1] = sizeof(__be32), - [CTA_TIMEOUT-1] = sizeof(__be32), - [CTA_MARK-1] = sizeof(__be32), - [CTA_USE-1] = sizeof(__be32), - [CTA_ID-1] = sizeof(__be32) -}; - -static int -ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - struct ip_conntrack *ct; - int err = 0; - - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; - - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); - else { - /* Flush the whole table */ - ip_conntrack_flush(); - return 0; - } - - if (err < 0) - return err; - - h = ip_conntrack_find_get(&tuple, NULL); - if (!h) - return -ENOENT; - - ct = tuplehash_to_ctrack(h); - - if (cda[CTA_ID-1]) { - u_int32_t id = ntohl(*(__be32 *)NFA_DATA(cda[CTA_ID-1])); - if (ct->id != id) { - ip_conntrack_put(ct); - return -ENOENT; - } - } - if (del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - - ip_conntrack_put(ct); - - return 0; -} - -static int -ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - struct ip_conntrack *ct; - struct sk_buff *skb2 = NULL; - int err = 0; - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct nfgenmsg *msg = NLMSG_DATA(nlh); - u32 rlen; - - if (msg->nfgen_family != AF_INET) - return -EAFNOSUPPORT; - -#ifndef CONFIG_IP_NF_CT_ACCT - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO) - return -ENOTSUPP; -#endif - if ((*errp = netlink_dump_start(ctnl, skb, nlh, - ctnetlink_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); - return 0; - } - - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; - - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); - else - return -EINVAL; - - if (err < 0) - return err; - - h = ip_conntrack_find_get(&tuple, NULL); - if (!h) - return -ENOENT; - - ct = tuplehash_to_ctrack(h); - - err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb2) { - ip_conntrack_put(ct); - return -ENOMEM; - } - - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, - IPCTNL_MSG_CT_NEW, 1, ct); - ip_conntrack_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; - -free: - kfree_skb(skb2); -out: - return err; -} - -static inline int -ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - unsigned long d; - unsigned status = ntohl(*(__be32 *)NFA_DATA(cda[CTA_STATUS-1])); - d = ct->status ^ status; - - if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) - /* unchangeable */ - return -EINVAL; - - if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) - /* SEEN_REPLY bit can only be set */ - return -EINVAL; - - - if (d & IPS_ASSURED && !(status & IPS_ASSURED)) - /* ASSURED bit can only be set */ - return -EINVAL; - - if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { -#ifndef CONFIG_IP_NF_NAT_NEEDED - return -EINVAL; -#else - struct ip_nat_range range; - - if (cda[CTA_NAT_DST-1]) { - if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, - &range) < 0) - return -EINVAL; - if (ip_nat_initialized(ct, - HOOK2MANIP(NF_IP_PRE_ROUTING))) - return -EEXIST; - ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); - } - if (cda[CTA_NAT_SRC-1]) { - if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, - &range) < 0) - return -EINVAL; - if (ip_nat_initialized(ct, - HOOK2MANIP(NF_IP_POST_ROUTING))) - return -EEXIST; - ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - } -#endif - } - - /* Be careful here, modifying NAT bits can screw up things, - * so don't let users modify them directly if they don't pass - * ip_nat_range. */ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); - return 0; -} - - -static inline int -ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - struct ip_conntrack_helper *helper; - char *helpname; - int err; - - /* don't change helper of sibling connections */ - if (ct->master) - return -EINVAL; - - err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); - if (err < 0) - return err; - - helper = __ip_conntrack_helper_find_byname(helpname); - if (!helper) { - if (!strcmp(helpname, "")) - helper = NULL; - else - return -EINVAL; - } - - if (ct->helper) { - if (!helper) { - /* we had a helper before ... */ - ip_ct_remove_expectations(ct); - ct->helper = NULL; - } else { - /* need to zero data of old helper */ - memset(&ct->help, 0, sizeof(ct->help)); - } - } - - ct->helper = helper; - - return 0; -} - -static inline int -ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - u_int32_t timeout = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1])); - - if (!del_timer(&ct->timeout)) - return -ETIME; - - ct->timeout.expires = jiffies + timeout * HZ; - add_timer(&ct->timeout); - - return 0; -} - -static inline int -ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1]; - struct ip_conntrack_protocol *proto; - u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - int err = 0; - - nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); - - proto = ip_conntrack_proto_find_get(npt); - - if (proto->from_nfattr) - err = proto->from_nfattr(tb, ct); - ip_conntrack_proto_put(proto); - - return err; -} - -static int -ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - int err; - - if (cda[CTA_HELP-1]) { - err = ctnetlink_change_helper(ct, cda); - if (err < 0) - return err; - } - - if (cda[CTA_TIMEOUT-1]) { - err = ctnetlink_change_timeout(ct, cda); - if (err < 0) - return err; - } - - if (cda[CTA_STATUS-1]) { - err = ctnetlink_change_status(ct, cda); - if (err < 0) - return err; - } - - if (cda[CTA_PROTOINFO-1]) { - err = ctnetlink_change_protoinfo(ct, cda); - if (err < 0) - return err; - } - -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1])); -#endif - - return 0; -} - -static int -ctnetlink_create_conntrack(struct nfattr *cda[], - struct ip_conntrack_tuple *otuple, - struct ip_conntrack_tuple *rtuple) -{ - struct ip_conntrack *ct; - int err = -EINVAL; - - ct = ip_conntrack_alloc(otuple, rtuple); - if (ct == NULL || IS_ERR(ct)) - return -ENOMEM; - - if (!cda[CTA_TIMEOUT-1]) - goto err; - ct->timeout.expires = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1])); - - ct->timeout.expires = jiffies + ct->timeout.expires * HZ; - ct->status |= IPS_CONFIRMED; - - if (cda[CTA_STATUS-1]) { - err = ctnetlink_change_status(ct, cda); - if (err < 0) - goto err; - } - - if (cda[CTA_PROTOINFO-1]) { - err = ctnetlink_change_protoinfo(ct, cda); - if (err < 0) - goto err; - } - -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1])); -#endif - - ct->helper = ip_conntrack_helper_find_get(rtuple); - - add_timer(&ct->timeout); - ip_conntrack_hash_insert(ct); - - if (ct->helper) - ip_conntrack_helper_put(ct->helper); - - return 0; - -err: - ip_conntrack_free(ct); - return err; -} - -static int -ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple otuple, rtuple; - struct ip_conntrack_tuple_hash *h = NULL; - int err = 0; - - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; - - if (cda[CTA_TUPLE_ORIG-1]) { - err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); - if (err < 0) - return err; - } - - if (cda[CTA_TUPLE_REPLY-1]) { - err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); - if (err < 0) - return err; - } - - write_lock_bh(&ip_conntrack_lock); - if (cda[CTA_TUPLE_ORIG-1]) - h = __ip_conntrack_find(&otuple, NULL); - else if (cda[CTA_TUPLE_REPLY-1]) - h = __ip_conntrack_find(&rtuple, NULL); - - if (h == NULL) { - write_unlock_bh(&ip_conntrack_lock); - err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); - return err; - } - /* implicit 'else' */ - - /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { - err = -EINVAL; - goto out_unlock; - } - - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ - err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) - err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); - -out_unlock: - write_unlock_bh(&ip_conntrack_lock); - return err; -} - -/*********************************************************************** - * EXPECT - ***********************************************************************/ - -static inline int -ctnetlink_exp_dump_tuple(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple, - enum ctattr_expect type) -{ - struct nfattr *nest_parms = NFA_NEST(skb, type); - - if (ctnetlink_dump_tuples(skb, tuple) < 0) - goto nfattr_failure; - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_exp_dump_mask(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *mask) -{ - int ret; - struct ip_conntrack_protocol *proto; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT_MASK); - - ret = ctnetlink_dump_tuples_ip(skb, mask); - if (unlikely(ret < 0)) - goto nfattr_failure; - - proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, mask, proto); - ip_conntrack_proto_put(proto); - if (unlikely(ret < 0)) - goto nfattr_failure; - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_exp_dump_expect(struct sk_buff *skb, - const struct ip_conntrack_expect *exp) -{ - struct ip_conntrack *master = exp->master; - __be32 timeout = htonl((exp->timeout.expires - jiffies) / HZ); - __be32 id = htonl(exp->id); - - if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) - goto nfattr_failure; - if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) - goto nfattr_failure; - if (ctnetlink_exp_dump_tuple(skb, - &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - CTA_EXPECT_MASTER) < 0) - goto nfattr_failure; - - NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(__be32), &timeout); - NFA_PUT(skb, CTA_EXPECT_ID, sizeof(__be32), &id); - - return 0; - -nfattr_failure: - return -1; -} - -static int -ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, - int event, - int nowait, - const struct ip_conntrack_expect *exp) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - unsigned char *b; - - b = skb->tail; - - event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - if (ctnetlink_exp_dump_expect(skb, exp) < 0) - goto nfattr_failure; - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -nlmsg_failure: -nfattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -static int ctnetlink_expect_event(struct notifier_block *this, - unsigned long events, void *ptr) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; - struct sk_buff *skb; - unsigned int type; - unsigned char *b; - int flags = 0; - - if (events & IPEXP_NEW) { - type = IPCTNL_MSG_EXP_NEW; - flags = NLM_F_CREATE|NLM_F_EXCL; - } else - return NOTIFY_DONE; - - if (!nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) - return NOTIFY_DONE; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb) - return NOTIFY_DONE; - - b = skb->tail; - - type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = flags; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - if (ctnetlink_exp_dump_expect(skb, exp) < 0) - goto nfattr_failure; - - nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); - return NOTIFY_DONE; - -nlmsg_failure: -nfattr_failure: - kfree_skb(skb); - return NOTIFY_DONE; -} -#endif - -static int -ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ip_conntrack_expect *exp = NULL; - struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[0]; - - read_lock_bh(&ip_conntrack_lock); - list_for_each_prev(i, &ip_conntrack_expect_list) { - exp = (struct ip_conntrack_expect *) i; - if (exp->id <= *id) - continue; - if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - IPCTNL_MSG_EXP_NEW, - 1, exp) < 0) - goto out; - *id = exp->id; - } -out: - read_unlock_bh(&ip_conntrack_lock); - - return skb->len; -} - -static const size_t cta_min_exp[CTA_EXPECT_MAX] = { - [CTA_EXPECT_TIMEOUT-1] = sizeof(__be32), - [CTA_EXPECT_ID-1] = sizeof(__be32) -}; - -static int -ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple tuple; - struct ip_conntrack_expect *exp; - struct sk_buff *skb2; - int err = 0; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct nfgenmsg *msg = NLMSG_DATA(nlh); - u32 rlen; - - if (msg->nfgen_family != AF_INET) - return -EAFNOSUPPORT; - - if ((*errp = netlink_dump_start(ctnl, skb, nlh, - ctnetlink_exp_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); - return 0; - } - - if (cda[CTA_EXPECT_MASTER-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); - else - return -EINVAL; - - if (err < 0) - return err; - - exp = ip_conntrack_expect_find_get(&tuple); - if (!exp) - return -ENOENT; - - if (cda[CTA_EXPECT_ID-1]) { - __be32 id = *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - ip_conntrack_expect_put(exp); - return -ENOENT; - } - } - - err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb2) - goto out; - - err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, - nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, - 1, exp); - if (err <= 0) - goto free; - - ip_conntrack_expect_put(exp); - - return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - -free: - kfree_skb(skb2); -out: - ip_conntrack_expect_put(exp); - return err; -} - -static int -ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_expect *exp, *tmp; - struct ip_conntrack_tuple tuple; - struct ip_conntrack_helper *h; - int err; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (cda[CTA_EXPECT_TUPLE-1]) { - /* delete a single expect by tuple */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); - if (err < 0) - return err; - - /* bump usage count to 2 */ - exp = ip_conntrack_expect_find_get(&tuple); - if (!exp) - return -ENOENT; - - if (cda[CTA_EXPECT_ID-1]) { - __be32 id = - *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - ip_conntrack_expect_put(exp); - return -ENOENT; - } - } - - /* after list removal, usage count == 1 */ - ip_conntrack_unexpect_related(exp); - /* have to put what we 'get' above. - * after this line usage count == 0 */ - ip_conntrack_expect_put(exp); - } else if (cda[CTA_EXPECT_HELP_NAME-1]) { - char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); - - /* delete all expectations for this helper */ - write_lock_bh(&ip_conntrack_lock); - h = __ip_conntrack_helper_find_byname(name); - if (!h) { - write_unlock_bh(&ip_conntrack_lock); - return -EINVAL; - } - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, - list) { - if (exp->master->helper == h - && del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - write_unlock_bh(&ip_conntrack_lock); - } else { - /* This basically means we have to flush everything*/ - write_lock_bh(&ip_conntrack_lock); - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, - list) { - if (del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - write_unlock_bh(&ip_conntrack_lock); - } - - return 0; -} -static int -ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) -{ - return -EOPNOTSUPP; -} - -static int -ctnetlink_create_expect(struct nfattr *cda[]) -{ - struct ip_conntrack_tuple tuple, mask, master_tuple; - struct ip_conntrack_tuple_hash *h = NULL; - struct ip_conntrack_expect *exp; - struct ip_conntrack *ct; - int err = 0; - - /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); - if (err < 0) - return err; - - /* Look for master conntrack of this expectation */ - h = ip_conntrack_find_get(&master_tuple, NULL); - if (!h) - return -ENOENT; - ct = tuplehash_to_ctrack(h); - - if (!ct->helper) { - /* such conntrack hasn't got any helper, abort */ - err = -EINVAL; - goto out; - } - - exp = ip_conntrack_expect_alloc(ct); - if (!exp) { - err = -ENOMEM; - goto out; - } - - exp->expectfn = NULL; - exp->flags = 0; - exp->master = ct; - memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); - memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); - - err = ip_conntrack_expect_related(exp); - ip_conntrack_expect_put(exp); - -out: - ip_conntrack_put(tuplehash_to_ctrack(h)); - return err; -} - -static int -ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple tuple; - struct ip_conntrack_expect *exp; - int err = 0; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (!cda[CTA_EXPECT_TUPLE-1] - || !cda[CTA_EXPECT_MASK-1] - || !cda[CTA_EXPECT_MASTER-1]) - return -EINVAL; - - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); - if (err < 0) - return err; - - write_lock_bh(&ip_conntrack_lock); - exp = __ip_conntrack_expect_find(&tuple); - - if (!exp) { - write_unlock_bh(&ip_conntrack_lock); - err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_expect(cda); - return err; - } - - err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) - err = ctnetlink_change_expect(exp, cda); - write_unlock_bh(&ip_conntrack_lock); - - return err; -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -static struct notifier_block ctnl_notifier = { - .notifier_call = ctnetlink_conntrack_event, -}; - -static struct notifier_block ctnl_notifier_exp = { - .notifier_call = ctnetlink_expect_event, -}; -#endif - -static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { - [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, }, - [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, }, - [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, }, - [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, }, -}; - -static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { - [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, }, - [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, }, - [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, }, -}; - -static struct nfnetlink_subsystem ctnl_subsys = { - .name = "conntrack", - .subsys_id = NFNL_SUBSYS_CTNETLINK, - .cb_count = IPCTNL_MSG_MAX, - .cb = ctnl_cb, -}; - -static struct nfnetlink_subsystem ctnl_exp_subsys = { - .name = "conntrack_expect", - .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, - .cb_count = IPCTNL_MSG_EXP_MAX, - .cb = ctnl_exp_cb, -}; - -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); - -static int __init ctnetlink_init(void) -{ - int ret; - - printk("ctnetlink v%s: registering with nfnetlink.\n", version); - ret = nfnetlink_subsys_register(&ctnl_subsys); - if (ret < 0) { - printk("ctnetlink_init: cannot register with nfnetlink.\n"); - goto err_out; - } - - ret = nfnetlink_subsys_register(&ctnl_exp_subsys); - if (ret < 0) { - printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); - goto err_unreg_subsys; - } - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS - ret = ip_conntrack_register_notifier(&ctnl_notifier); - if (ret < 0) { - printk("ctnetlink_init: cannot register notifier.\n"); - goto err_unreg_exp_subsys; - } - - ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); - if (ret < 0) { - printk("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } -#endif - - return 0; - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -err_unreg_notifier: - ip_conntrack_unregister_notifier(&ctnl_notifier); -err_unreg_exp_subsys: - nfnetlink_subsys_unregister(&ctnl_exp_subsys); -#endif -err_unreg_subsys: - nfnetlink_subsys_unregister(&ctnl_subsys); -err_out: - return ret; -} - -static void __exit ctnetlink_exit(void) -{ - printk("ctnetlink: unregistering from nfnetlink.\n"); - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS - ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); - ip_conntrack_unregister_notifier(&ctnl_notifier); -#endif - - nfnetlink_subsys_unregister(&ctnl_exp_subsys); - nfnetlink_subsys_unregister(&ctnl_subsys); - return; -} - -module_init(ctnetlink_init); -module_exit(ctnetlink_exit); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c deleted file mode 100644 index 88af82e9865..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ /dev/null @@ -1,74 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -unsigned int ip_ct_generic_timeout __read_mostly = 600*HZ; - -static int generic_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return 1; -} - -static int generic_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int generic_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return 0; -} - -/* Print out the private part of the conntrack. */ -static int generic_print_conntrack(struct seq_file *s, - const struct ip_conntrack *state) -{ - return 0; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb) -{ - return 1; -} - -struct ip_conntrack_protocol ip_conntrack_generic_protocol = -{ - .proto = 0, - .name = "unknown", - .pkt_to_tuple = generic_pkt_to_tuple, - .invert_tuple = generic_invert_tuple, - .print_tuple = generic_print_tuple, - .print_conntrack = generic_print_conntrack, - .packet = packet, - .new = new, -}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c deleted file mode 100644 index ac1c49ef36a..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * ip_conntrack_proto_gre.c - Version 3.0 - * - * Connection tracking protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/in.h> -#include <linux/list.h> -#include <linux/seq_file.h> -#include <linux/interrupt.h> - -static DEFINE_RWLOCK(ip_ct_gre_lock); - -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> - -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> -#include <linux/netfilter_ipv4/ip_conntrack_pptp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE"); - -/* shamelessly stolen from ip_conntrack_proto_udp.c */ -#define GRE_TIMEOUT (30*HZ) -#define GRE_STREAM_TIMEOUT (180*HZ) - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \ - NIPQUAD((x)->src.ip), ntohs((x)->src.u.gre.key), \ - NIPQUAD((x)->dst.ip), ntohs((x)->dst.u.gre.key)) -#else -#define DEBUGP(x, args...) -#define DUMP_TUPLE_GRE(x) -#endif - -/* GRE KEYMAP HANDLING FUNCTIONS */ -static LIST_HEAD(gre_keymap_list); - -static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, - const struct ip_conntrack_tuple *t) -{ - return ((km->tuple.src.ip == t->src.ip) && - (km->tuple.dst.ip == t->dst.ip) && - (km->tuple.dst.protonum == t->dst.protonum) && - (km->tuple.dst.u.all == t->dst.u.all)); -} - -/* look up the source key for a given tuple */ -static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) -{ - struct ip_ct_gre_keymap *km; - __be16 key = 0; - - read_lock_bh(&ip_ct_gre_lock); - list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t)) { - key = km->tuple.src.u.gre.key; - break; - } - } - read_unlock_bh(&ip_ct_gre_lock); - - DEBUGP("lookup src key 0x%x up key for ", key); - DUMP_TUPLE_GRE(t); - - return key; -} - -/* add a single keymap entry, associate with specified master ct */ -int -ip_ct_gre_keymap_add(struct ip_conntrack *ct, - struct ip_conntrack_tuple *t, int reply) -{ - struct ip_ct_gre_keymap **exist_km, *km; - - if (!ct->helper || strcmp(ct->helper->name, "pptp")) { - DEBUGP("refusing to add GRE keymap to non-pptp session\n"); - return -1; - } - - if (!reply) - exist_km = &ct->help.ct_pptp_info.keymap_orig; - else - exist_km = &ct->help.ct_pptp_info.keymap_reply; - - if (*exist_km) { - /* check whether it's a retransmission */ - list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *exist_km) - return 0; - } - DEBUGP("trying to override keymap_%s for ct %p\n", - reply? "reply":"orig", ct); - return -EEXIST; - } - - km = kmalloc(sizeof(*km), GFP_ATOMIC); - if (!km) - return -ENOMEM; - - memcpy(&km->tuple, t, sizeof(*t)); - *exist_km = km; - - DEBUGP("adding new entry %p: ", km); - DUMP_TUPLE_GRE(&km->tuple); - - write_lock_bh(&ip_ct_gre_lock); - list_add_tail(&km->list, &gre_keymap_list); - write_unlock_bh(&ip_ct_gre_lock); - - return 0; -} - -/* destroy the keymap entries associated with specified master ct */ -void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct) -{ - DEBUGP("entering for ct %p\n", ct); - - if (!ct->helper || strcmp(ct->helper->name, "pptp")) { - DEBUGP("refusing to destroy GRE keymap to non-pptp session\n"); - return; - } - - write_lock_bh(&ip_ct_gre_lock); - if (ct->help.ct_pptp_info.keymap_orig) { - DEBUGP("removing %p from list\n", - ct->help.ct_pptp_info.keymap_orig); - list_del(&ct->help.ct_pptp_info.keymap_orig->list); - kfree(ct->help.ct_pptp_info.keymap_orig); - ct->help.ct_pptp_info.keymap_orig = NULL; - } - if (ct->help.ct_pptp_info.keymap_reply) { - DEBUGP("removing %p from list\n", - ct->help.ct_pptp_info.keymap_reply); - list_del(&ct->help.ct_pptp_info.keymap_reply->list); - kfree(ct->help.ct_pptp_info.keymap_reply); - ct->help.ct_pptp_info.keymap_reply = NULL; - } - write_unlock_bh(&ip_ct_gre_lock); -} - - -/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ - -/* invert gre part of tuple */ -static int gre_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - - return 1; -} - -/* gre hdr info to tuple */ -static int gre_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct gre_hdr_pptp _pgrehdr, *pgrehdr; - __be16 srckey; - struct gre_hdr _grehdr, *grehdr; - - /* first only delinearize old RFC1701 GRE header */ - grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { - /* try to behave like "ip_conntrack_proto_generic" */ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - return 1; - } - - /* PPTP header is variable length, only need up to the call_id field */ - pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); - if (!pgrehdr) - return 1; - - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { - DEBUGP("GRE_VERSION_PPTP but unknown proto\n"); - return 0; - } - - tuple->dst.u.gre.key = pgrehdr->call_id; - srckey = gre_keymap_lookup(tuple); - tuple->src.u.gre.key = srckey; - - return 1; -} - -/* print gre part of tuple */ -static int gre_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "srckey=0x%x dstkey=0x%x ", - ntohs(tuple->src.u.gre.key), - ntohs(tuple->dst.u.gre.key)); -} - -/* print private data for conntrack */ -static int gre_print_conntrack(struct seq_file *s, - const struct ip_conntrack *ct) -{ - return seq_printf(s, "timeout=%u, stream_timeout=%u ", - (ct->proto.gre.timeout / HZ), - (ct->proto.gre.stream_timeout / HZ)); -} - -/* Returns verdict for packet, and may modify conntrack */ -static int gre_packet(struct ip_conntrack *ct, - const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) -{ - /* If we've seen traffic both ways, this is a GRE connection. - * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.stream_timeout); - /* Also, more likely to be important, and not a probe. */ - set_bit(IPS_ASSURED_BIT, &ct->status); - ip_conntrack_event_cache(IPCT_STATUS, skb); - } else - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int gre_new(struct ip_conntrack *ct, - const struct sk_buff *skb) -{ - DEBUGP(": "); - DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - - /* initialize to sane value. Ideally a conntrack helper - * (e.g. in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; - ct->proto.gre.timeout = GRE_TIMEOUT; - - return 1; -} - -/* Called when a conntrack entry has already been removed from the hashes - * and is about to be deleted from memory */ -static void gre_destroy(struct ip_conntrack *ct) -{ - struct ip_conntrack *master = ct->master; - DEBUGP(" entering\n"); - - if (!master) - DEBUGP("no master !?!\n"); - else - ip_ct_gre_keymap_destroy(master); -} - -/* protocol helper struct */ -static struct ip_conntrack_protocol gre = { - .proto = IPPROTO_GRE, - .name = "gre", - .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, - .print_tuple = gre_print_tuple, - .print_conntrack = gre_print_conntrack, - .packet = gre_packet, - .new = gre_new, - .destroy = gre_destroy, - .me = THIS_MODULE, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; - -/* ip_conntrack_proto_gre initialization */ -int __init ip_ct_proto_gre_init(void) -{ - return ip_conntrack_protocol_register(&gre); -} - -/* This cannot be __exit, as it is invoked from ip_conntrack_helper_pptp.c's - * init() code on errors. - */ -void ip_ct_proto_gre_fini(void) -{ - struct list_head *pos, *n; - - /* delete all keymap entries */ - write_lock_bh(&ip_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - DEBUGP("deleting keymap %p at module unload time\n", pos); - list_del(pos); - kfree(pos); - } - write_unlock_bh(&ip_ct_gre_lock); - - ip_conntrack_protocol_unregister(&gre); -} - -EXPORT_SYMBOL(ip_ct_gre_keymap_add); -EXPORT_SYMBOL(ip_ct_gre_keymap_destroy); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c deleted file mode 100644 index ad70c81a21e..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ /dev/null @@ -1,315 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/in.h> -#include <linux/icmp.h> -#include <linux/seq_file.h> -#include <linux/skbuff.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -unsigned int ip_ct_icmp_timeout __read_mostly = 30*HZ; - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -static int icmp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct icmphdr _hdr, *hp; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return 0; - - tuple->dst.u.icmp.type = hp->type; - tuple->src.u.icmp.id = hp->un.echo.id; - tuple->dst.u.icmp.code = hp->code; - - return 1; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 -}; - -static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - if (orig->dst.u.icmp.type >= sizeof(invmap) - || !invmap[orig->dst.u.icmp.type]) - return 0; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int icmp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - -/* Print out the private part of the conntrack. */ -static int icmp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - return 0; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmp_packet(struct ip_conntrack *ct, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - /* Try to delete connection immediately after all replies: - won't actually vanish as we still have skb, and del_timer - means this will only run once even if count hits zero twice - (theoretically possible with SMP) */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (atomic_dec_and_test(&ct->proto.icmp.count) - && del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - } else { - atomic_inc(&ct->proto.icmp.count); - ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); - } - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int icmp_new(struct ip_conntrack *conntrack, - const struct sk_buff *skb) -{ - static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 - }; - - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) - || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { - /* Can't create a new ICMP `conn' with this. */ - DEBUGP("icmp: can't create new conn with type %u\n", - conntrack->tuplehash[0].tuple.dst.u.icmp.type); - DUMP_TUPLE(&conntrack->tuplehash[0].tuple); - return 0; - } - atomic_set(&conntrack->proto.icmp.count, 0); - return 1; -} - -static int -icmp_error_message(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct ip_conntrack_tuple innertuple, origtuple; - struct { - struct icmphdr icmp; - struct iphdr ip; - } _in, *inside; - struct ip_conntrack_protocol *innerproto; - struct ip_conntrack_tuple_hash *h; - int dataoff; - - IP_NF_ASSERT(skb->nfct == NULL); - - /* Not enough header? */ - inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); - if (inside == NULL) - return -NF_ACCEPT; - - /* Ignore ICMP's containing fragments (shouldn't happen) */ - if (inside->ip.frag_off & htons(IP_OFFSET)) { - DEBUGP("icmp_error_track: fragment of proto %u\n", - inside->ip.protocol); - return -NF_ACCEPT; - } - - innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; - /* Are they talking about one of our connections? */ - if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { - DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); - ip_conntrack_proto_put(innerproto); - return -NF_ACCEPT; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { - DEBUGP("icmp_error_track: Can't invert tuple\n"); - ip_conntrack_proto_put(innerproto); - return -NF_ACCEPT; - } - ip_conntrack_proto_put(innerproto); - - *ctinfo = IP_CT_RELATED; - - h = ip_conntrack_find_get(&innertuple, NULL); - if (!h) { - /* Locally generated ICMPs will match inverted if they - haven't been SNAT'ed yet */ - /* FIXME: NAT code has to handle half-done double NAT --RR */ - if (hooknum == NF_IP_LOCAL_OUT) - h = ip_conntrack_find_get(&origtuple, NULL); - - if (!h) { - DEBUGP("icmp_error_track: no match\n"); - return -NF_ACCEPT; - } - /* Reverse direction from that found */ - if (DIRECTION(h) != IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } else { - if (DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - skb->nfct = &tuplehash_to_ctrack(h)->ct_general; - skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; -} - -/* Small and modified version of icmp_rcv */ -static int -icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct icmphdr _ih, *icmph; - - /* Not enough header? */ - icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); - if (icmph == NULL) { - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: short packet "); - return -NF_ACCEPT; - } - - /* See ip_conntrack_proto_tcp.c */ - if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad ICMP checksum "); - return -NF_ACCEPT; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: invalid ICMP type "); - return -NF_ACCEPT; - } - - /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH - && icmph->type != ICMP_SOURCE_QUENCH - && icmph->type != ICMP_TIME_EXCEEDED - && icmph->type != ICMP_PARAMETERPROB - && icmph->type != ICMP_REDIRECT) - return NF_ACCEPT; - - return icmp_error_message(skb, ctinfo, hooknum); -} - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -static int icmp_tuple_to_nfattr(struct sk_buff *skb, - const struct ip_conntrack_tuple *t) -{ - NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(__be16), - &t->src.u.icmp.id); - NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), - &t->dst.u.icmp.type); - NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), - &t->dst.u.icmp.code); - - return 0; - -nfattr_failure: - return -1; -} - -static int icmp_nfattr_to_tuple(struct nfattr *tb[], - struct ip_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1] - || !tb[CTA_PROTO_ICMP_ID-1]) - return -EINVAL; - - tuple->dst.u.icmp.type = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); - tuple->dst.u.icmp.code = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); - tuple->src.u.icmp.id = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) - || !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; - - return 0; -} -#endif - -struct ip_conntrack_protocol ip_conntrack_protocol_icmp = -{ - .proto = IPPROTO_ICMP, - .name = "icmp", - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .print_tuple = icmp_print_tuple, - .print_conntrack = icmp_print_conntrack, - .packet = icmp_packet, - .new = icmp_new, - .error = icmp_error, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = icmp_tuple_to_nfattr, - .nfattr_to_tuple = icmp_nfattr_to_tuple, -#endif -}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c deleted file mode 100644 index e6942992b2f..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * Connection tracking protocol helper module for SCTP. - * - * SCTP is defined in RFC 2960. References to various sections in this code - * are to this RFC. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * Added support for proc manipulation of timeouts. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/interrupt.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/sctp.h> -#include <linux/string.h> -#include <linux/seq_file.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -#if 0 -#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) -#else -#define DEBUGP(format, args...) -#endif - -/* Protects conntrack->proto.sctp */ -static DEFINE_RWLOCK(sctp_lock); - -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. --RR - - And so for me for SCTP :D -Kiran */ - -static const char *sctp_conntrack_names[] = { - "NONE", - "CLOSED", - "COOKIE_WAIT", - "COOKIE_ECHOED", - "ESTABLISHED", - "SHUTDOWN_SENT", - "SHUTDOWN_RECD", - "SHUTDOWN_ACK_SENT", -}; - -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - -static unsigned int ip_ct_sctp_timeout_closed __read_mostly = 10 SECS; -static unsigned int ip_ct_sctp_timeout_cookie_wait __read_mostly = 3 SECS; -static unsigned int ip_ct_sctp_timeout_cookie_echoed __read_mostly = 3 SECS; -static unsigned int ip_ct_sctp_timeout_established __read_mostly = 5 DAYS; -static unsigned int ip_ct_sctp_timeout_shutdown_sent __read_mostly = 300 SECS / 1000; -static unsigned int ip_ct_sctp_timeout_shutdown_recd __read_mostly = 300 SECS / 1000; -static unsigned int ip_ct_sctp_timeout_shutdown_ack_sent __read_mostly = 3 SECS; - -static const unsigned int * sctp_timeouts[] -= { NULL, /* SCTP_CONNTRACK_NONE */ - &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ - &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ - &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ - &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ - &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ - &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ - &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ - }; - -#define sNO SCTP_CONNTRACK_NONE -#define sCL SCTP_CONNTRACK_CLOSED -#define sCW SCTP_CONNTRACK_COOKIE_WAIT -#define sCE SCTP_CONNTRACK_COOKIE_ECHOED -#define sES SCTP_CONNTRACK_ESTABLISHED -#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT -#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD -#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT -#define sIV SCTP_CONNTRACK_MAX - -/* - These are the descriptions of the states: - -NOTE: These state names are tantalizingly similar to the states of an -SCTP endpoint. But the interpretation of the states is a little different, -considering that these are the states of the connection and not of an end -point. Please note the subtleties. -Kiran - -NONE - Nothing so far. -COOKIE WAIT - We have seen an INIT chunk in the original direction, or also - an INIT_ACK chunk in the reply direction. -COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. -ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. -SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. -SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite - to that of the SHUTDOWN chunk. -CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - the SHUTDOWN chunk. Connection is closed. -*/ - -/* TODO - - I have assumed that the first INIT is in the original direction. - This messes things when an INIT comes in the reply direction in CLOSED - state. - - Check the error type in the reply dir before transitioning from -cookie echoed to closed. - - Sec 5.2.4 of RFC 2960 - - Multi Homing support. -*/ - -/* SCTP conntrack state transitions */ -static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { - { -/* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} - }, - { -/* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} - } -}; - -static int sctp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - sctp_sctphdr_t _hdr, *hp; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, 8, &_hdr); - if (hp == NULL) - return 0; - - tuple->src.u.sctp.port = hp->source; - tuple->dst.u.sctp.port = hp->dest; - return 1; -} - -static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - tuple->src.u.sctp.port = orig->dst.u.sctp.port; - tuple->dst.u.sctp.port = orig->src.u.sctp.port; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int sctp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - return seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.sctp.port), - ntohs(tuple->dst.u.sctp.port)); -} - -/* Print out the private part of the conntrack. */ -static int sctp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - enum sctp_conntrack state; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - read_lock_bh(&sctp_lock); - state = conntrack->proto.sctp.state; - read_unlock_bh(&sctp_lock); - - return seq_printf(s, "%s ", sctp_conntrack_names[state]); -} - -#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \ -for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \ - offset < skb->len && \ - (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ - offset += (ntohs(sch->length) + 3) & ~3, count++) - -/* Some validity checks to make sure the chunks are fine */ -static int do_basic_checks(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - char *map) -{ - u_int32_t offset, count; - sctp_chunkhdr_t _sch, *sch; - int flag; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - flag = 0; - - for_each_sctp_chunk (skb, sch, _sch, offset, count) { - DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); - - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK - || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - flag = 1; - } - - /* - * Cookie Ack/Echo chunks not the first OR - * Init / Init Ack / Shutdown compl chunks not the only chunks - * OR zero-length. - */ - if (((sch->type == SCTP_CID_COOKIE_ACK - || sch->type == SCTP_CID_COOKIE_ECHO - || flag) - && count !=0) || !sch->length) { - DEBUGP("Basic checks failed\n"); - return 1; - } - - if (map) { - set_bit(sch->type, (void *)map); - } - } - - DEBUGP("Basic checks passed\n"); - return count == 0; -} - -static int new_state(enum ip_conntrack_dir dir, - enum sctp_conntrack cur_state, - int chunk_type) -{ - int i; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - DEBUGP("Chunk type: %d\n", chunk_type); - - switch (chunk_type) { - case SCTP_CID_INIT: - DEBUGP("SCTP_CID_INIT\n"); - i = 0; break; - case SCTP_CID_INIT_ACK: - DEBUGP("SCTP_CID_INIT_ACK\n"); - i = 1; break; - case SCTP_CID_ABORT: - DEBUGP("SCTP_CID_ABORT\n"); - i = 2; break; - case SCTP_CID_SHUTDOWN: - DEBUGP("SCTP_CID_SHUTDOWN\n"); - i = 3; break; - case SCTP_CID_SHUTDOWN_ACK: - DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); - i = 4; break; - case SCTP_CID_ERROR: - DEBUGP("SCTP_CID_ERROR\n"); - i = 5; break; - case SCTP_CID_COOKIE_ECHO: - DEBUGP("SCTP_CID_COOKIE_ECHO\n"); - i = 6; break; - case SCTP_CID_COOKIE_ACK: - DEBUGP("SCTP_CID_COOKIE_ACK\n"); - i = 7; break; - case SCTP_CID_SHUTDOWN_COMPLETE: - DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); - i = 8; break; - default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ - DEBUGP("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); - return cur_state; - } - - DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - - return sctp_conntracks[dir][i][cur_state]; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int sctp_packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - enum sctp_conntrack newconntrack, oldsctpstate; - struct iphdr *iph = skb->nh.iph; - sctp_sctphdr_t _sctph, *sh; - sctp_chunkhdr_t _sch, *sch; - u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); - if (sh == NULL) - return -1; - - if (do_basic_checks(conntrack, skb, map) != 0) - return -1; - - /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) - && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) - && !test_bit(SCTP_CID_ABORT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) - && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - DEBUGP("Verification tag check failed\n"); - return -1; - } - - oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; - for_each_sctp_chunk (skb, sch, _sch, offset, count) { - write_lock_bh(&sctp_lock); - - /* Special cases of Verification tag check (Sec 8.5.1) */ - if (sch->type == SCTP_CID_INIT) { - /* Sec 8.5.1 (A) */ - if (sh->vtag != 0) { - write_unlock_bh(&sctp_lock); - return -1; - } - } else if (sch->type == SCTP_CID_ABORT) { - /* Sec 8.5.1 (B) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } - } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - /* Sec 8.5.1 (C) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)] - && (sch->flags & 1))) { - write_unlock_bh(&sctp_lock); - return -1; - } - } else if (sch->type == SCTP_CID_COOKIE_ECHO) { - /* Sec 8.5.1 (D) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } - } - - oldsctpstate = conntrack->proto.sctp.state; - newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); - - /* Invalid */ - if (newconntrack == SCTP_CONNTRACK_MAX) { - DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", - CTINFO2DIR(ctinfo), sch->type, oldsctpstate); - write_unlock_bh(&sctp_lock); - return -1; - } - - /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK) { - sctp_inithdr_t _inithdr, *ih; - - ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) { - write_unlock_bh(&sctp_lock); - return -1; - } - DEBUGP("Setting vtag %x for dir %d\n", - ih->init_tag, !CTINFO2DIR(ctinfo)); - conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; - } - - conntrack->proto.sctp.state = newconntrack; - if (oldsctpstate != newconntrack) - ip_conntrack_event_cache(IPCT_PROTOINFO, skb); - write_unlock_bh(&sctp_lock); - } - - ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); - - if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED - && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY - && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { - DEBUGP("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &conntrack->status); - ip_conntrack_event_cache(IPCT_STATUS, skb); - } - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int sctp_new(struct ip_conntrack *conntrack, - const struct sk_buff *skb) -{ - enum sctp_conntrack newconntrack; - struct iphdr *iph = skb->nh.iph; - sctp_sctphdr_t _sctph, *sh; - sctp_chunkhdr_t _sch, *sch; - u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); - if (sh == NULL) - return 0; - - if (do_basic_checks(conntrack, skb, map) != 0) - return 0; - - /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ - if ((test_bit (SCTP_CID_ABORT, (void *)map)) - || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) - || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { - return 0; - } - - newconntrack = SCTP_CONNTRACK_MAX; - for_each_sctp_chunk (skb, sch, _sch, offset, count) { - /* Don't need lock here: this conntrack not in circulation yet */ - newconntrack = new_state (IP_CT_DIR_ORIGINAL, - SCTP_CONNTRACK_NONE, sch->type); - - /* Invalid: delete conntrack */ - if (newconntrack == SCTP_CONNTRACK_MAX) { - DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); - return 0; - } - - /* Copy the vtag into the state info */ - if (sch->type == SCTP_CID_INIT) { - if (sh->vtag == 0) { - sctp_inithdr_t _inithdr, *ih; - - ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) - return 0; - - DEBUGP("Setting vtag %x for new conn\n", - ih->init_tag); - - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = - ih->init_tag; - } else { - /* Sec 8.5.1 (A) */ - return 0; - } - } - /* If it is a shutdown ack OOTB packet, we expect a return - shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ - else { - DEBUGP("Setting vtag %x for new conn OOTB\n", - sh->vtag); - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; - } - - conntrack->proto.sctp.state = newconntrack; - } - - return 1; -} - -static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { - .proto = IPPROTO_SCTP, - .name = "sctp", - .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, - .print_tuple = sctp_print_tuple, - .print_conntrack = sctp_print_conntrack, - .packet = sctp_packet, - .new = sctp_new, - .destroy = NULL, - .me = THIS_MODULE, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; - -#ifdef CONFIG_SYSCTL -static ctl_table ip_ct_sysctl_table[] = { - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, - .procname = "ip_conntrack_sctp_timeout_closed", - .data = &ip_ct_sctp_timeout_closed, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, - .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .data = &ip_ct_sctp_timeout_cookie_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, - .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .data = &ip_ct_sctp_timeout_cookie_echoed, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, - .procname = "ip_conntrack_sctp_timeout_established", - .data = &ip_ct_sctp_timeout_established, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, - .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .data = &ip_ct_sctp_timeout_shutdown_sent, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, - .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .data = &ip_ct_sctp_timeout_shutdown_recd, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, - .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &ip_ct_sctp_timeout_shutdown_ack_sent, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_netfilter_table[] = { - { - .ctl_name = NET_IPV4_NETFILTER, - .procname = "netfilter", - .mode = 0555, - .child = ip_ct_sysctl_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ip_ct_netfilter_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ip_ct_ipv4_table, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header *ip_ct_sysctl_header; -#endif - -static int __init ip_conntrack_proto_sctp_init(void) -{ - int ret; - - ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp); - if (ret) { - printk("ip_conntrack_proto_sctp: protocol register failed\n"); - goto out; - } - -#ifdef CONFIG_SYSCTL - ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table); - if (ip_ct_sysctl_header == NULL) { - ret = -ENOMEM; - printk("ip_conntrack_proto_sctp: can't register to sysctl.\n"); - goto cleanup; - } -#endif - - return ret; - -#ifdef CONFIG_SYSCTL - cleanup: - ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); -#endif - out: - DEBUGP("SCTP conntrack module loading %s\n", - ret ? "failed": "succeeded"); - return ret; -} - -static void __exit ip_conntrack_proto_sctp_fini(void) -{ - ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ip_ct_sysctl_header); -#endif - DEBUGP("SCTP conntrack module unloaded\n"); -} - -module_init(ip_conntrack_proto_sctp_init); -module_exit(ip_conntrack_proto_sctp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kiran Kumar Immidi"); -MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c deleted file mode 100644 index 0a72eab1462..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ /dev/null @@ -1,1164 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: - * - Real stateful connection tracking - * - Modified state transitions table - * - Window scaling support added - * - SACK support added - * - * Willy Tarreau: - * - State table bugfixes - * - More robust state changes - * - Tuning timer parameters - * - * version 2.2 - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/spinlock.h> - -#include <net/tcp.h> - -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -#if 0 -#define DEBUGP printk -#define DEBUGP_VARS -#else -#define DEBUGP(format, args...) -#endif - -/* Protects conntrack->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); - -/* "Be conservative in what you do, - be liberal in what you accept from others." - If it's non-zero, we mark only out of window RST segments as INVALID. */ -int ip_ct_tcp_be_liberal __read_mostly = 0; - -/* If it is set to zero, we disable picking up already established - connections. */ -int ip_ct_tcp_loose __read_mostly = 1; - -/* Max number of the retransmitted packets without receiving an (acceptable) - ACK from the destination. If this number is reached, a shorter timer - will be started. */ -int ip_ct_tcp_max_retrans __read_mostly = 3; - - /* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. --RR */ - -static const char *tcp_conntrack_names[] = { - "NONE", - "SYN_SENT", - "SYN_RECV", - "ESTABLISHED", - "FIN_WAIT", - "CLOSE_WAIT", - "LAST_ACK", - "TIME_WAIT", - "CLOSE", - "LISTEN" -}; - -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - -unsigned int ip_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; -unsigned int ip_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; -unsigned int ip_ct_tcp_timeout_established __read_mostly = 5 DAYS; -unsigned int ip_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; -unsigned int ip_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; -unsigned int ip_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; -unsigned int ip_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; -unsigned int ip_ct_tcp_timeout_close __read_mostly = 10 SECS; - -/* RFC1122 says the R2 limit should be at least 100 seconds. - Linux uses 15 packets as limit, which corresponds - to ~13-30min depending on RTO. */ -unsigned int ip_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; - -static const unsigned int * tcp_timeouts[] -= { NULL, /* TCP_CONNTRACK_NONE */ - &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ - &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ - &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ - &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ - &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ - &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ - &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ - &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ - NULL, /* TCP_CONNTRACK_LISTEN */ - }; - -#define sNO TCP_CONNTRACK_NONE -#define sSS TCP_CONNTRACK_SYN_SENT -#define sSR TCP_CONNTRACK_SYN_RECV -#define sES TCP_CONNTRACK_ESTABLISHED -#define sFW TCP_CONNTRACK_FIN_WAIT -#define sCW TCP_CONNTRACK_CLOSE_WAIT -#define sLA TCP_CONNTRACK_LAST_ACK -#define sTW TCP_CONNTRACK_TIME_WAIT -#define sCL TCP_CONNTRACK_CLOSE -#define sLI TCP_CONNTRACK_LISTEN -#define sIV TCP_CONNTRACK_MAX -#define sIG TCP_CONNTRACK_IGNORE - -/* What TCP flags are set from RST/SYN/FIN/ACK. */ -enum tcp_bit_set { - TCP_SYN_SET, - TCP_SYNACK_SET, - TCP_FIN_SET, - TCP_ACK_SET, - TCP_RST_SET, - TCP_NONE_SET, -}; - -/* - * The TCP state transition table needs a few words... - * - * We are the man in the middle. All the packets go through us - * but might get lost in transit to the destination. - * It is assumed that the destinations can't receive segments - * we haven't seen. - * - * The checked segment is in window, but our windows are *not* - * equivalent with the ones of the sender/receiver. We always - * try to guess the state of the current sender. - * - * The meaning of the states are: - * - * NONE: initial state - * SYN_SENT: SYN-only packet seen - * SYN_RECV: SYN-ACK packet seen - * ESTABLISHED: ACK packet seen - * FIN_WAIT: FIN packet seen - * CLOSE_WAIT: ACK seen (after FIN) - * LAST_ACK: FIN seen (after FIN) - * TIME_WAIT: last ACK seen - * CLOSE: closed connection - * - * LISTEN state is not used. - * - * Packets marked as IGNORED (sIG): - * if they may be either invalid or valid - * and the receiver may send back a connection - * closing RST or a SYN/ACK. - * - * Packets marked as INVALID (sIV): - * if they are invalid - * or we do not support the request (simultaneous open) - */ -static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { - { -/* ORIGINAL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, -/* - * sNO -> sSS Initialize a new connection - * sSS -> sSS Retransmitted SYN - * sSR -> sIG Late retransmitted SYN? - * sES -> sIG Error: SYNs in window outside the SYN_SENT state - * are errors. Receiver will reply with RST - * and close the connection. - * Or we are not in sync and hold a dead connection. - * sFW -> sIG - * sCW -> sIG - * sLA -> sIG - * sTW -> sSS Reopened connection (RFC 1122). - * sCL -> sSS - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, -/* - * A SYN/ACK from the client is always invalid: - * - either it tries to set up a simultaneous open, which is - * not supported; - * - or the firewall has just been inserted between the two hosts - * during the session set-up. The SYN will be retransmitted - * by the true client (or it'll time out). - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, -/* - * sNO -> sIV Too late and no reason to do anything... - * sSS -> sIV Client migth not send FIN in this state: - * we enforce waiting for a SYN/ACK reply first. - * sSR -> sFW Close started. - * sES -> sFW - * sFW -> sLA FIN seen in both directions, waiting for - * the last ACK. - * Migth be a retransmitted FIN as well... - * sCW -> sLA - * sLA -> sLA Retransmitted FIN. Remain in the same state. - * sTW -> sTW - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, -/* - * sNO -> sES Assumed. - * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. - * sSR -> sES Established state is reached. - * sES -> sES :-) - * sFW -> sCW Normal close request answered by ACK. - * sCW -> sCW - * sLA -> sTW Last ACK detected. - * sTW -> sTW Retransmitted last ACK. Remain in the same state. - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, -/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } - }, - { -/* REPLY */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, -/* - * sNO -> sIV Never reached. - * sSS -> sIV Simultaneous open, not supported - * sSR -> sIV Simultaneous open, not supported. - * sES -> sIV Server may not initiate a connection. - * sFW -> sIV - * sCW -> sIV - * sLA -> sIV - * sTW -> sIV Reopened connection, but server may not do it. - * sCL -> sIV - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, -/* - * sSS -> sSR Standard open. - * sSR -> sSR Retransmitted SYN/ACK. - * sES -> sIG Late retransmitted SYN/ACK? - * sFW -> sIG Might be SYN/ACK answering ignored SYN - * sCW -> sIG - * sLA -> sIG - * sTW -> sIG - * sCL -> sIG - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, -/* - * sSS -> sIV Server might not send FIN in this state. - * sSR -> sFW Close started. - * sES -> sFW - * sFW -> sLA FIN seen in both directions. - * sCW -> sLA - * sLA -> sLA Retransmitted FIN. - * sTW -> sTW - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, -/* - * sSS -> sIG Might be a half-open connection. - * sSR -> sSR Might answer late resent SYN. - * sES -> sES :-) - * sFW -> sCW Normal close request answered by ACK. - * sCW -> sCW - * sLA -> sTW Last ACK detected. - * sTW -> sTW Retransmitted last ACK. - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, -/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } - } -}; - -static int tcp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct tcphdr _hdr, *hp; - - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, 8, &_hdr); - if (hp == NULL) - return 0; - - tuple->src.u.tcp.port = hp->source; - tuple->dst.u.tcp.port = hp->dest; - - return 1; -} - -static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->src.u.tcp.port = orig->dst.u.tcp.port; - tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int tcp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.tcp.port), - ntohs(tuple->dst.u.tcp.port)); -} - -/* Print out the private part of the conntrack. */ -static int tcp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - enum tcp_conntrack state; - - read_lock_bh(&tcp_lock); - state = conntrack->proto.tcp.state; - read_unlock_bh(&tcp_lock); - - return seq_printf(s, "%s ", tcp_conntrack_names[state]); -} - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, - const struct ip_conntrack *ct) -{ - struct nfattr *nest_parms; - - read_lock_bh(&tcp_lock); - nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); - NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), - &ct->proto.tcp.state); - read_unlock_bh(&tcp_lock); - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - read_unlock_bh(&tcp_lock); - return -1; -} - -static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { - [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), -}; - -static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) -{ - struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; - struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; - - /* updates could not contain anything about the private - * protocol info, in that case skip the parsing */ - if (!attr) - return 0; - - nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) - return -EINVAL; - - if (!tb[CTA_PROTOINFO_TCP_STATE-1]) - return -EINVAL; - - write_lock_bh(&tcp_lock); - ct->proto.tcp.state = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]); - write_unlock_bh(&tcp_lock); - - return 0; -} -#endif - -static unsigned int get_conntrack_index(const struct tcphdr *tcph) -{ - if (tcph->rst) return TCP_RST_SET; - else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); - else if (tcph->fin) return TCP_FIN_SET; - else if (tcph->ack) return TCP_ACK_SET; - else return TCP_NONE_SET; -} - -/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering - in IP Filter' by Guido van Rooij. - - http://www.nluug.nl/events/sane2000/papers.html - http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz - - The boundaries and the conditions are changed according to RFC793: - the packet must intersect the window (i.e. segments may be - after the right or before the left edge) and thus receivers may ACK - segments after the right edge of the window. - - td_maxend = max(sack + max(win,1)) seen in reply packets - td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets - td_maxwin += seq + len - sender.td_maxend - if seq + len > sender.td_maxend - td_end = max(seq + len) seen in sent packets - - I. Upper bound for valid data: seq <= sender.td_maxend - II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin - III. Upper bound for valid ack: sack <= receiver.td_end - IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW - - where sack is the highest right edge of sack block found in the packet. - - The upper bound limit for a valid ack is not ignored - - we doesn't have to deal with fragments. -*/ - -static inline __u32 segment_seq_plus_len(__u32 seq, - size_t len, - struct iphdr *iph, - struct tcphdr *tcph) -{ - return (seq + len - (iph->ihl + tcph->doff)*4 - + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); -} - -/* Fixme: what about big packets? */ -#define MAXACKWINCONST 66000 -#define MAXACKWINDOW(sender) \ - ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ - : MAXACKWINCONST) - -/* - * Simplified tcp_parse_options routine from tcp_input.c - */ -static void tcp_options(const struct sk_buff *skb, - struct iphdr *iph, - struct tcphdr *tcph, - struct ip_ct_tcp_state *state) -{ - unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; - int length = (tcph->doff*4) - sizeof(struct tcphdr); - - if (!length) - return; - - ptr = skb_header_pointer(skb, - (iph->ihl * 4) + sizeof(struct tcphdr), - length, buff); - BUG_ON(ptr == NULL); - - state->td_scale = - state->flags = 0; - - while (length > 0) { - int opcode=*ptr++; - int opsize; - - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - - if (opcode == TCPOPT_SACK_PERM - && opsize == TCPOLEN_SACK_PERM) - state->flags |= IP_CT_TCP_FLAG_SACK_PERM; - else if (opcode == TCPOPT_WINDOW - && opsize == TCPOLEN_WINDOW) { - state->td_scale = *(u_int8_t *)ptr; - - if (state->td_scale > 14) { - /* See RFC1323 */ - state->td_scale = 14; - } - state->flags |= - IP_CT_TCP_FLAG_WINDOW_SCALE; - } - ptr += opsize - 2; - length -= opsize; - } - } -} - -static void tcp_sack(const struct sk_buff *skb, - struct iphdr *iph, - struct tcphdr *tcph, - __u32 *sack) -{ - unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; - int length = (tcph->doff*4) - sizeof(struct tcphdr); - __u32 tmp; - - if (!length) - return; - - ptr = skb_header_pointer(skb, - (iph->ihl * 4) + sizeof(struct tcphdr), - length, buff); - BUG_ON(ptr == NULL); - - /* Fast path for timestamp-only option */ - if (length == TCPOLEN_TSTAMP_ALIGNED*4 - && *(__be32 *)ptr == - __constant_htonl((TCPOPT_NOP << 24) - | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return; - - while (length > 0) { - int opcode=*ptr++; - int opsize, i; - - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - - if (opcode == TCPOPT_SACK - && opsize >= (TCPOLEN_SACK_BASE - + TCPOLEN_SACK_PERBLOCK) - && !((opsize - TCPOLEN_SACK_BASE) - % TCPOLEN_SACK_PERBLOCK)) { - for (i = 0; - i < (opsize - TCPOLEN_SACK_BASE); - i += TCPOLEN_SACK_PERBLOCK) { - tmp = ntohl(*((__be32 *)(ptr+i)+1)); - - if (after(tmp, *sack)) - *sack = tmp; - } - return; - } - ptr += opsize - 2; - length -= opsize; - } - } -} - -static int tcp_in_window(struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct ip_ct_tcp_state *sender = &state->seen[dir]; - struct ip_ct_tcp_state *receiver = &state->seen[!dir]; - __u32 seq, ack, sack, end, win, swin; - int res; - - /* - * Get the required data from the packet. - */ - seq = ntohl(tcph->seq); - ack = sack = ntohl(tcph->ack_seq); - win = ntohs(tcph->window); - end = segment_seq_plus_len(seq, skb->len, iph, tcph); - - if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) - tcp_sack(skb, iph, tcph, &sack); - - DEBUGP("tcp_in_window: START\n"); - DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "seq=%u ack=%u sack=%u win=%u end=%u\n", - NIPQUAD(iph->saddr), ntohs(tcph->source), - NIPQUAD(iph->daddr), ntohs(tcph->dest), - seq, ack, sack, win, end); - DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - - if (sender->td_end == 0) { - /* - * Initialize sender data. - */ - if (tcph->syn && tcph->ack) { - /* - * Outgoing SYN-ACK in reply to a SYN. - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, iph, tcph, sender); - /* - * RFC 1323: - * Both sides must send the Window Scale option - * to enable window scaling in either direction. - */ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE - && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) - sender->td_scale = - receiver->td_scale = 0; - } else { - /* - * We are in the middle of a connection, - * its history is lost for us. - * Let's try to use the data from the packet. - */ - sender->td_end = end; - sender->td_maxwin = (win == 0 ? 1 : win); - sender->td_maxend = end + sender->td_maxwin; - } - } else if (((state->state == TCP_CONNTRACK_SYN_SENT - && dir == IP_CT_DIR_ORIGINAL) - || (state->state == TCP_CONNTRACK_SYN_RECV - && dir == IP_CT_DIR_REPLY)) - && after(end, sender->td_end)) { - /* - * RFC 793: "if a TCP is reinitialized ... then it need - * not wait at all; it must only be sure to use sequence - * numbers larger than those recently used." - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, iph, tcph, sender); - } - - if (!(tcph->ack)) { - /* - * If there is no ACK, just pretend it was set and OK. - */ - ack = sack = receiver->td_end; - } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == - (TCP_FLAG_ACK|TCP_FLAG_RST)) - && (ack == 0)) { - /* - * Broken TCP stacks, that set ACK in RST packets as well - * with zero ack value. - */ - ack = sack = receiver->td_end; - } - - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) - /* - * Packets contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN. - */ - seq = end = sender->td_end; - - DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "seq=%u ack=%u sack =%u win=%u end=%u\n", - NIPQUAD(iph->saddr), ntohs(tcph->source), - NIPQUAD(iph->daddr), ntohs(tcph->dest), - seq, ack, sack, win, end); - DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - - DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", - before(seq, sender->td_maxend + 1), - after(end, sender->td_end - receiver->td_maxwin - 1), - before(sack, receiver->td_end + 1), - after(ack, receiver->td_end - MAXACKWINDOW(sender))); - - if (before(seq, sender->td_maxend + 1) && - after(end, sender->td_end - receiver->td_maxwin - 1) && - before(sack, receiver->td_end + 1) && - after(ack, receiver->td_end - MAXACKWINDOW(sender))) { - /* - * Take into account window scaling (RFC 1323). - */ - if (!tcph->syn) - win <<= sender->td_scale; - - /* - * Update sender data. - */ - swin = win + (sack - ack); - if (sender->td_maxwin < swin) - sender->td_maxwin = swin; - if (after(end, sender->td_end)) - sender->td_end = end; - /* - * Update receiver data. - */ - if (after(end, sender->td_maxend)) - receiver->td_maxwin += end - sender->td_maxend; - if (after(sack + win, receiver->td_maxend - 1)) { - receiver->td_maxend = sack + win; - if (win == 0) - receiver->td_maxend++; - } - - /* - * Check retransmissions. - */ - if (index == TCP_ACK_SET) { - if (state->last_dir == dir - && state->last_seq == seq - && state->last_ack == ack - && state->last_end == end - && state->last_win == win) - state->retrans++; - else { - state->last_dir = dir; - state->last_seq = seq; - state->last_ack = ack; - state->last_end = end; - state->last_win = win; - state->retrans = 0; - } - } - res = 1; - } else { - res = 0; - if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - ip_ct_tcp_be_liberal) - res = 1; - if (!res && LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: %s ", - before(seq, sender->td_maxend + 1) ? - after(end, sender->td_end - receiver->td_maxwin - 1) ? - before(sack, receiver->td_end + 1) ? - after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" - : "ACK is under the lower bound (possible overly delayed ACK)" - : "ACK is over the upper bound (ACKed data not seen yet)" - : "SEQ is under the lower bound (already ACKed data retransmitted)" - : "SEQ is over the upper bound (over the window of the receiver)"); - } - - DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " - "receiver end=%u maxend=%u maxwin=%u\n", - res, sender->td_end, sender->td_maxend, sender->td_maxwin, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin); - - return res; -} - -#ifdef CONFIG_IP_NF_NAT_NEEDED -/* Update sender->td_end after NAT successfully mangled the packet */ -void ip_conntrack_tcp_update(struct sk_buff *skb, - struct ip_conntrack *conntrack, - enum ip_conntrack_dir dir) -{ - struct iphdr *iph = skb->nh.iph; - struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; - __u32 end; -#ifdef DEBUGP_VARS - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; -#endif - - end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); - - write_lock_bh(&tcp_lock); - /* - * We have to worry for the ack in the reply packet only... - */ - if (after(end, conntrack->proto.tcp.seen[dir].td_end)) - conntrack->proto.tcp.seen[dir].td_end = end; - conntrack->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); - DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); -} - -#endif - -#define TH_FIN 0x01 -#define TH_SYN 0x02 -#define TH_RST 0x04 -#define TH_PUSH 0x08 -#define TH_ACK 0x10 -#define TH_URG 0x20 -#define TH_ECE 0x40 -#define TH_CWR 0x80 - -/* table of valid flag combinations - ECE and CWR are always valid */ -static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = -{ - [TH_SYN] = 1, - [TH_SYN|TH_PUSH] = 1, - [TH_SYN|TH_URG] = 1, - [TH_SYN|TH_PUSH|TH_URG] = 1, - [TH_SYN|TH_ACK] = 1, - [TH_SYN|TH_ACK|TH_PUSH] = 1, - [TH_RST] = 1, - [TH_RST|TH_ACK] = 1, - [TH_RST|TH_ACK|TH_PUSH] = 1, - [TH_FIN|TH_ACK] = 1, - [TH_ACK] = 1, - [TH_ACK|TH_PUSH] = 1, - [TH_ACK|TH_URG] = 1, - [TH_ACK|TH_URG|TH_PUSH] = 1, - [TH_FIN|TH_ACK|TH_PUSH] = 1, - [TH_FIN|TH_ACK|TH_URG] = 1, - [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, -}; - -/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct iphdr *iph = skb->nh.iph; - struct tcphdr _tcph, *th; - unsigned int tcplen = skb->len - iph->ihl * 4; - u_int8_t tcpflags; - - /* Smaller that minimal TCP header? */ - th = skb_header_pointer(skb, iph->ihl * 4, - sizeof(_tcph), &_tcph); - if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: short packet "); - return -NF_ACCEPT; - } - - /* Not whole TCP header or malformed packet */ - if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: truncated/malformed packet "); - return -NF_ACCEPT; - } - - /* Checksum invalid? Ignore. - * We skip checking packets on the outgoing path - * because it is assumed to be correct. - */ - /* FIXME: Source route IP option packets --RR */ - if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: bad TCP checksum "); - return -NF_ACCEPT; - } - - /* Check TCP flags. */ - tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); - if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: invalid TCP flag combination "); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int tcp_packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - enum tcp_conntrack new_state, old_state; - enum ip_conntrack_dir dir; - struct iphdr *iph = skb->nh.iph; - struct tcphdr *th, _tcph; - unsigned long timeout; - unsigned int index; - - th = skb_header_pointer(skb, iph->ihl * 4, - sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); - - write_lock_bh(&tcp_lock); - old_state = conntrack->proto.tcp.state; - dir = CTINFO2DIR(ctinfo); - index = get_conntrack_index(th); - new_state = tcp_conntracks[dir][index][old_state]; - - switch (new_state) { - case TCP_CONNTRACK_IGNORE: - /* Ignored packets: - * - * a) SYN in ORIGINAL - * b) SYN/ACK in REPLY - * c) ACK in reply direction after initial SYN in original. - */ - if (index == TCP_SYNACK_SET - && conntrack->proto.tcp.last_index == TCP_SYN_SET - && conntrack->proto.tcp.last_dir != dir - && ntohl(th->ack_seq) == - conntrack->proto.tcp.last_end) { - /* This SYN/ACK acknowledges a SYN that we earlier - * ignored as invalid. This means that the client and - * the server are both in sync, while the firewall is - * not. We kill this session and block the SYN/ACK so - * that the client cannot but retransmit its SYN and - * thus initiate a clean new session. - */ - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - NULL, "ip_ct_tcp: " - "killing out of sync session "); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_DROP; - } - conntrack->proto.tcp.last_index = index; - conntrack->proto.tcp.last_dir = dir; - conntrack->proto.tcp.last_seq = ntohl(th->seq); - conntrack->proto.tcp.last_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); - - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: invalid packet ignored "); - return NF_ACCEPT; - case TCP_CONNTRACK_MAX: - /* Invalid packet */ - DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", - dir, get_conntrack_index(th), - old_state); - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: invalid state "); - return -NF_ACCEPT; - case TCP_CONNTRACK_SYN_SENT: - if (old_state < TCP_CONNTRACK_TIME_WAIT) - break; - if ((conntrack->proto.tcp.seen[dir].flags & - IP_CT_TCP_FLAG_CLOSE_INIT) - || after(ntohl(th->seq), - conntrack->proto.tcp.seen[dir].td_end)) { - /* Attempt to reopen a closed connection. - * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_REPEAT; - } else { - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - NULL, "ip_ct_tcp: invalid SYN"); - return -NF_ACCEPT; - } - case TCP_CONNTRACK_CLOSE: - if (index == TCP_RST_SET - && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET) - || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_ACK_SET)) - && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN or ACK we had let through - * at a) and c) above: - * - * a) SYN was in window then - * c) we hold a half-open connection. - * - * Delete our connection entry. - * We skip window checking, because packet might ACK - * segments we ignored. */ - goto in_window; - } - /* Just fall through */ - default: - /* Keep compilers happy. */ - break; - } - - if (!tcp_in_window(&conntrack->proto.tcp, dir, index, - skb, iph, th)) { - write_unlock_bh(&tcp_lock); - return -NF_ACCEPT; - } - in_window: - /* From now on we have got in-window packets */ - conntrack->proto.tcp.last_index = index; - - DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest), - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); - - conntrack->proto.tcp.state = new_state; - if (old_state != new_state - && (new_state == TCP_CONNTRACK_FIN_WAIT - || new_state == TCP_CONNTRACK_CLOSE)) - conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans - ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); - - ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - if (new_state != old_state) - ip_conntrack_event_cache(IPCT_PROTOINFO, skb); - - if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - /* If only reply is a RST, we can consider ourselves not to - have an established connection: this is a fairly common - problem case, so we can delete the conntrack - immediately. --RR */ - if (th->rst) { - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return NF_ACCEPT; - } - } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) - && (old_state == TCP_CONNTRACK_SYN_RECV - || old_state == TCP_CONNTRACK_ESTABLISHED) - && new_state == TCP_CONNTRACK_ESTABLISHED) { - /* Set ASSURED if we see see valid ack in ESTABLISHED - after SYN_RECV or a valid answer for a picked up - connection. */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); - ip_conntrack_event_cache(IPCT_STATUS, skb); - } - ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int tcp_new(struct ip_conntrack *conntrack, - const struct sk_buff *skb) -{ - enum tcp_conntrack new_state; - struct iphdr *iph = skb->nh.iph; - struct tcphdr *th, _tcph; -#ifdef DEBUGP_VARS - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; -#endif - - th = skb_header_pointer(skb, iph->ihl * 4, - sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); - - /* Don't need lock here: this conntrack not in circulation yet */ - new_state - = tcp_conntracks[0][get_conntrack_index(th)] - [TCP_CONNTRACK_NONE]; - - /* Invalid: delete conntrack */ - if (new_state >= TCP_CONNTRACK_MAX) { - DEBUGP("ip_ct_tcp: invalid new deleting.\n"); - return 0; - } - - if (new_state == TCP_CONNTRACK_SYN_SENT) { - /* SYN packet */ - conntrack->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - iph, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end; - - tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); - conntrack->proto.tcp.seen[1].flags = 0; - } else if (ip_ct_tcp_loose == 0) { - /* Don't try to pick up connections. */ - return 0; - } else { - /* - * We are in the middle of a connection, - * its history is lost for us. - * Let's try to use the data from the packet. - */ - conntrack->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - iph, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end + - conntrack->proto.tcp.seen[0].td_maxwin; - conntrack->proto.tcp.seen[0].td_scale = 0; - - /* We assume SACK and liberal window checking to handle - * window scaling */ - conntrack->proto.tcp.seen[0].flags = - conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | - IP_CT_TCP_FLAG_BE_LIBERAL; - } - - conntrack->proto.tcp.seen[1].td_end = 0; - conntrack->proto.tcp.seen[1].td_maxend = 0; - conntrack->proto.tcp.seen[1].td_maxwin = 1; - conntrack->proto.tcp.seen[1].td_scale = 0; - - /* tcp_packet will set them */ - conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; - conntrack->proto.tcp.last_index = TCP_NONE_SET; - - DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - return 1; -} - -struct ip_conntrack_protocol ip_conntrack_protocol_tcp = -{ - .proto = IPPROTO_TCP, - .name = "tcp", - .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, - .print_tuple = tcp_print_tuple, - .print_conntrack = tcp_print_conntrack, - .packet = tcp_packet, - .new = tcp_new, - .error = tcp_error, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .to_nfattr = tcp_to_nfattr, - .from_nfattr = nfattr_to_tcp, - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c deleted file mode 100644 index 14c30c646c7..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ /dev/null @@ -1,148 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/seq_file.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -unsigned int ip_ct_udp_timeout __read_mostly = 30*HZ; -unsigned int ip_ct_udp_timeout_stream __read_mostly = 180*HZ; - -static int udp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct udphdr _hdr, *hp; - - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return 0; - - tuple->src.u.udp.port = hp->source; - tuple->dst.u.udp.port = hp->dest; - - return 1; -} - -static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->src.u.udp.port = orig->dst.u.udp.port; - tuple->dst.u.udp.port = orig->src.u.udp.port; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int udp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.udp.port), - ntohs(tuple->dst.u.udp.port)); -} - -/* Print out the private part of the conntrack. */ -static int udp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - return 0; -} - -/* Returns verdict for packet, and may modify conntracktype */ -static int udp_packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - /* If we've seen traffic both ways, this is some kind of UDP - stream. Extend timeout. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - ip_ct_refresh_acct(conntrack, ctinfo, skb, - ip_ct_udp_timeout_stream); - /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) - ip_conntrack_event_cache(IPCT_STATUS, skb); - } else - ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) -{ - return 1; -} - -static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct iphdr *iph = skb->nh.iph; - unsigned int udplen = skb->len - iph->ihl * 4; - struct udphdr _hdr, *hdr; - - /* Header is too small? */ - hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); - if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_udp: short packet "); - return -NF_ACCEPT; - } - - /* Truncated/malformed packets */ - if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_udp: truncated/malformed packet "); - return -NF_ACCEPT; - } - - /* Packet with no checksum */ - if (!hdr->check) - return NF_ACCEPT; - - /* Checksum invalid? Ignore. - * We skip checking packets on the outgoing path - * because the checksum is assumed to be correct. - * FIXME: Source route IP option packets --RR */ - if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_udp: bad UDP checksum "); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -struct ip_conntrack_protocol ip_conntrack_protocol_udp = -{ - .proto = IPPROTO_UDP, - .name = "udp", - .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, - .print_conntrack = udp_print_conntrack, - .packet = udp_packet, - .new = udp_new, - .error = udp_error, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c deleted file mode 100644 index c59a962c1f6..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_sip.c +++ /dev/null @@ -1,520 +0,0 @@ -/* SIP extension for IP connection tracking. - * - * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> - * based on RR's ip_conntrack_ftp.c and other modules. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/ctype.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_sip.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); -MODULE_DESCRIPTION("SIP connection tracking helper"); - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -module_param_array(ports, ushort, &ports_c, 0400); -MODULE_PARM_DESC(ports, "port numbers of sip servers"); - -static unsigned int sip_timeout = SIP_TIMEOUT; -module_param(sip_timeout, uint, 0600); -MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); - -unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char **dptr); -EXPORT_SYMBOL_GPL(ip_nat_sip_hook); - -unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp, - const char *dptr); -EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); - -static int digits_len(const char *dptr, const char *limit, int *shift); -static int epaddr_len(const char *dptr, const char *limit, int *shift); -static int skp_digits_len(const char *dptr, const char *limit, int *shift); -static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); - -struct sip_header_nfo { - const char *lname; - const char *sname; - const char *ln_str; - size_t lnlen; - size_t snlen; - size_t ln_strlen; - int case_sensitive; - int (*match_len)(const char *, const char *, int *); -}; - -static struct sip_header_nfo ct_sip_hdrs[] = { - [POS_REG_REQ_URI] = { /* SIP REGISTER request URI */ - .lname = "sip:", - .lnlen = sizeof("sip:") - 1, - .ln_str = ":", - .ln_strlen = sizeof(":") - 1, - .match_len = epaddr_len - }, - [POS_REQ_URI] = { /* SIP request URI */ - .lname = "sip:", - .lnlen = sizeof("sip:") - 1, - .ln_str = "@", - .ln_strlen = sizeof("@") - 1, - .match_len = epaddr_len - }, - [POS_FROM] = { /* SIP From header */ - .lname = "From:", - .lnlen = sizeof("From:") - 1, - .sname = "\r\nf:", - .snlen = sizeof("\r\nf:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len, - }, - [POS_TO] = { /* SIP To header */ - .lname = "To:", - .lnlen = sizeof("To:") - 1, - .sname = "\r\nt:", - .snlen = sizeof("\r\nt:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len, - }, - [POS_VIA] = { /* SIP Via header */ - .lname = "Via:", - .lnlen = sizeof("Via:") - 1, - .sname = "\r\nv:", - .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ - .ln_str = "UDP ", - .ln_strlen = sizeof("UDP ") - 1, - .match_len = epaddr_len, - }, - [POS_CONTACT] = { /* SIP Contact header */ - .lname = "Contact:", - .lnlen = sizeof("Contact:") - 1, - .sname = "\r\nm:", - .snlen = sizeof("\r\nm:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len - }, - [POS_CONTENT] = { /* SIP Content length header */ - .lname = "Content-Length:", - .lnlen = sizeof("Content-Length:") - 1, - .sname = "\r\nl:", - .snlen = sizeof("\r\nl:") - 1, - .ln_str = ":", - .ln_strlen = sizeof(":") - 1, - .match_len = skp_digits_len - }, - [POS_MEDIA] = { /* SDP media info */ - .case_sensitive = 1, - .lname = "\nm=", - .lnlen = sizeof("\nm=") - 1, - .sname = "\rm=", - .snlen = sizeof("\rm=") - 1, - .ln_str = "audio ", - .ln_strlen = sizeof("audio ") - 1, - .match_len = digits_len - }, - [POS_OWNER] = { /* SDP owner address*/ - .case_sensitive = 1, - .lname = "\no=", - .lnlen = sizeof("\no=") - 1, - .sname = "\ro=", - .snlen = sizeof("\ro=") - 1, - .ln_str = "IN IP4 ", - .ln_strlen = sizeof("IN IP4 ") - 1, - .match_len = epaddr_len - }, - [POS_CONNECTION] = { /* SDP connection info */ - .case_sensitive = 1, - .lname = "\nc=", - .lnlen = sizeof("\nc=") - 1, - .sname = "\rc=", - .snlen = sizeof("\rc=") - 1, - .ln_str = "IN IP4 ", - .ln_strlen = sizeof("IN IP4 ") - 1, - .match_len = epaddr_len - }, - [POS_SDP_HEADER] = { /* SDP version header */ - .case_sensitive = 1, - .lname = "\nv=", - .lnlen = sizeof("\nv=") - 1, - .sname = "\rv=", - .snlen = sizeof("\rv=") - 1, - .ln_str = "=", - .ln_strlen = sizeof("=") - 1, - .match_len = digits_len - } -}; - -/* get line lenght until first CR or LF seen. */ -int ct_sip_lnlen(const char *line, const char *limit) -{ - const char *k = line; - - while ((line <= limit) && (*line == '\r' || *line == '\n')) - line++; - - while (line <= limit) { - if (*line == '\r' || *line == '\n') - break; - line++; - } - return line - k; -} -EXPORT_SYMBOL_GPL(ct_sip_lnlen); - -/* Linear string search, case sensitive. */ -const char *ct_sip_search(const char *needle, const char *haystack, - size_t needle_len, size_t haystack_len, - int case_sensitive) -{ - const char *limit = haystack + (haystack_len - needle_len); - - while (haystack <= limit) { - if (case_sensitive) { - if (strncmp(haystack, needle, needle_len) == 0) - return haystack; - } else { - if (strnicmp(haystack, needle, needle_len) == 0) - return haystack; - } - haystack++; - } - return NULL; -} -EXPORT_SYMBOL_GPL(ct_sip_search); - -static int digits_len(const char *dptr, const char *limit, int *shift) -{ - int len = 0; - while (dptr <= limit && isdigit(*dptr)) { - dptr++; - len++; - } - return len; -} - -/* get digits lenght, skiping blank spaces. */ -static int skp_digits_len(const char *dptr, const char *limit, int *shift) -{ - for (; dptr <= limit && *dptr == ' '; dptr++) - (*shift)++; - - return digits_len(dptr, limit, shift); -} - -/* Simple ipaddr parser.. */ -static int parse_ipaddr(const char *cp, const char **endp, - __be32 *ipaddr, const char *limit) -{ - unsigned long int val; - int i, digit = 0; - - for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { - digit = 0; - if (!isdigit(*cp)) - break; - - val = simple_strtoul(cp, (char **)&cp, 10); - if (val > 0xFF) - return -1; - - ((u_int8_t *)ipaddr)[i] = val; - digit = 1; - - if (*cp != '.') - break; - cp++; - } - if (!digit) - return -1; - - if (endp) - *endp = cp; - - return 0; -} - -/* skip ip address. returns it lenght. */ -static int epaddr_len(const char *dptr, const char *limit, int *shift) -{ - const char *aux = dptr; - __be32 ip; - - if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { - DEBUGP("ip: %s parse failed.!\n", dptr); - return 0; - } - - /* Port number */ - if (*dptr == ':') { - dptr++; - dptr += digits_len(dptr, limit, shift); - } - return dptr - aux; -} - -/* get address length, skiping user info. */ -static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) -{ - int s = *shift; - - /* Search for @, but stop at the end of the line. - * We are inside a sip: URI, so we don't need to worry about - * continuation lines. */ - while (dptr <= limit && - *dptr != '@' && *dptr != '\r' && *dptr != '\n') { - (*shift)++; - dptr++; - } - - if (dptr <= limit && *dptr == '@') { - dptr++; - (*shift)++; - } else - *shift = s; - - return epaddr_len(dptr, limit, shift); -} - -/* Returns 0 if not found, -1 error parsing. */ -int ct_sip_get_info(const char *dptr, size_t dlen, - unsigned int *matchoff, - unsigned int *matchlen, - enum sip_header_pos pos) -{ - struct sip_header_nfo *hnfo = &ct_sip_hdrs[pos]; - const char *limit, *aux, *k = dptr; - int shift = 0; - - limit = dptr + (dlen - hnfo->lnlen); - - while (dptr <= limit) { - if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && - (hnfo->sname == NULL || - strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { - dptr++; - continue; - } - aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, - ct_sip_lnlen(dptr, limit), - hnfo->case_sensitive); - if (!aux) { - DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, - hnfo->lname); - return -1; - } - aux += hnfo->ln_strlen; - - *matchlen = hnfo->match_len(aux, limit, &shift); - if (!*matchlen) - return -1; - - *matchoff = (aux - k) + shift; - - DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, - *matchlen); - return 1; - } - DEBUGP("%s header not found.\n", hnfo->lname); - return 0; -} -EXPORT_SYMBOL_GPL(ct_sip_get_info); - -static int set_expected_rtp(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - __be32 ipaddr, u_int16_t port, - const char *dptr) -{ - struct ip_conntrack_expect *exp; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - int ret; - typeof(ip_nat_sdp_hook) ip_nat_sdp; - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) - return NF_DROP; - - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.udp.port = 0; - exp->tuple.dst.ip = ipaddr; - exp->tuple.dst.u.udp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_UDP; - - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.udp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.udp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - - exp->expectfn = NULL; - exp->flags = 0; - - ip_nat_sdp = rcu_dereference(ip_nat_sdp_hook); - if (ip_nat_sdp) - ret = ip_nat_sdp(pskb, ctinfo, exp, dptr); - else { - if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - else - ret = NF_ACCEPT; - } - ip_conntrack_expect_put(exp); - - return ret; -} - -static int sip_help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dataoff, datalen; - const char *dptr; - int ret = NF_ACCEPT; - int matchoff, matchlen; - __be32 ipaddr; - u_int16_t port; - typeof(ip_nat_sip_hook) ip_nat_sip; - - /* No Data ? */ - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - if (dataoff >= (*pskb)->len) { - DEBUGP("skb->len = %u\n", (*pskb)->len); - return NF_ACCEPT; - } - - ip_ct_refresh(ct, *pskb, sip_timeout * HZ); - - if (!skb_is_nonlinear(*pskb)) - dptr = (*pskb)->data + dataoff; - else { - DEBUGP("Copy of skbuff not supported yet.\n"); - goto out; - } - - ip_nat_sip = rcu_dereference(ip_nat_sip_hook); - if (ip_nat_sip) { - if (!ip_nat_sip(pskb, ctinfo, ct, &dptr)) { - ret = NF_DROP; - goto out; - } - } - - /* After this point NAT, could have mangled skb, so - we need to recalculate payload lenght. */ - datalen = (*pskb)->len - dataoff; - - if (datalen < (sizeof("SIP/2.0 200") - 1)) - goto out; - - /* RTP info only in some SDP pkts */ - if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && - memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { - goto out; - } - /* Get ip and port address from SDP packet. */ - if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, - POS_CONNECTION) > 0) { - - /* We'll drop only if there are parse problems. */ - if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, - dptr + datalen) < 0) { - ret = NF_DROP; - goto out; - } - if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, - POS_MEDIA) > 0) { - - port = simple_strtoul(dptr + matchoff, NULL, 10); - if (port < 1024) { - ret = NF_DROP; - goto out; - } - ret = set_expected_rtp(pskb, ct, ctinfo, - ipaddr, port, dptr); - } - } -out: - return ret; -} - -static struct ip_conntrack_helper sip[MAX_PORTS]; -static char sip_names[MAX_PORTS][10]; - -static void fini(void) -{ - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("unregistering helper for port %d\n", ports[i]); - ip_conntrack_helper_unregister(&sip[i]); - } -} - -static int __init init(void) -{ - int i, ret; - char *tmpname; - - if (ports_c == 0) - ports[ports_c++] = SIP_PORT; - - for (i = 0; i < ports_c; i++) { - /* Create helper structure */ - memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); - - sip[i].tuple.dst.protonum = IPPROTO_UDP; - sip[i].tuple.src.u.udp.port = htons(ports[i]); - sip[i].mask.src.u.udp.port = htons(0xFFFF); - sip[i].mask.dst.protonum = 0xFF; - sip[i].max_expected = 2; - sip[i].timeout = 3 * 60; /* 3 minutes */ - sip[i].me = THIS_MODULE; - sip[i].help = sip_help; - - tmpname = &sip_names[i][0]; - if (ports[i] == SIP_PORT) - sprintf(tmpname, "sip"); - else - sprintf(tmpname, "sip-%d", i); - sip[i].name = tmpname; - - DEBUGP("port #%d: %d\n", i, ports[i]); - - ret = ip_conntrack_helper_register(&sip[i]); - if (ret) { - printk("ERROR registering helper for port %d\n", - ports[i]); - fini(); - return ret; - } - } - return 0; -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c deleted file mode 100644 index 56b2f7546d1..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ /dev/null @@ -1,962 +0,0 @@ -/* This file contains all the functions required for the standalone - ip_conntrack module. - - These are not required by the compatibility layer. -*/ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/percpu.h> -#ifdef CONFIG_SYSCTL -#include <linux/sysctl.h> -#endif -#include <net/checksum.h> -#include <net/ip.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -MODULE_LICENSE("GPL"); - -extern atomic_t ip_conntrack_count; -DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); - -static int kill_proto(struct ip_conntrack *i, void *data) -{ - return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == - *((u_int8_t *) data)); -} - -#ifdef CONFIG_PROC_FS -static int -print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *proto) -{ - seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", - NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip)); - return proto->print_tuple(s, tuple); -} - -#ifdef CONFIG_IP_NF_CT_ACCT -static unsigned int -seq_print_counters(struct seq_file *s, - const struct ip_conntrack_counter *counter) -{ - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)counter->packets, - (unsigned long long)counter->bytes); -} -#else -#define seq_print_counters(x, y) 0 -#endif - -struct ct_iter_state { - unsigned int bucket; -}; - -static struct list_head *ct_get_first(struct seq_file *seq) -{ - struct ct_iter_state *st = seq->private; - - for (st->bucket = 0; - st->bucket < ip_conntrack_htable_size; - st->bucket++) { - if (!list_empty(&ip_conntrack_hash[st->bucket])) - return ip_conntrack_hash[st->bucket].next; - } - return NULL; -} - -static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) -{ - struct ct_iter_state *st = seq->private; - - head = head->next; - while (head == &ip_conntrack_hash[st->bucket]) { - if (++st->bucket >= ip_conntrack_htable_size) - return NULL; - head = ip_conntrack_hash[st->bucket].next; - } - return head; -} - -static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct list_head *head = ct_get_first(seq); - - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *ct_seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock_bh(&ip_conntrack_lock); - return ct_get_idx(seq, *pos); -} - -static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - return ct_get_next(s, v); -} - -static void ct_seq_stop(struct seq_file *s, void *v) -{ - read_unlock_bh(&ip_conntrack_lock); -} - -static int ct_seq_show(struct seq_file *s, void *v) -{ - const struct ip_conntrack_tuple_hash *hash = v; - const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); - struct ip_conntrack_protocol *proto; - - IP_NF_ASSERT(conntrack); - - /* we only want to print DIR_ORIGINAL */ - if (DIRECTION(hash)) - return 0; - - proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - IP_NF_ASSERT(proto); - - if (seq_printf(s, "%-8s %u %ld ", - proto->name, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, - timer_pending(&conntrack->timeout) - ? (long)(conntrack->timeout.expires - jiffies)/HZ - : 0) != 0) - return -ENOSPC; - - if (proto->print_conntrack(s, conntrack)) - return -ENOSPC; - - if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - proto)) - return -ENOSPC; - - if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) - return -ENOSPC; - - if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) - if (seq_printf(s, "[UNREPLIED] ")) - return -ENOSPC; - - if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, - proto)) - return -ENOSPC; - - if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) - return -ENOSPC; - - if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) - if (seq_printf(s, "[ASSURED] ")) - return -ENOSPC; - -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%u ", conntrack->mark)) - return -ENOSPC; -#endif - -#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", conntrack->secmark)) - return -ENOSPC; -#endif - - if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) - return -ENOSPC; - - return 0; -} - -static struct seq_operations ct_seq_ops = { - .start = ct_seq_start, - .next = ct_seq_next, - .stop = ct_seq_stop, - .show = ct_seq_show -}; - -static int ct_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - struct ct_iter_state *st; - int ret; - - st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); - if (st == NULL) - return -ENOMEM; - ret = seq_open(file, &ct_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; - seq->private = st; - memset(st, 0, sizeof(struct ct_iter_state)); - return ret; -out_free: - kfree(st); - return ret; -} - -static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, - .open = ct_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -/* expects */ -static void *exp_seq_start(struct seq_file *s, loff_t *pos) -{ - struct list_head *e = &ip_conntrack_expect_list; - loff_t i; - - /* strange seq_file api calls stop even if we fail, - * thus we need to grab lock since stop unlocks */ - read_lock_bh(&ip_conntrack_lock); - - if (list_empty(e)) - return NULL; - - for (i = 0; i <= *pos; i++) { - e = e->next; - if (e == &ip_conntrack_expect_list) - return NULL; - } - return e; -} - -static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct list_head *e = v; - - ++*pos; - e = e->next; - - if (e == &ip_conntrack_expect_list) - return NULL; - - return e; -} - -static void exp_seq_stop(struct seq_file *s, void *v) -{ - read_unlock_bh(&ip_conntrack_lock); -} - -static int exp_seq_show(struct seq_file *s, void *v) -{ - struct ip_conntrack_expect *expect = v; - - if (expect->timeout.function) - seq_printf(s, "%ld ", timer_pending(&expect->timeout) - ? (long)(expect->timeout.expires - jiffies)/HZ : 0); - else - seq_printf(s, "- "); - - seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); - - print_tuple(s, &expect->tuple, - __ip_conntrack_proto_find(expect->tuple.dst.protonum)); - return seq_putc(s, '\n'); -} - -static struct seq_operations exp_seq_ops = { - .start = exp_seq_start, - .next = exp_seq_next, - .stop = exp_seq_stop, - .show = exp_seq_show -}; - -static int exp_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &exp_seq_ops); -} - -static const struct file_operations exp_file_ops = { - .owner = THIS_MODULE, - .open = exp_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release -}; - -static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - int cpu; - - if (*pos == 0) - return SEQ_START_TOKEN; - - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return &per_cpu(ip_conntrack_stat, cpu); - } - - return NULL; -} - -static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - int cpu; - - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return &per_cpu(ip_conntrack_stat, cpu); - } - - return NULL; -} - -static void ct_cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int ct_cpu_seq_show(struct seq_file *seq, void *v) -{ - unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); - struct ip_conntrack_stat *st = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); - return 0; - } - - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " - "%08x %08x %08x %08x %08x %08x %08x %08x \n", - nr_conntracks, - st->searched, - st->found, - st->new, - st->invalid, - st->ignore, - st->delete, - st->delete_list, - st->insert, - st->insert_failed, - st->drop, - st->early_drop, - st->error, - - st->expect_new, - st->expect_create, - st->expect_delete - ); - return 0; -} - -static struct seq_operations ct_cpu_seq_ops = { - .start = ct_cpu_seq_start, - .next = ct_cpu_seq_next, - .stop = ct_cpu_seq_stop, - .show = ct_cpu_seq_show, -}; - -static int ct_cpu_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ct_cpu_seq_ops); -} - -static const struct file_operations ct_cpu_seq_fops = { - .owner = THIS_MODULE, - .open = ct_cpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -#endif - -static unsigned int ip_confirm(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* We've seen it coming out the other side: confirm it */ - return ip_conntrack_confirm(pskb); -} - -static unsigned int ip_conntrack_help(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - /* This is where we call the helper: as the packet goes out. */ - ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { - unsigned int ret; - ret = ct->helper->help(pskb, ct, ctinfo); - if (ret != NF_ACCEPT) - return ret; - } - return NF_ACCEPT; -} - -static unsigned int ip_conntrack_defrag(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) - /* Previously seen (loopback)? Ignore. Do this before - fragment check. */ - if ((*pskb)->nfct) - return NF_ACCEPT; -#endif - - /* Gather fragments. */ - if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { - *pskb = ip_ct_gather_frags(*pskb, - hooknum == NF_IP_PRE_ROUTING ? - IP_DEFRAG_CONNTRACK_IN : - IP_DEFRAG_CONNTRACK_OUT); - if (!*pskb) - return NF_STOLEN; - } - return NF_ACCEPT; -} - -static unsigned int ip_conntrack_local(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* root is playing with raw sockets. */ - if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { - if (net_ratelimit()) - printk("ipt_hook: happy cracking.\n"); - return NF_ACCEPT; - } - return ip_conntrack_in(hooknum, pskb, in, out, okfn); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static struct nf_hook_ops ip_conntrack_ops[] = { - { - .hook = ip_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, - { - .hook = ip_conntrack_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ip_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, - { - .hook = ip_conntrack_local, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ip_conntrack_help, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ip_conntrack_help, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ip_confirm, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ip_confirm, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Sysctl support */ - -int ip_conntrack_checksum __read_mostly = 1; - -#ifdef CONFIG_SYSCTL - -/* From ip_conntrack_core.c */ -extern int ip_conntrack_max; -extern unsigned int ip_conntrack_htable_size; - -/* From ip_conntrack_proto_tcp.c */ -extern unsigned int ip_ct_tcp_timeout_syn_sent; -extern unsigned int ip_ct_tcp_timeout_syn_recv; -extern unsigned int ip_ct_tcp_timeout_established; -extern unsigned int ip_ct_tcp_timeout_fin_wait; -extern unsigned int ip_ct_tcp_timeout_close_wait; -extern unsigned int ip_ct_tcp_timeout_last_ack; -extern unsigned int ip_ct_tcp_timeout_time_wait; -extern unsigned int ip_ct_tcp_timeout_close; -extern unsigned int ip_ct_tcp_timeout_max_retrans; -extern int ip_ct_tcp_loose; -extern int ip_ct_tcp_be_liberal; -extern int ip_ct_tcp_max_retrans; - -/* From ip_conntrack_proto_udp.c */ -extern unsigned int ip_ct_udp_timeout; -extern unsigned int ip_ct_udp_timeout_stream; - -/* From ip_conntrack_proto_icmp.c */ -extern unsigned int ip_ct_icmp_timeout; - -/* From ip_conntrack_proto_generic.c */ -extern unsigned int ip_ct_generic_timeout; - -/* Log invalid packets of a given protocol */ -static int log_invalid_proto_min = 0; -static int log_invalid_proto_max = 255; - -static struct ctl_table_header *ip_ct_sysctl_header; - -static ctl_table ip_ct_sysctl_table[] = { - { - .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, - .procname = "ip_conntrack_max", - .data = &ip_conntrack_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, - .procname = "ip_conntrack_count", - .data = &ip_conntrack_count, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, - .procname = "ip_conntrack_buckets", - .data = &ip_conntrack_htable_size, - .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, - .procname = "ip_conntrack_checksum", - .data = &ip_conntrack_checksum, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, - .procname = "ip_conntrack_tcp_timeout_syn_sent", - .data = &ip_ct_tcp_timeout_syn_sent, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, - .procname = "ip_conntrack_tcp_timeout_syn_recv", - .data = &ip_ct_tcp_timeout_syn_recv, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, - .procname = "ip_conntrack_tcp_timeout_established", - .data = &ip_ct_tcp_timeout_established, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, - .procname = "ip_conntrack_tcp_timeout_fin_wait", - .data = &ip_ct_tcp_timeout_fin_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, - .procname = "ip_conntrack_tcp_timeout_close_wait", - .data = &ip_ct_tcp_timeout_close_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, - .procname = "ip_conntrack_tcp_timeout_last_ack", - .data = &ip_ct_tcp_timeout_last_ack, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, - .procname = "ip_conntrack_tcp_timeout_time_wait", - .data = &ip_ct_tcp_timeout_time_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, - .procname = "ip_conntrack_tcp_timeout_close", - .data = &ip_ct_tcp_timeout_close, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, - .procname = "ip_conntrack_udp_timeout", - .data = &ip_ct_udp_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, - .procname = "ip_conntrack_udp_timeout_stream", - .data = &ip_ct_udp_timeout_stream, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, - .procname = "ip_conntrack_icmp_timeout", - .data = &ip_ct_icmp_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, - .procname = "ip_conntrack_generic_timeout", - .data = &ip_ct_generic_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, - .procname = "ip_conntrack_log_invalid", - .data = &ip_ct_log_invalid, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &log_invalid_proto_min, - .extra2 = &log_invalid_proto_max, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, - .procname = "ip_conntrack_tcp_timeout_max_retrans", - .data = &ip_ct_tcp_timeout_max_retrans, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, - .procname = "ip_conntrack_tcp_loose", - .data = &ip_ct_tcp_loose, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, - .procname = "ip_conntrack_tcp_be_liberal", - .data = &ip_ct_tcp_be_liberal, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, - .procname = "ip_conntrack_tcp_max_retrans", - .data = &ip_ct_tcp_max_retrans, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -#define NET_IP_CONNTRACK_MAX 2089 - -static ctl_table ip_ct_netfilter_table[] = { - { - .ctl_name = NET_IPV4_NETFILTER, - .procname = "netfilter", - .mode = 0555, - .child = ip_ct_sysctl_table, - }, - { - .ctl_name = NET_IP_CONNTRACK_MAX, - .procname = "ip_conntrack_max", - .data = &ip_conntrack_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ip_ct_netfilter_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ip_ct_ipv4_table, - }, - { .ctl_name = 0 } -}; - -EXPORT_SYMBOL(ip_ct_log_invalid); -#endif /* CONFIG_SYSCTL */ - -/* FIXME: Allow NULL functions and sub in pointers to generic for - them. --RR */ -int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) -{ - int ret = 0; - - write_lock_bh(&ip_conntrack_lock); - if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { - ret = -EBUSY; - goto out; - } - rcu_assign_pointer(ip_ct_protos[proto->proto], proto); - out: - write_unlock_bh(&ip_conntrack_lock); - return ret; -} - -void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) -{ - write_lock_bh(&ip_conntrack_lock); - rcu_assign_pointer(ip_ct_protos[proto->proto], - &ip_conntrack_generic_protocol); - write_unlock_bh(&ip_conntrack_lock); - synchronize_rcu(); - - /* Remove all contrack entries for this protocol */ - ip_ct_iterate_cleanup(kill_proto, &proto->proto); -} - -static int __init ip_conntrack_standalone_init(void) -{ -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc, *proc_exp, *proc_stat; -#endif - int ret = 0; - - ret = ip_conntrack_init(); - if (ret < 0) - return ret; - -#ifdef CONFIG_PROC_FS - ret = -ENOMEM; - proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); - if (!proc) goto cleanup_init; - - proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, - &exp_file_ops); - if (!proc_exp) goto cleanup_proc; - - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); - if (!proc_stat) - goto cleanup_proc_exp; - - proc_stat->proc_fops = &ct_cpu_seq_fops; - proc_stat->owner = THIS_MODULE; -#endif - - ret = nf_register_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); - if (ret < 0) { - printk("ip_conntrack: can't register hooks.\n"); - goto cleanup_proc_stat; - } -#ifdef CONFIG_SYSCTL - ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table); - if (ip_ct_sysctl_header == NULL) { - printk("ip_conntrack: can't register to sysctl.\n"); - ret = -ENOMEM; - goto cleanup_hooks; - } -#endif - return ret; - -#ifdef CONFIG_SYSCTL - cleanup_hooks: - nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); -#endif - cleanup_proc_stat: -#ifdef CONFIG_PROC_FS - remove_proc_entry("ip_conntrack", proc_net_stat); - cleanup_proc_exp: - proc_net_remove("ip_conntrack_expect"); - cleanup_proc: - proc_net_remove("ip_conntrack"); - cleanup_init: -#endif /* CONFIG_PROC_FS */ - ip_conntrack_cleanup(); - return ret; -} - -static void __exit ip_conntrack_standalone_fini(void) -{ - synchronize_net(); -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ip_ct_sysctl_header); -#endif - nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); -#ifdef CONFIG_PROC_FS - remove_proc_entry("ip_conntrack", proc_net_stat); - proc_net_remove("ip_conntrack_expect"); - proc_net_remove("ip_conntrack"); -#endif /* CONFIG_PROC_FS */ - ip_conntrack_cleanup(); -} - -module_init(ip_conntrack_standalone_init); -module_exit(ip_conntrack_standalone_fini); - -/* Some modules need us, but don't depend directly on any symbol. - They should call this. */ -void need_conntrack(void) -{ -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -EXPORT_SYMBOL_GPL(ip_conntrack_chain); -EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); -EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); -EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); -EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); -EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); -#endif -EXPORT_SYMBOL(ip_conntrack_protocol_register); -EXPORT_SYMBOL(ip_conntrack_protocol_unregister); -EXPORT_SYMBOL(ip_ct_get_tuple); -EXPORT_SYMBOL(invert_tuplepr); -EXPORT_SYMBOL(ip_conntrack_alter_reply); -EXPORT_SYMBOL(ip_conntrack_destroyed); -EXPORT_SYMBOL(need_conntrack); -EXPORT_SYMBOL(ip_conntrack_helper_register); -EXPORT_SYMBOL(ip_conntrack_helper_unregister); -EXPORT_SYMBOL(ip_ct_iterate_cleanup); -EXPORT_SYMBOL(__ip_ct_refresh_acct); - -EXPORT_SYMBOL(ip_conntrack_expect_alloc); -EXPORT_SYMBOL(ip_conntrack_expect_put); -EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); -EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); -EXPORT_SYMBOL(ip_conntrack_expect_related); -EXPORT_SYMBOL(ip_conntrack_unexpect_related); -EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); -EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); - -EXPORT_SYMBOL(ip_conntrack_tuple_taken); -EXPORT_SYMBOL(ip_ct_gather_frags); -EXPORT_SYMBOL(ip_conntrack_htable_size); -EXPORT_SYMBOL(ip_conntrack_lock); -EXPORT_SYMBOL(ip_conntrack_hash); -EXPORT_SYMBOL(ip_conntrack_untracked); -EXPORT_SYMBOL_GPL(ip_conntrack_find_get); -#ifdef CONFIG_IP_NF_NAT_NEEDED -EXPORT_SYMBOL(ip_conntrack_tcp_update); -#endif - -EXPORT_SYMBOL_GPL(ip_conntrack_flush); -EXPORT_SYMBOL_GPL(__ip_conntrack_find); - -EXPORT_SYMBOL_GPL(ip_conntrack_alloc); -EXPORT_SYMBOL_GPL(ip_conntrack_free); -EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); - -EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); - -EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); -EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); - -EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); -EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); -EXPORT_SYMBOL_GPL(ip_conntrack_checksum); -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); -EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); -#endif diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c deleted file mode 100644 index 76e175e7a97..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ /dev/null @@ -1,161 +0,0 @@ -/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Version: 0.0.7 - * - * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> - * - port to newnat API - * - */ - -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> -#include <linux/moduleparam.h> - -MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); -MODULE_DESCRIPTION("tftp connection tracking helper"); -MODULE_LICENSE("GPL"); - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -module_param_array(ports, ushort, &ports_c, 0400); -MODULE_PARM_DESC(ports, "port numbers of tftp servers"); - -#if 0 -#define DEBUGP(format, args...) printk("%s:%s:" format, \ - __FILE__, __FUNCTION__ , ## args) -#else -#define DEBUGP(format, args...) -#endif - -unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp); -EXPORT_SYMBOL_GPL(ip_nat_tftp_hook); - -static int tftp_help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct tftphdr _tftph, *tfh; - struct ip_conntrack_expect *exp; - unsigned int ret = NF_ACCEPT; - typeof(ip_nat_tftp_hook) ip_nat_tftp; - - tfh = skb_header_pointer(*pskb, - (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), - sizeof(_tftph), &_tftph); - if (tfh == NULL) - return NF_ACCEPT; - - switch (ntohs(tfh->opcode)) { - /* RRQ and WRQ works the same way */ - case TFTP_OPCODE_READ: - case TFTP_OPCODE_WRITE: - DEBUGP(""); - DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) - return NF_DROP; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->mask.src.ip = htonl(0xffffffff); - exp->mask.src.u.udp.port = 0; - exp->mask.dst.ip = htonl(0xffffffff); - exp->mask.dst.u.udp.port = htons(0xffff); - exp->mask.dst.protonum = 0xff; - exp->expectfn = NULL; - exp->flags = 0; - - DEBUGP("expect: "); - DUMP_TUPLE(&exp->tuple); - DUMP_TUPLE(&exp->mask); - ip_nat_tftp = rcu_dereference(ip_nat_tftp_hook); - if (ip_nat_tftp) - ret = ip_nat_tftp(pskb, ctinfo, exp); - else if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - ip_conntrack_expect_put(exp); - break; - case TFTP_OPCODE_DATA: - case TFTP_OPCODE_ACK: - DEBUGP("Data/ACK opcode\n"); - break; - case TFTP_OPCODE_ERROR: - DEBUGP("Error opcode\n"); - break; - default: - DEBUGP("Unknown opcode\n"); - } - return NF_ACCEPT; -} - -static struct ip_conntrack_helper tftp[MAX_PORTS]; -static char tftp_names[MAX_PORTS][sizeof("tftp-65535")]; - -static void ip_conntrack_tftp_fini(void) -{ - int i; - - for (i = 0 ; i < ports_c; i++) { - DEBUGP("unregistering helper for port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&tftp[i]); - } -} - -static int __init ip_conntrack_tftp_init(void) -{ - int i, ret; - char *tmpname; - - if (ports_c == 0) - ports[ports_c++] = TFTP_PORT; - - for (i = 0; i < ports_c; i++) { - /* Create helper structure */ - memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper)); - - tftp[i].tuple.dst.protonum = IPPROTO_UDP; - tftp[i].tuple.src.u.udp.port = htons(ports[i]); - tftp[i].mask.dst.protonum = 0xFF; - tftp[i].mask.src.u.udp.port = htons(0xFFFF); - tftp[i].max_expected = 1; - tftp[i].timeout = 5 * 60; /* 5 minutes */ - tftp[i].me = THIS_MODULE; - tftp[i].help = tftp_help; - - tmpname = &tftp_names[i][0]; - if (ports[i] == TFTP_PORT) - sprintf(tmpname, "tftp"); - else - sprintf(tmpname, "tftp-%d", i); - tftp[i].name = tmpname; - - DEBUGP("port #%d: %d\n", i, ports[i]); - - ret=ip_conntrack_helper_register(&tftp[i]); - if (ret) { - printk("ERROR registering helper for port %d\n", - ports[i]); - ip_conntrack_tftp_fini(); - return(ret); - } - } - return(0); -} - -module_init(ip_conntrack_tftp_init); -module_exit(ip_conntrack_tftp_fini); diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c deleted file mode 100644 index 85df1a9aed3..00000000000 --- a/net/ipv4/netfilter/ip_nat_amanda.c +++ /dev/null @@ -1,85 +0,0 @@ -/* Amanda extension for TCP NAT alteration. - * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> - * based on a copy of HW's ip_nat_irc.c as well as other modules - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Module load syntax: - * insmod ip_nat_amanda.o - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/tcp.h> -#include <net/udp.h> - -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> - - -MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); -MODULE_DESCRIPTION("Amanda NAT helper"); -MODULE_LICENSE("GPL"); - -static unsigned int help(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp) -{ - char buffer[sizeof("65535")]; - u_int16_t port; - unsigned int ret; - - /* Connection comes from client. */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = IP_CT_DIR_ORIGINAL; - - /* When you see the packet, we need to NAT it the same as the - * this one (ie. same IP: it will be TCP and master is UDP). */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - exp->tuple.dst.u.tcp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - sprintf(buffer, "%u", port); - ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, - matchoff, matchlen, - buffer, strlen(buffer)); - if (ret != NF_ACCEPT) - ip_conntrack_unexpect_related(exp); - return ret; -} - -static void __exit ip_nat_amanda_fini(void) -{ - rcu_assign_pointer(ip_nat_amanda_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_amanda_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_amanda_hook)); - rcu_assign_pointer(ip_nat_amanda_hook, help); - return 0; -} - -module_init(ip_nat_amanda_init); -module_exit(ip_nat_amanda_fini); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c deleted file mode 100644 index 40737fdbe9a..00000000000 --- a/net/ipv4/netfilter/ip_nat_core.c +++ /dev/null @@ -1,634 +0,0 @@ -/* NAT for netfilter; shared with compatibility layer. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/netfilter_ipv4.h> -#include <linux/vmalloc.h> -#include <net/checksum.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> /* For tcp_prot in getorigdst */ -#include <linux/icmp.h> -#include <linux/udp.h> -#include <linux/jhash.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -DEFINE_RWLOCK(ip_nat_lock); - -/* Calculated at init based on memory size */ -static unsigned int ip_nat_htable_size; - -static struct list_head *bysource; - -#define MAX_IP_NAT_PROTO 256 -static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; - -static inline struct ip_nat_protocol * -__ip_nat_proto_find(u_int8_t protonum) -{ - return rcu_dereference(ip_nat_protos[protonum]); -} - -struct ip_nat_protocol * -ip_nat_proto_find_get(u_int8_t protonum) -{ - struct ip_nat_protocol *p; - - rcu_read_lock(); - p = __ip_nat_proto_find(protonum); - if (!try_module_get(p->me)) - p = &ip_nat_unknown_protocol; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); - -void -ip_nat_proto_put(struct ip_nat_protocol *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(ip_nat_proto_put); - -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct ip_conntrack_tuple *tuple) -{ - /* Original src, to ensure we map it consistently if poss. */ - return jhash_3words((__force u32)tuple->src.ip, tuple->src.u.all, - tuple->dst.protonum, 0) % ip_nat_htable_size; -} - -/* Noone using conntrack by the time this called. */ -static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) -{ - if (!(conn->status & IPS_NAT_DONE_MASK)) - return; - - write_lock_bh(&ip_nat_lock); - list_del(&conn->nat.info.bysource); - write_unlock_bh(&ip_nat_lock); -} - -/* Is this tuple already taken? (not by us) */ -int -ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - /* Conntrack tracking doesn't keep track of outgoing tuples; only - incoming ones. NAT means they don't have a fixed mapping, - so we invert the tuple and look for the incoming reply. - - We could keep a separate hash if this proves too slow. */ - struct ip_conntrack_tuple reply; - - invert_tuplepr(&reply, tuple); - return ip_conntrack_tuple_taken(&reply, ignored_conntrack); -} -EXPORT_SYMBOL(ip_nat_used_tuple); - -/* If we source map this tuple so reply looks like reply_tuple, will - * that meet the constraints of range. */ -static int -in_range(const struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range) -{ - struct ip_nat_protocol *proto; - int ret = 0; - - /* If we are supposed to map IPs, then we must be in the - range specified, otherwise let this drag us onto a new src IP. */ - if (range->flags & IP_NAT_RANGE_MAP_IPS) { - if (ntohl(tuple->src.ip) < ntohl(range->min_ip) - || ntohl(tuple->src.ip) > ntohl(range->max_ip)) - return 0; - } - - rcu_read_lock(); - proto = __ip_nat_proto_find(tuple->dst.protonum); - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) - || proto->in_range(tuple, IP_NAT_MANIP_SRC, - &range->min, &range->max)) - ret = 1; - rcu_read_unlock(); - - return ret; -} - -static inline int -same_src(const struct ip_conntrack *ct, - const struct ip_conntrack_tuple *tuple) -{ - return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum - == tuple->dst.protonum - && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip - == tuple->src.ip - && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all - == tuple->src.u.all); -} - -/* Only called for SRC manip */ -static int -find_appropriate_src(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_tuple *result, - const struct ip_nat_range *range) -{ - unsigned int h = hash_by_src(tuple); - struct ip_conntrack *ct; - - read_lock_bh(&ip_nat_lock); - list_for_each_entry(ct, &bysource[h], nat.info.bysource) { - if (same_src(ct, tuple)) { - /* Copy source part from reply tuple. */ - invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(result, range)) { - read_unlock_bh(&ip_nat_lock); - return 1; - } - } - } - read_unlock_bh(&ip_nat_lock); - return 0; -} - -/* For [FUTURE] fragmentation handling, we want the least-used - src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus - if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports - 1-65535, we don't do pro-rata allocation based on ports; we choose - the ip with the lowest src-ip/dst-ip/proto usage. -*/ -static void -find_best_ips_proto(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - const struct ip_conntrack *conntrack, - enum ip_nat_manip_type maniptype) -{ - __be32 *var_ipp; - /* Host order */ - u_int32_t minip, maxip, j; - - /* No IP mapping? Do nothing. */ - if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) - return; - - if (maniptype == IP_NAT_MANIP_SRC) - var_ipp = &tuple->src.ip; - else - var_ipp = &tuple->dst.ip; - - /* Fast path: only one choice. */ - if (range->min_ip == range->max_ip) { - *var_ipp = range->min_ip; - return; - } - - /* Hashing source and destination IPs gives a fairly even - * spread in practice (if there are a small number of IPs - * involved, there usually aren't that many connections - * anyway). The consistency means that servers see the same - * client coming from the same IP (some Internet Banking sites - * like this), even across reboots. */ - minip = ntohl(range->min_ip); - maxip = ntohl(range->max_ip); - j = jhash_2words((__force u32)tuple->src.ip, (__force u32)tuple->dst.ip, 0); - *var_ipp = htonl(minip + j % (maxip - minip + 1)); -} - -/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, - * we change the source to map into the range. For NF_IP_PRE_ROUTING - * and NF_IP_LOCAL_OUT, we change the destination to map into the - * range. It might not be possible to get a unique tuple, but we try. - * At worst (or if we race), we will end up with a final duplicate in - * __ip_conntrack_confirm and drop the packet. */ -static void -get_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig_tuple, - const struct ip_nat_range *range, - struct ip_conntrack *conntrack, - enum ip_nat_manip_type maniptype) -{ - struct ip_nat_protocol *proto; - - /* 1) If this srcip/proto/src-proto-part is currently mapped, - and that same mapping gives a unique tuple within the given - range, use that. - - This is only required for source (ie. NAT/masq) mappings. - So far, we don't do local source mappings, so multiple - manips not an issue. */ - if (maniptype == IP_NAT_MANIP_SRC) { - if (find_appropriate_src(orig_tuple, tuple, range)) { - DEBUGP("get_unique_tuple: Found current src map\n"); - if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) - if (!ip_nat_used_tuple(tuple, conntrack)) - return; - } - } - - /* 2) Select the least-used IP/proto combination in the given - range. */ - *tuple = *orig_tuple; - find_best_ips_proto(tuple, range, conntrack, maniptype); - - /* 3) The per-protocol part of the manip is made to map into - the range to make a unique tuple. */ - - rcu_read_lock(); - proto = __ip_nat_proto_find(orig_tuple->dst.protonum); - - /* Change protocol info to have some randomization */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) { - proto->unique_tuple(tuple, range, maniptype, conntrack); - goto out; - } - - /* Only bother mapping if it's not already in range and unique */ - if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) - || proto->in_range(tuple, maniptype, &range->min, &range->max)) - && !ip_nat_used_tuple(tuple, conntrack)) - goto out; - - /* Last change: get protocol to try to obtain unique tuple. */ - proto->unique_tuple(tuple, range, maniptype, conntrack); -out: - rcu_read_unlock(); -} - -unsigned int -ip_nat_setup_info(struct ip_conntrack *conntrack, - const struct ip_nat_range *range, - unsigned int hooknum) -{ - struct ip_conntrack_tuple curr_tuple, new_tuple; - struct ip_nat_info *info = &conntrack->nat.info; - int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); - enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); - - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING - || hooknum == NF_IP_LOCAL_IN - || hooknum == NF_IP_LOCAL_OUT); - BUG_ON(ip_nat_initialized(conntrack, maniptype)); - - /* What we've got will look like inverse of reply. Normally - this is what is in the conntrack, except for prior - manipulations (future optimization: if num_manips == 0, - orig_tp = - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ - invert_tuplepr(&curr_tuple, - &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); - - get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype); - - if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) { - struct ip_conntrack_tuple reply; - - /* Alter conntrack table so will recognize replies. */ - invert_tuplepr(&reply, &new_tuple); - ip_conntrack_alter_reply(conntrack, &reply); - - /* Non-atomic: we own this at the moment. */ - if (maniptype == IP_NAT_MANIP_SRC) - conntrack->status |= IPS_SRC_NAT; - else - conntrack->status |= IPS_DST_NAT; - } - - /* Place in source hash if this is the first time. */ - if (have_to_hash) { - unsigned int srchash - = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple); - write_lock_bh(&ip_nat_lock); - list_add(&info->bysource, &bysource[srchash]); - write_unlock_bh(&ip_nat_lock); - } - - /* It's done. */ - if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status); - else - set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status); - - return NF_ACCEPT; -} -EXPORT_SYMBOL(ip_nat_setup_info); - -/* Returns true if succeeded. */ -static int -manip_pkt(u_int16_t proto, - struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *target, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph; - struct ip_nat_protocol *p; - - if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) - return 0; - - iph = (void *)(*pskb)->data + iphdroff; - - /* Manipulate protcol part. */ - - /* rcu_read_lock()ed by nf_hook_slow */ - p = __ip_nat_proto_find(proto); - if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) - return 0; - - iph = (void *)(*pskb)->data + iphdroff; - - if (maniptype == IP_NAT_MANIP_SRC) { - nf_csum_replace4(&iph->check, iph->saddr, target->src.ip); - iph->saddr = target->src.ip; - } else { - nf_csum_replace4(&iph->check, iph->daddr, target->dst.ip); - iph->daddr = target->dst.ip; - } - return 1; -} - -/* Do packet manipulations according to ip_nat_setup_info. */ -unsigned int ip_nat_packet(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, - struct sk_buff **pskb) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned long statusbit; - enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum); - - if (mtype == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - /* Non-atomic: these bits don't change. */ - if (ct->status & statusbit) { - struct ip_conntrack_tuple target; - - /* We are aiming to look like inverse of other direction. */ - invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - - if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype)) - return NF_DROP; - } - return NF_ACCEPT; -} -EXPORT_SYMBOL_GPL(ip_nat_packet); - -/* Dir is direction ICMP is coming from (opposite to packet it contains) */ -int ip_nat_icmp_reply_translation(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, - struct sk_buff **pskb) -{ - struct { - struct icmphdr icmp; - struct iphdr ip; - } *inside; - struct ip_conntrack_protocol *proto; - struct ip_conntrack_tuple inner, target; - int hdrlen = (*pskb)->nh.iph->ihl * 4; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned long statusbit; - enum ip_nat_manip_type manip = HOOK2MANIP(hooknum); - - if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) - return 0; - - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; - - /* We're actually going to mangle it beyond trivial checksum - adjustment, so make sure the current checksum is correct. */ - if (nf_ip_checksum(*pskb, hooknum, hdrlen, 0)) - return 0; - - /* Must be RELATED */ - IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED || - (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); - - /* Redirects on non-null nats must be dropped, else they'll - start talking to each other without our translation, and be - confused... --RR */ - if (inside->icmp.type == ICMP_REDIRECT) { - /* If NAT isn't finished, assume it and drop. */ - if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) - return 0; - - if (ct->status & IPS_NAT_MASK) - return 0; - } - - DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n", - *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); - - /* rcu_read_lock()ed by nf_hook_slow */ - proto = __ip_conntrack_proto_find(inside->ip.protocol); - if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + - sizeof(struct icmphdr) + inside->ip.ihl*4, - &inner, proto)) - return 0; - - /* Change inner back to look like incoming packet. We do the - opposite manip on this hook to normal, because it might not - pass all hooks (locally-generated ICMP). Consider incoming - packet: PREROUTING (DST manip), routing produces ICMP, goes - through POSTROUTING (which must correct the DST manip). */ - if (!manip_pkt(inside->ip.protocol, pskb, - (*pskb)->nh.iph->ihl*4 - + sizeof(inside->icmp), - &ct->tuplehash[!dir].tuple, - !manip)) - return 0; - - if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - /* Reloading "inside" here since manip_pkt inner. */ - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; - inside->icmp.checksum = 0; - inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen, - (*pskb)->len - hdrlen, - 0)); - } - - /* Change outer to look the reply to an incoming packet - * (proto 0 means don't invert per-proto part). */ - if (manip == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, pskb, 0, &target, manip)) - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation); - -/* Protocol registration. */ -int ip_nat_protocol_register(struct ip_nat_protocol *proto) -{ - int ret = 0; - - write_lock_bh(&ip_nat_lock); - if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { - ret = -EBUSY; - goto out; - } - rcu_assign_pointer(ip_nat_protos[proto->protonum], proto); - out: - write_unlock_bh(&ip_nat_lock); - return ret; -} -EXPORT_SYMBOL(ip_nat_protocol_register); - -/* Noone stores the protocol anywhere; simply delete it. */ -void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) -{ - write_lock_bh(&ip_nat_lock); - rcu_assign_pointer(ip_nat_protos[proto->protonum], - &ip_nat_unknown_protocol); - write_unlock_bh(&ip_nat_lock); - synchronize_rcu(); -} -EXPORT_SYMBOL(ip_nat_protocol_unregister); - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -int -ip_nat_port_range_to_nfattr(struct sk_buff *skb, - const struct ip_nat_range *range) -{ - NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), - &range->min.tcp.port); - NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), - &range->max.tcp.port); - - return 0; - -nfattr_failure: - return -1; -} - -int -ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) -{ - int ret = 0; - - /* we have to return whether we actually parsed something or not */ - - if (tb[CTA_PROTONAT_PORT_MIN-1]) { - ret = 1; - range->min.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); - } - - if (!tb[CTA_PROTONAT_PORT_MAX-1]) { - if (ret) - range->max.tcp.port = range->min.tcp.port; - } else { - ret = 1; - range->max.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); - } - - return ret; -} -EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range); -EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr); -#endif - -static int __init ip_nat_init(void) -{ - size_t i; - - /* Leave them the same for the moment. */ - ip_nat_htable_size = ip_conntrack_htable_size; - - /* One vmalloc for both hash tables */ - bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); - if (!bysource) - return -ENOMEM; - - /* Sew in builtin protocols. */ - write_lock_bh(&ip_nat_lock); - for (i = 0; i < MAX_IP_NAT_PROTO; i++) - rcu_assign_pointer(ip_nat_protos[i], &ip_nat_unknown_protocol); - rcu_assign_pointer(ip_nat_protos[IPPROTO_TCP], &ip_nat_protocol_tcp); - rcu_assign_pointer(ip_nat_protos[IPPROTO_UDP], &ip_nat_protocol_udp); - rcu_assign_pointer(ip_nat_protos[IPPROTO_ICMP], &ip_nat_protocol_icmp); - write_unlock_bh(&ip_nat_lock); - - for (i = 0; i < ip_nat_htable_size; i++) { - INIT_LIST_HEAD(&bysource[i]); - } - - /* FIXME: Man, this is a hack. <SIGH> */ - IP_NF_ASSERT(rcu_dereference(ip_conntrack_destroyed) == NULL); - rcu_assign_pointer(ip_conntrack_destroyed, ip_nat_cleanup_conntrack); - - /* Initialize fake conntrack so that NAT will skip it */ - ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; - return 0; -} - -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int clean_nat(struct ip_conntrack *i, void *data) -{ - memset(&i->nat, 0, sizeof(i->nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); - return 0; -} - -static void __exit ip_nat_cleanup(void) -{ - ip_ct_iterate_cleanup(&clean_nat, NULL); - rcu_assign_pointer(ip_conntrack_destroyed, NULL); - synchronize_rcu(); - vfree(bysource); -} - -MODULE_LICENSE("GPL"); - -module_init(ip_nat_init); -module_exit(ip_nat_cleanup); diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c deleted file mode 100644 index 32e01d8dffc..00000000000 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ /dev/null @@ -1,180 +0,0 @@ -/* FTP extension for TCP NAT alteration. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/moduleparam.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("ftp NAT helper"); - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -/* FIXME: Time out? --RR */ - -static int -mangle_rfc959_packet(struct sk_buff **pskb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) -{ - char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; - - sprintf(buffer, "%u,%u,%u,%u,%u,%u", - NIPQUAD(newip), port>>8, port&0xFF); - - DEBUGP("calling ip_nat_mangle_tcp_packet\n"); - - *seq += strlen(buffer) - matchlen; - return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_eprt_packet(struct sk_buff **pskb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) -{ - char buffer[sizeof("|1|255.255.255.255|65535|")]; - - sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port); - - DEBUGP("calling ip_nat_mangle_tcp_packet\n"); - - *seq += strlen(buffer) - matchlen; - return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_epsv_packet(struct sk_buff **pskb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) -{ - char buffer[sizeof("|||65535|")]; - - sprintf(buffer, "|||%u|", port); - - DEBUGP("calling ip_nat_mangle_tcp_packet\n"); - - *seq += strlen(buffer) - matchlen; - return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -static int (*mangle[])(struct sk_buff **, __be32, u_int16_t, - unsigned int, - unsigned int, - struct ip_conntrack *, - enum ip_conntrack_info, - u32 *seq) -= { [IP_CT_FTP_PORT] = mangle_rfc959_packet, - [IP_CT_FTP_PASV] = mangle_rfc959_packet, - [IP_CT_FTP_EPRT] = mangle_eprt_packet, - [IP_CT_FTP_EPSV] = mangle_epsv_packet -}; - -/* So, this packet has hit the connection tracking matching code. - Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_ftp(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - enum ip_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp, - u32 *seq) -{ - __be32 newip; - u_int16_t port; - int dir = CTINFO2DIR(ctinfo); - struct ip_conntrack *ct = exp->master; - - DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); - - /* Connection will come from wherever this packet goes, hence !dir */ - newip = ct->tuplehash[!dir].tuple.dst.ip; - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = !dir; - - /* When you see the packet, we need to NAT it the same as the - * this one. */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - exp->tuple.dst.u.tcp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, - seq)) { - ip_conntrack_unexpect_related(exp); - return NF_DROP; - } - return NF_ACCEPT; -} - -static void __exit ip_nat_ftp_fini(void) -{ - rcu_assign_pointer(ip_nat_ftp_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_ftp_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_ftp_hook)); - rcu_assign_pointer(ip_nat_ftp_hook, ip_nat_ftp); - return 0; -} - -/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); - return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(ip_nat_ftp_init); -module_exit(ip_nat_ftp_fini); diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c deleted file mode 100644 index dc778cfef58..00000000000 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ /dev/null @@ -1,436 +0,0 @@ -/* ip_nat_helper.c - generic support functions for NAT helpers - * - * (C) 2000-2002 Harald Welte <laforge@netfilter.org> - * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>: - * - add support for SACK adjustment - * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>: - * - merge SACK support into newnat API - * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>: - * - make ip_nat_resize_packet more generic (TCP and UDP) - * - add ip_nat_mangle_udp_packet - */ -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/netfilter_ipv4.h> -#include <net/checksum.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/udp.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> - -#if 0 -#define DEBUGP printk -#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos); -#else -#define DEBUGP(format, args...) -#define DUMP_OFFSET(x) -#endif - -static DEFINE_SPINLOCK(ip_nat_seqofs_lock); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, - int sizediff, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - int dir; - struct ip_nat_seq *this_way, *other_way; - - DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n", - (*skb)->len, new_size); - - dir = CTINFO2DIR(ctinfo); - - this_way = &ct->nat.info.seq[dir]; - other_way = &ct->nat.info.seq[!dir]; - - DEBUGP("ip_nat_resize_packet: Seq_offset before: "); - DUMP_OFFSET(this_way); - - spin_lock_bh(&ip_nat_seqofs_lock); - - /* SYN adjust. If it's uninitialized, or this is after last - * correction, record it: we don't handle more than one - * adjustment in the window, but do deal with common case of a - * retransmit */ - if (this_way->offset_before == this_way->offset_after - || before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; - } - spin_unlock_bh(&ip_nat_seqofs_lock); - - DEBUGP("ip_nat_resize_packet: Seq_offset after: "); - DUMP_OFFSET(this_way); -} - -/* Frobs data inside this packet, which is linear. */ -static void mangle_contents(struct sk_buff *skb, - unsigned int dataoff, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - unsigned char *data; - - BUG_ON(skb_is_nonlinear(skb)); - data = (unsigned char *)skb->nh.iph + dataoff; - - /* move post-replacement */ - memmove(data + match_offset + rep_len, - data + match_offset + match_len, - skb->tail - (data + match_offset + match_len)); - - /* insert data from buffer */ - memcpy(data + match_offset, rep_buffer, rep_len); - - /* update skb info */ - if (rep_len > match_len) { - DEBUGP("ip_nat_mangle_packet: Extending packet by " - "%u from %u bytes\n", rep_len - match_len, - skb->len); - skb_put(skb, rep_len - match_len); - } else { - DEBUGP("ip_nat_mangle_packet: Shrinking packet from " - "%u from %u bytes\n", match_len - rep_len, - skb->len); - __skb_trim(skb, skb->len + rep_len - match_len); - } - - /* fix IP hdr checksum information */ - skb->nh.iph->tot_len = htons(skb->len); - ip_send_check(skb->nh.iph); -} - -/* Unusual, but possible case. */ -static int enlarge_skb(struct sk_buff **pskb, unsigned int extra) -{ - struct sk_buff *nskb; - - if ((*pskb)->len + extra > 65535) - return 0; - - nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC); - if (!nskb) - return 0; - - /* Transfer socket to new skb. */ - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - return 1; -} - -/* Generic function for mangling variable-length address changes inside - * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX - * command in FTP). - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * */ -int -ip_nat_mangle_tcp_packet(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - struct iphdr *iph; - struct tcphdr *tcph; - int oldlen, datalen; - - if (!skb_make_writable(pskb, (*pskb)->len)) - return 0; - - if (rep_len > match_len - && rep_len - match_len > skb_tailroom(*pskb) - && !enlarge_skb(pskb, rep_len - match_len)) - return 0; - - SKB_LINEAR_ASSERT(*pskb); - - iph = (*pskb)->nh.iph; - tcph = (void *)iph + iph->ihl*4; - - oldlen = (*pskb)->len - iph->ihl*4; - mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4, - match_offset, match_len, rep_buffer, rep_len); - - datalen = (*pskb)->len - iph->ihl*4; - if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - tcph->check = 0; - tcph->check = tcp_v4_check(datalen, - iph->saddr, iph->daddr, - csum_partial((char *)tcph, - datalen, 0)); - } else - nf_proto_csum_replace2(&tcph->check, *pskb, - htons(oldlen), htons(datalen), 1); - - if (rep_len != match_len) { - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(tcph->seq), - (int)rep_len - (int)match_len, - ct, ctinfo); - /* Tell TCP window tracking about seq change */ - ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo)); - } - return 1; -} -EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); - -/* Generic function for mangling variable-length address changes inside - * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX - * command in the Amanda protocol) - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * XXX - This function could be merged with ip_nat_mangle_tcp_packet which - * should be fairly easy to do. - */ -int -ip_nat_mangle_udp_packet(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - struct iphdr *iph; - struct udphdr *udph; - int datalen, oldlen; - - /* UDP helpers might accidentally mangle the wrong packet */ - iph = (*pskb)->nh.iph; - if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + - match_offset + match_len) - return 0; - - if (!skb_make_writable(pskb, (*pskb)->len)) - return 0; - - if (rep_len > match_len - && rep_len - match_len > skb_tailroom(*pskb) - && !enlarge_skb(pskb, rep_len - match_len)) - return 0; - - iph = (*pskb)->nh.iph; - udph = (void *)iph + iph->ihl*4; - - oldlen = (*pskb)->len - iph->ihl*4; - mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph), - match_offset, match_len, rep_buffer, rep_len); - - /* update the length of the UDP packet */ - datalen = (*pskb)->len - iph->ihl*4; - udph->len = htons(datalen); - - /* fix udp checksum if udp checksum was previously calculated */ - if (!udph->check && (*pskb)->ip_summed != CHECKSUM_PARTIAL) - return 1; - - if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - udph->check = 0; - udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - csum_partial((char *)udph, - datalen, 0)); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } else - nf_proto_csum_replace2(&udph->check, *pskb, - htons(oldlen), htons(datalen), 1); - return 1; -} -EXPORT_SYMBOL(ip_nat_mangle_udp_packet); - -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int sackoff, - unsigned int sackend, - struct ip_nat_seq *natseq) -{ - while (sackoff < sackend) { - struct tcp_sack_block_wire *sack; - __be32 new_start_seq, new_end_seq; - - sack = (void *)skb->data + sackoff; - if (after(ntohl(sack->start_seq) - natseq->offset_before, - natseq->correction_pos)) - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_after); - else - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_before); - - if (after(ntohl(sack->end_seq) - natseq->offset_before, - natseq->correction_pos)) - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_after); - else - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_before); - - DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", - ntohl(sack->start_seq), new_start_seq, - ntohl(sack->end_seq), new_end_seq); - - nf_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - nf_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); - sack->start_seq = new_start_seq; - sack->end_seq = new_end_seq; - sackoff += sizeof(*sack); - } -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -ip_nat_sack_adjust(struct sk_buff **pskb, - struct tcphdr *tcph, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dir, optoff, optend; - - optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); - optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; - - if (!skb_make_writable(pskb, optend)) - return 0; - - dir = CTINFO2DIR(ctinfo); - - while (optoff < optend) { - /* Usually: option, length. */ - unsigned char *op = (*pskb)->data + optoff; - - switch (op[0]) { - case TCPOPT_EOL: - return 1; - case TCPOPT_NOP: - optoff++; - continue; - default: - /* no partial options */ - if (optoff + 1 == optend - || optoff + op[1] > optend - || op[1] < 2) - return 0; - if (op[0] == TCPOPT_SACK - && op[1] >= 2+TCPOLEN_SACK_PERBLOCK - && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) - sack_adjust(*pskb, tcph, optoff+2, - optoff+op[1], - &ct->nat.info.seq[!dir]); - optoff += op[1]; - } - } - return 1; -} - -/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ -int -ip_nat_seq_adjust(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct tcphdr *tcph; - int dir; - __be32 newseq, newack; - struct ip_nat_seq *this_way, *other_way; - - dir = CTINFO2DIR(ctinfo); - - this_way = &ct->nat.info.seq[dir]; - other_way = &ct->nat.info.seq[!dir]; - - if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) - return 0; - - tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; - if (after(ntohl(tcph->seq), this_way->correction_pos)) - newseq = htonl(ntohl(tcph->seq) + this_way->offset_after); - else - newseq = htonl(ntohl(tcph->seq) + this_way->offset_before); - - if (after(ntohl(tcph->ack_seq) - other_way->offset_before, - other_way->correction_pos)) - newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_after); - else - newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before); - - nf_proto_csum_replace4(&tcph->check, *pskb, tcph->seq, newseq, 0); - nf_proto_csum_replace4(&tcph->check, *pskb, tcph->ack_seq, newack, 0); - - DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n", - ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), - ntohl(newack)); - - tcph->seq = newseq; - tcph->ack_seq = newack; - - if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo)) - return 0; - - ip_conntrack_tcp_update(*pskb, ct, dir); - - return 1; -} -EXPORT_SYMBOL(ip_nat_seq_adjust); - -/* Setup NAT on this expected conntrack so it follows master. */ -/* If we fail to get a free NAT slot, we'll get dropped on confirm */ -void ip_nat_follow_master(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp) -{ - struct ip_nat_range range; - - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.ip; - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. */ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = exp->saved_proto; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.src.ip; - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); -} -EXPORT_SYMBOL(ip_nat_follow_master); diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c deleted file mode 100644 index bdc99ef6159..00000000000 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ /dev/null @@ -1,611 +0,0 @@ -/* - * H.323 extension for NAT alteration. - * - * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> - * - * This source code is licensed under General Public License version 2. - * - * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/moduleparam.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> -#include <linux/netfilter_ipv4/ip_conntrack_h323.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -/****************************************************************************/ -static int set_addr(struct sk_buff **pskb, - unsigned char **data, int dataoff, - unsigned int addroff, __be32 ip, u_int16_t port) -{ - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo); - struct { - __be32 ip; - __be16 port; - } __attribute__ ((__packed__)) buf; - struct tcphdr _tcph, *th; - - buf.ip = ip; - buf.port = htons(port); - addroff += dataoff; - - if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) { - if (!ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - addroff, sizeof(buf), - (char *) &buf, sizeof(buf))) { - if (net_ratelimit()) - printk("ip_nat_h323: ip_nat_mangle_tcp_packet" - " error\n"); - return -1; - } - - /* Relocate data pointer */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return -1; - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + - th->doff * 4 + dataoff; - } else { - if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - addroff, sizeof(buf), - (char *) &buf, sizeof(buf))) { - if (net_ratelimit()) - printk("ip_nat_h323: ip_nat_mangle_udp_packet" - " error\n"); - return -1; - } - /* ip_nat_mangle_udp_packet uses skb_make_writable() to copy - * or pull everything in a linear buffer, so we can safely - * use the skb pointers now */ - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + - sizeof(struct udphdr); - } - - return 0; -} - -/****************************************************************************/ -static int set_h225_addr(struct sk_buff **pskb, - unsigned char **data, int dataoff, - TransportAddress * addr, - __be32 ip, u_int16_t port) -{ - return set_addr(pskb, data, dataoff, addr->ipAddress.ip, ip, port); -} - -/****************************************************************************/ -static int set_h245_addr(struct sk_buff **pskb, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - __be32 ip, u_int16_t port) -{ - return set_addr(pskb, data, dataoff, - addr->unicastAddress.iPAddress.network, ip, port); -} - -/****************************************************************************/ -static int set_sig_addr(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int i; - __be32 ip; - u_int16_t port; - - for (i = 0; i < count; i++) { - if (get_h225_addr(*data, &addr[i], &ip, &port)) { - if (ip == ct->tuplehash[dir].tuple.src.ip && - port == info->sig_port[dir]) { - /* GW->GK */ - - /* Fix for Gnomemeeting */ - if (i > 0 && - get_h225_addr(*data, &addr[0], - &ip, &port) && - (ntohl(ip) & 0xff000000) == 0x7f000000) - i = 0; - - DEBUGP - ("ip_nat_ras: set signal address " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.dst. - ip), info->sig_port[!dir]); - return set_h225_addr(pskb, data, 0, &addr[i], - ct->tuplehash[!dir]. - tuple.dst.ip, - info->sig_port[!dir]); - } else if (ip == ct->tuplehash[dir].tuple.dst.ip && - port == info->sig_port[dir]) { - /* GK->GW */ - DEBUGP - ("ip_nat_ras: set signal address " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.src. - ip), info->sig_port[!dir]); - return set_h225_addr(pskb, data, 0, &addr[i], - ct->tuplehash[!dir]. - tuple.src.ip, - info->sig_port[!dir]); - } - } - } - - return 0; -} - -/****************************************************************************/ -static int set_ras_addr(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count) -{ - int dir = CTINFO2DIR(ctinfo); - int i; - __be32 ip; - u_int16_t port; - - for (i = 0; i < count; i++) { - if (get_h225_addr(*data, &addr[i], &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && - port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) { - DEBUGP("ip_nat_ras: set rasAddress " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), - ntohs(ct->tuplehash[!dir].tuple.dst.u.udp. - port)); - return set_h225_addr(pskb, data, 0, &addr[i], - ct->tuplehash[!dir].tuple.dst.ip, - ntohs(ct->tuplehash[!dir].tuple. - dst.u.udp.port)); - } - } - - return 0; -} - -/****************************************************************************/ -static int nat_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - u_int16_t port, u_int16_t rtp_port, - struct ip_conntrack_expect *rtp_exp, - struct ip_conntrack_expect *rtcp_exp) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int i; - u_int16_t nated_port; - - /* Set expectations for NAT */ - rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; - rtp_exp->expectfn = ip_nat_follow_master; - rtp_exp->dir = !dir; - rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; - rtcp_exp->expectfn = ip_nat_follow_master; - rtcp_exp->dir = !dir; - - /* Lookup existing expects */ - for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) { - if (info->rtp_port[i][dir] == rtp_port) { - /* Expected */ - - /* Use allocated ports first. This will refresh - * the expects */ - rtp_exp->tuple.dst.u.udp.port = - htons(info->rtp_port[i][dir]); - rtcp_exp->tuple.dst.u.udp.port = - htons(info->rtp_port[i][dir] + 1); - break; - } else if (info->rtp_port[i][dir] == 0) { - /* Not expected */ - break; - } - } - - /* Run out of expectations */ - if (i >= H323_RTP_CHANNEL_MAX) { - if (net_ratelimit()) - printk("ip_nat_h323: out of expectations\n"); - return 0; - } - - /* Try to get a pair of ports. */ - for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); - nated_port != 0; nated_port += 2) { - rtp_exp->tuple.dst.u.udp.port = htons(nated_port); - if (ip_conntrack_expect_related(rtp_exp) == 0) { - rtcp_exp->tuple.dst.u.udp.port = - htons(nated_port + 1); - if (ip_conntrack_expect_related(rtcp_exp) == 0) - break; - ip_conntrack_unexpect_related(rtp_exp); - } - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_h323: out of RTP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h245_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, - (port & 1) ? nated_port + 1 : nated_port) == 0) { - /* Save ports */ - info->rtp_port[i][dir] = rtp_port; - info->rtp_port[i][!dir] = nated_port; - } else { - ip_conntrack_unexpect_related(rtp_exp); - ip_conntrack_unexpect_related(rtcp_exp); - return -1; - } - - /* Success */ - DEBUGP("ip_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtp_exp->tuple.src.ip), - ntohs(rtp_exp->tuple.src.u.udp.port), - NIPQUAD(rtp_exp->tuple.dst.ip), - ntohs(rtp_exp->tuple.dst.u.udp.port)); - DEBUGP("ip_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtcp_exp->tuple.src.ip), - ntohs(rtcp_exp->tuple.src.u.udp.port), - NIPQUAD(rtcp_exp->tuple.dst.ip), - ntohs(rtcp_exp->tuple.dst.u.udp.port)); - - return 0; -} - -/****************************************************************************/ -static int nat_t120(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect *exp) -{ - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port = port; - - /* Set expectations for NAT */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_follow_master; - exp->dir = !dir; - - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_h323: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h245_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, nated_port) < 0) { - ip_conntrack_unexpect_related(exp); - return -1; - } - - DEBUGP("ip_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/**************************************************************************** - * This conntrack expect function replaces ip_conntrack_h245_expect() - * which was set by ip_conntrack_helper_h323.c. It calls both - * ip_nat_follow_master() and ip_conntrack_h245_expect() - ****************************************************************************/ -static void ip_nat_h245_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - ip_nat_follow_master(new, this); - ip_conntrack_h245_expect(new, this); -} - -/****************************************************************************/ -static int nat_h245(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect *exp) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port = port; - - /* Set expectations for NAT */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_h245_expect; - exp->dir = !dir; - - /* Check existing expects */ - if (info->sig_port[dir] == port) - nated_port = info->sig_port[!dir]; - - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_q931: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h225_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, - nated_port) == 0) { - /* Save ports */ - info->sig_port[dir] = port; - info->sig_port[!dir] = nated_port; - } else { - ip_conntrack_unexpect_related(exp); - return -1; - } - - DEBUGP("ip_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/**************************************************************************** - * This conntrack expect function replaces ip_conntrack_q931_expect() - * which was set by ip_conntrack_helper_h323.c. - ****************************************************************************/ -static void ip_nat_q931_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - struct ip_nat_range range; - - if (this->tuple.src.ip != 0) { /* Only accept calls from GK */ - ip_nat_follow_master(new, this); - goto out; - } - - /* This must be a fresh one. */ - BUG_ON(new->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; - - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. */ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = this->saved_proto; - range.min_ip = range.max_ip = - new->master->tuplehash[!this->dir].tuple.src.ip; - - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); - - out: - ip_conntrack_q931_expect(new, this); -} - -/****************************************************************************/ -static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, TransportAddress * addr, int idx, - u_int16_t port, struct ip_conntrack_expect *exp) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port = port; - __be32 ip; - - /* Set expectations for NAT */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_q931_expect; - exp->dir = !dir; - - /* Check existing expects */ - if (info->sig_port[dir] == port) - nated_port = info->sig_port[!dir]; - - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_ras: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h225_addr(pskb, data, 0, &addr[idx], - ct->tuplehash[!dir].tuple.dst.ip, - nated_port) == 0) { - /* Save ports */ - info->sig_port[dir] = port; - info->sig_port[!dir] = nated_port; - - /* Fix for Gnomemeeting */ - if (idx > 0 && - get_h225_addr(*data, &addr[0], &ip, &port) && - (ntohl(ip) & 0xff000000) == 0x7f000000) { - set_h225_addr_hook(pskb, data, 0, &addr[0], - ct->tuplehash[!dir].tuple.dst.ip, - info->sig_port[!dir]); - } - } else { - ip_conntrack_unexpect_related(exp); - return -1; - } - - /* Success */ - DEBUGP("ip_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/****************************************************************************/ -static void ip_nat_callforwarding_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - struct ip_nat_range range; - - /* This must be a fresh one. */ - BUG_ON(new->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; - - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. */ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = this->saved_proto; - range.min_ip = range.max_ip = this->saved_ip; - - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); - - ip_conntrack_q931_expect(new, this); -} - -/****************************************************************************/ -static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect *exp) -{ - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port; - - /* Set expectations for NAT */ - exp->saved_ip = exp->tuple.dst.ip; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_callforwarding_expect; - exp->dir = !dir; - - /* Try to get same port: if not, try to change it. */ - for (nated_port = port; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_q931: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (!set_h225_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, - nated_port) == 0) { - ip_conntrack_unexpect_related(exp); - return -1; - } - - /* Success */ - DEBUGP("ip_nat_q931: expect Call Forwarding " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/****************************************************************************/ -static int __init init(void) -{ - BUG_ON(rcu_dereference(set_h245_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_h225_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_sig_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_ras_addr_hook) != NULL); - BUG_ON(rcu_dereference(nat_rtp_rtcp_hook) != NULL); - BUG_ON(rcu_dereference(nat_t120_hook) != NULL); - BUG_ON(rcu_dereference(nat_h245_hook) != NULL); - BUG_ON(rcu_dereference(nat_callforwarding_hook) != NULL); - BUG_ON(rcu_dereference(nat_q931_hook) != NULL); - - rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); - rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); - rcu_assign_pointer(set_sig_addr_hook, set_sig_addr); - rcu_assign_pointer(set_ras_addr_hook, set_ras_addr); - rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp); - rcu_assign_pointer(nat_t120_hook, nat_t120); - rcu_assign_pointer(nat_h245_hook, nat_h245); - rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding); - rcu_assign_pointer(nat_q931_hook, nat_q931); - - DEBUGP("ip_nat_h323: init success\n"); - return 0; -} - -/****************************************************************************/ -static void __exit fini(void) -{ - rcu_assign_pointer(set_h245_addr_hook, NULL); - rcu_assign_pointer(set_h225_addr_hook, NULL); - rcu_assign_pointer(set_sig_addr_hook, NULL); - rcu_assign_pointer(set_ras_addr_hook, NULL); - rcu_assign_pointer(nat_rtp_rtcp_hook, NULL); - rcu_assign_pointer(nat_t120_hook, NULL); - rcu_assign_pointer(nat_h245_hook, NULL); - rcu_assign_pointer(nat_callforwarding_hook, NULL); - rcu_assign_pointer(nat_q931_hook, NULL); - synchronize_rcu(); -} - -/****************************************************************************/ -module_init(init); -module_exit(fini); - -MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); -MODULE_DESCRIPTION("H.323 NAT helper"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c deleted file mode 100644 index 24ce4a5023d..00000000000 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ /dev/null @@ -1,350 +0,0 @@ -/* - * ip_nat_pptp.c - Version 3.0 - * - * NAT support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * TODO: - NAT to a unique tuple, not to TCP source port - * (needs netfilter tuple reservation) - * - * Changes: - * 2002-02-10 - Version 1.3 - * - Use ip_nat_mangle_tcp_packet() because of cloned skb's - * in local connections (Philip Craig <philipc@snapgear.com>) - * - add checks for magicCookie and pptp version - * - make argument list of pptp_{out,in}bound_packet() shorter - * - move to C99 style initializers - * - print version number at module loadtime - * 2003-09-22 - Version 1.5 - * - use SNATed tcp sourceport as callid, since we get called before - * TCP header is mangled (Philip Craig <philipc@snapgear.com>) - * 2004-10-22 - Version 2.0 - * - kernel 2.6.x version - * 2005-06-10 - Version 3.0 - * - kernel >= 2.6.11 version, - * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/) - * - */ - -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_pptp.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> -#include <linux/netfilter_ipv4/ip_conntrack_pptp.h> - -#define IP_NAT_PPTP_VERSION "3.0" - -#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off))) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); - - -#if 0 -extern const char *pptp_msg_name[]; -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \ - __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) -#endif - -static void pptp_nat_expected(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp) -{ - struct ip_conntrack *master = ct->master; - struct ip_conntrack_expect *other_exp; - struct ip_conntrack_tuple t; - struct ip_ct_pptp_master *ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info; - struct ip_nat_range range; - - ct_pptp_info = &master->help.ct_pptp_info; - nat_pptp_info = &master->nat.help.nat_pptp_info; - - /* And here goes the grand finale of corrosion... */ - - if (exp->dir == IP_CT_DIR_ORIGINAL) { - DEBUGP("we are PNS->PAC\n"); - /* therefore, build tuple for PAC->PNS */ - t.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - t.src.u.gre.key = master->help.ct_pptp_info.pac_call_id; - t.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - t.dst.u.gre.key = master->help.ct_pptp_info.pns_call_id; - t.dst.protonum = IPPROTO_GRE; - } else { - DEBUGP("we are PAC->PNS\n"); - /* build tuple for PNS->PAC */ - t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - t.src.u.gre.key = master->nat.help.nat_pptp_info.pns_call_id; - t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - t.dst.u.gre.key = master->nat.help.nat_pptp_info.pac_call_id; - t.dst.protonum = IPPROTO_GRE; - } - - DEBUGP("trying to unexpect other dir: "); - DUMP_TUPLE(&t); - other_exp = ip_conntrack_expect_find_get(&t); - if (other_exp) { - ip_conntrack_unexpect_related(other_exp); - ip_conntrack_expect_put(other_exp); - DEBUGP("success\n"); - } else { - DEBUGP("not found!\n"); - } - - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.ip; - if (exp->dir == IP_CT_DIR_ORIGINAL) { - range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - range.min = range.max = exp->saved_proto; - } - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.src.ip; - if (exp->dir == IP_CT_DIR_REPLY) { - range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - range.min = range.max = exp->saved_proto; - } - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); -} - -/* outbound packets == from PNS to PAC */ -static int -pptp_outbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) - -{ - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - u_int16_t msg; - __be16 new_callid; - unsigned int cid_off; - - new_callid = ct_pptp_info->pns_call_id; - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REQUEST: - cid_off = offsetof(union pptp_ctrl_union, ocreq.callID); - /* FIXME: ideally we would want to reserve a call ID - * here. current netfilter NAT core is not able to do - * this :( For now we use TCP source port. This breaks - * multiple calls within one control session */ - - /* save original call ID in nat_info */ - nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id; - - /* don't use tcph->source since we are at a DSTmanip - * hook (e.g. PREROUTING) and pkt is not mangled yet */ - new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; - - /* save new call ID in ct info */ - ct_pptp_info->pns_call_id = new_callid; - break; - case PPTP_IN_CALL_REPLY: - cid_off = offsetof(union pptp_ctrl_union, icack.callID); - break; - case PPTP_CALL_CLEAR_REQUEST: - cid_off = offsetof(union pptp_ctrl_union, clrreq.callID); - break; - default: - DEBUGP("unknown outbound packet 0x%04x:%s\n", msg, - (msg <= PPTP_MSG_MAX)? - pptp_msg_name[msg]:pptp_msg_name[0]); - /* fall through */ - - case PPTP_SET_LINK_INFO: - /* only need to NAT in case PAC is behind NAT box */ - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass - * down to here */ - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); - - /* mangle packet */ - if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - cid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_callid), (char *)&new_callid, - sizeof(new_callid)) == 0) - return NF_DROP; - - return NF_ACCEPT; -} - -static void -pptp_exp_gre(struct ip_conntrack_expect *expect_orig, - struct ip_conntrack_expect *expect_reply) -{ - struct ip_conntrack *ct = expect_orig->master; - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - /* save original PAC call ID in nat_info */ - nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; - - /* alter expectation for PNS->PAC direction */ - expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id; - expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id; - expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id; - expect_orig->dir = IP_CT_DIR_ORIGINAL; - - /* alter expectation for PAC->PNS direction */ - expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id; - expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id; - expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id; - expect_reply->dir = IP_CT_DIR_REPLY; -} - -/* inbound packets == from PAC to PNS */ -static int -pptp_inbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) -{ - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - u_int16_t msg; - __be16 new_pcid; - unsigned int pcid_off; - - new_pcid = nat_pptp_info->pns_call_id; - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REPLY: - pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID); - break; - case PPTP_IN_CALL_CONNECT: - pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID); - break; - case PPTP_IN_CALL_REQUEST: - /* only need to nat in case PAC is behind NAT box */ - return NF_ACCEPT; - case PPTP_WAN_ERROR_NOTIFY: - pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID); - break; - case PPTP_CALL_DISCONNECT_NOTIFY: - pcid_off = offsetof(union pptp_ctrl_union, disc.callID); - break; - case PPTP_SET_LINK_INFO: - pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); - break; - - default: - DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)? - pptp_msg_name[msg]:pptp_msg_name[0]); - /* fall through */ - - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST, - * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */ - - /* mangle packet */ - DEBUGP("altering peer call id from 0x%04x to 0x%04x\n", - ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); - - if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - pcid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)) == 0) - return NF_DROP; - return NF_ACCEPT; -} - - -extern int __init ip_nat_proto_gre_init(void); -extern void __exit ip_nat_proto_gre_fini(void); - -static int __init ip_nat_helper_pptp_init(void) -{ - int ret; - - DEBUGP("%s: registering NAT helper\n", __FILE__); - - ret = ip_nat_proto_gre_init(); - if (ret < 0) - return ret; - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_outbound)); - rcu_assign_pointer(ip_nat_pptp_hook_outbound, pptp_outbound_pkt); - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_inbound)); - rcu_assign_pointer(ip_nat_pptp_hook_inbound, pptp_inbound_pkt); - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_exp_gre)); - rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, pptp_exp_gre); - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_expectfn)); - rcu_assign_pointer(ip_nat_pptp_hook_expectfn, pptp_nat_expected); - - printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION); - return 0; -} - -static void __exit ip_nat_helper_pptp_fini(void) -{ - DEBUGP("cleanup_module\n" ); - - rcu_assign_pointer(ip_nat_pptp_hook_expectfn, NULL); - rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, NULL); - rcu_assign_pointer(ip_nat_pptp_hook_inbound, NULL); - rcu_assign_pointer(ip_nat_pptp_hook_outbound, NULL); - synchronize_rcu(); - - ip_nat_proto_gre_fini(); - - printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION); -} - -module_init(ip_nat_helper_pptp_init); -module_exit(ip_nat_helper_pptp_fini); diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c deleted file mode 100644 index cfaeea38314..00000000000 --- a/net/ipv4/netfilter/ip_nat_irc.c +++ /dev/null @@ -1,122 +0,0 @@ -/* IRC extension for TCP NAT alteration. - * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> - * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation - * based on a copy of RR's ip_nat_ftp.c - * - * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/kernel.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_conntrack_irc.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/moduleparam.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("IRC (DCC) NAT helper"); -MODULE_LICENSE("GPL"); - -static unsigned int help(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp) -{ - u_int16_t port; - unsigned int ret; - - /* "4294967296 65635 " */ - char buffer[18]; - - DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n", - expect->seq, exp_irc_info->len, - ntohl(tcph->seq)); - - /* Reply comes from server. */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = IP_CT_DIR_REPLY; - - /* When you see the packet, we need to NAT it the same as the - * this one. */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - exp->tuple.dst.u.tcp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 - * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 - * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 - * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 - * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 - * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, - * 255.255.255.255==4294967296, 10 digits) - * P: bound port (min 1 d, max 5d (65635)) - * F: filename (min 1 d ) - * S: size (min 1 d ) - * 0x01, \n: terminators - */ - - /* AAA = "us", ie. where server normally talks to. */ - sprintf(buffer, "%u %u", - ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip), - port); - DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n", - buffer, NIPQUAD(exp->tuple.src.ip), port); - - ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo, - matchoff, matchlen, buffer, - strlen(buffer)); - if (ret != NF_ACCEPT) - ip_conntrack_unexpect_related(exp); - return ret; -} - -static void __exit ip_nat_irc_fini(void) -{ - rcu_assign_pointer(ip_nat_irc_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_irc_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_irc_hook)); - rcu_assign_pointer(ip_nat_irc_hook, help); - return 0; -} - -/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); - return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(ip_nat_irc_init); -module_exit(ip_nat_irc_fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c deleted file mode 100644 index 95810202d84..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * ip_nat_proto_gre.c - Version 2.0 - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \ - __FUNCTION__, ## args) -#else -#define DEBUGP(x, args...) -#endif - -/* is key in given range between min and max */ -static int -gre_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - __be16 key; - - if (maniptype == IP_NAT_MANIP_SRC) - key = tuple->src.u.gre.key; - else - key = tuple->dst.u.gre.key; - - return ntohs(key) >= ntohs(min->gre.key) - && ntohs(key) <= ntohs(max->gre.key); -} - -/* generate unique tuple ... */ -static int -gre_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t key; - __be16 *keyptr; - unsigned int min, i, range_size; - - if (maniptype == IP_NAT_MANIP_SRC) - keyptr = &tuple->src.u.gre.key; - else - keyptr = &tuple->dst.u.gre.key; - - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - DEBUGP("%p: NATing GRE PPTP\n", conntrack); - min = 1; - range_size = 0xffff; - } else { - min = ntohs(range->min.gre.key); - range_size = ntohs(range->max.gre.key) - min + 1; - } - - DEBUGP("min = %u, range_size = %u\n", min, range_size); - - for (i = 0; i < range_size; i++, key++) { - *keyptr = htons(min + key % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - - DEBUGP("%p: no NAT mapping\n", conntrack); - - return 0; -} - -/* manipulate a GRE packet according to maniptype */ -static int -gre_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct gre_hdr *greh; - struct gre_hdr_pptp *pgreh; - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - unsigned int hdroff = iphdroff + iph->ihl*4; - - /* pgreh includes two optional 32bit fields which are not required - * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(pskb, hdroff + sizeof(*pgreh)-8)) - return 0; - - greh = (void *)(*pskb)->data + hdroff; - pgreh = (struct gre_hdr_pptp *) greh; - - /* we only have destination manip of a packet, since 'source key' - * is not present in the packet itself */ - if (maniptype == IP_NAT_MANIP_DST) { - /* key manipulation is always dest */ - switch (greh->version) { - case 0: - if (!greh->key) { - DEBUGP("can't nat GRE w/o key\n"); - break; - } - if (greh->csum) { - /* FIXME: Never tested this code... */ - nf_proto_csum_replace4(gre_csum(greh), *pskb, - *(gre_key(greh)), - tuple->dst.u.gre.key, 0); - } - *(gre_key(greh)) = tuple->dst.u.gre.key; - break; - case GRE_VERSION_PPTP: - DEBUGP("call_id -> 0x%04x\n", - ntohs(tuple->dst.u.gre.key)); - pgreh->call_id = tuple->dst.u.gre.key; - break; - default: - DEBUGP("can't nat unknown GRE version\n"); - return 0; - break; - } - } - return 1; -} - -/* nat helper struct */ -static struct ip_nat_protocol gre = { - .name = "GRE", - .protonum = IPPROTO_GRE, - .manip_pkt = gre_manip_pkt, - .in_range = gre_in_range, - .unique_tuple = gre_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; - -int __init ip_nat_proto_gre_init(void) -{ - return ip_nat_protocol_register(&gre); -} - -void __exit ip_nat_proto_gre_fini(void) -{ - ip_nat_protocol_unregister(&gre); -} diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c deleted file mode 100644 index 22a528ae038..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ /dev/null @@ -1,87 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/icmp.h> -#include <linux/if.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -static int -icmp_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && - ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); -} - -static int -icmp_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t id; - unsigned int range_size; - unsigned int i; - - range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) - range_size = 0xFFFF; - - for (i = 0; i < range_size; i++, id++) { - tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + - (id % range_size)); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - return 0; -} - -static int -icmp_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - struct icmphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - - if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) - return 0; - - hdr = (struct icmphdr *)((*pskb)->data + hdroff); - nf_proto_csum_replace2(&hdr->checksum, *pskb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); - hdr->un.echo.id = tuple->src.u.icmp.id; - return 1; -} - -struct ip_nat_protocol ip_nat_protocol_icmp = { - .name = "ICMP", - .protonum = IPPROTO_ICMP, - .me = THIS_MODULE, - .manip_pkt = icmp_manip_pkt, - .in_range = icmp_in_range, - .unique_tuple = icmp_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c deleted file mode 100644 index 14ff24f53a7..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ /dev/null @@ -1,154 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/random.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/if.h> -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> - -static int -tcp_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - __be16 port; - - if (maniptype == IP_NAT_MANIP_SRC) - port = tuple->src.u.tcp.port; - else - port = tuple->dst.u.tcp.port; - - return ntohs(port) >= ntohs(min->tcp.port) - && ntohs(port) <= ntohs(max->tcp.port); -} - -static int -tcp_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t port; - __be16 *portptr; - unsigned int range_size, min, i; - - if (maniptype == IP_NAT_MANIP_SRC) - portptr = &tuple->src.u.tcp.port; - else - portptr = &tuple->dst.u.tcp.port; - - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == IP_NAT_MANIP_DST) - return 0; - - /* Map privileged onto privileged. */ - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr)<512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.tcp.port); - range_size = ntohs(range->max.tcp.port) - min + 1; - } - - /* Start from random port to avoid prediction */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - port = net_random(); - - for (i = 0; i < range_size; i++, port++) { - *portptr = htons(min + port % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) { - return 1; - } - } - return 0; -} - -static int -tcp_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - struct tcphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport, oldport; - int hdrsize = 8; /* TCP connection tracking guarantees this much */ - - /* this could be a inner header returned in icmp packet; in such - cases we cannot update the checksum field since it is outside of - the 8 bytes of transport layer headers we are guaranteed */ - if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) - hdrsize = sizeof(struct tcphdr); - - if (!skb_make_writable(pskb, hdroff + hdrsize)) - return 0; - - iph = (struct iphdr *)((*pskb)->data + iphdroff); - hdr = (struct tcphdr *)((*pskb)->data + hdroff); - - if (maniptype == IP_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.ip; - newport = tuple->src.u.tcp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.ip; - newport = tuple->dst.u.tcp.port; - portptr = &hdr->dest; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return 1; - - nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, *pskb, oldport, newport, 0); - return 1; -} - -struct ip_nat_protocol ip_nat_protocol_tcp = { - .name = "TCP", - .protonum = IPPROTO_TCP, - .me = THIS_MODULE, - .manip_pkt = tcp_manip_pkt, - .in_range = tcp_in_range, - .unique_tuple = tcp_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c deleted file mode 100644 index dfd52167289..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ /dev/null @@ -1,144 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/random.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/if.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -static int -udp_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - __be16 port; - - if (maniptype == IP_NAT_MANIP_SRC) - port = tuple->src.u.udp.port; - else - port = tuple->dst.u.udp.port; - - return ntohs(port) >= ntohs(min->udp.port) - && ntohs(port) <= ntohs(max->udp.port); -} - -static int -udp_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t port; - __be16 *portptr; - unsigned int range_size, min, i; - - if (maniptype == IP_NAT_MANIP_SRC) - portptr = &tuple->src.u.udp.port; - else - portptr = &tuple->dst.u.udp.port; - - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == IP_NAT_MANIP_DST) - return 0; - - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr)<512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.udp.port); - range_size = ntohs(range->max.udp.port) - min + 1; - } - - /* Start from random port to avoid prediction */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - port = net_random(); - - for (i = 0; i < range_size; i++, port++) { - *portptr = htons(min + port % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - return 0; -} - -static int -udp_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - struct udphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport; - - if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) - return 0; - - iph = (struct iphdr *)((*pskb)->data + iphdroff); - hdr = (struct udphdr *)((*pskb)->data + hdroff); - - if (maniptype == IP_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.ip; - newport = tuple->src.u.udp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.ip; - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } - - if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) { - nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, *pskb, *portptr, newport, 0); - if (!hdr->check) - hdr->check = CSUM_MANGLED_0; - } - *portptr = newport; - return 1; -} - -struct ip_nat_protocol ip_nat_protocol_udp = { - .name = "UDP", - .protonum = IPPROTO_UDP, - .me = THIS_MODULE, - .manip_pkt = udp_manip_pkt, - .in_range = udp_in_range, - .unique_tuple = udp_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c deleted file mode 100644 index 3bf04951724..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ /dev/null @@ -1,55 +0,0 @@ -/* The "unknown" protocol. This is what is used for protocols we - * don't understand. It's returned by ip_ct_find_proto(). - */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/netfilter.h> -#include <linux/if.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -static int unknown_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type manip_type, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - return 1; -} - -static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - /* Sorry: we can't help you; if it's not unique, we can't frob - anything. */ - return 0; -} - -static int -unknown_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - return 1; -} - -struct ip_nat_protocol ip_nat_unknown_protocol = { - .name = "unknown", - /* .me isn't set: getting a ref to this cannot fail. */ - .manip_pkt = unknown_manip_pkt, - .in_range = unknown_in_range, - .unique_tuple = unknown_unique_tuple, -}; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c deleted file mode 100644 index 080eb1d9220..00000000000 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ /dev/null @@ -1,314 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* Everything about the rules for NAT. */ -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/bitops.h> - -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) - -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} nat_initial_table __initdata -= { { "nat", NAT_VALID_HOOKS, 4, - sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - { [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, - { [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, - 0, NULL, { } }, - { - /* PRE_ROUTING */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_standard), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, - -NF_ACCEPT - 1 } }, - /* POST_ROUTING */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_standard), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, - -NF_ACCEPT - 1 } }, - /* LOCAL_OUT */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_standard), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, - -NF_ACCEPT - 1 } } - }, - /* ERROR */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_error), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, - { } }, - "ERROR" - } - } -}; - -static struct xt_table nat_table = { - .name = "nat", - .valid_hooks = NAT_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, - .me = THIS_MODULE, - .af = AF_INET, -}; - -/* Source NAT */ -static unsigned int ipt_snat_target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - const struct ip_nat_multi_range_compat *mr = targinfo; - - IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); - - ct = ip_conntrack_get(*pskb, &ctinfo); - - /* Connection must be valid and new. */ - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED - || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); - IP_NF_ASSERT(out); - - return ip_nat_setup_info(ct, &mr->range[0], hooknum); -} - -/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ -static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) -{ - static int warned = 0; - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; - struct rtable *rt; - - if (ip_route_output_key(&rt, &fl) != 0) - return; - - if (rt->rt_src != srcip && !warned) { - printk("NAT: no longer support implicit source local NAT\n"); - printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n", - NIPQUAD(srcip), NIPQUAD(dstip)); - warned = 1; - } - ip_rt_put(rt); -} - -static unsigned int ipt_dnat_target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - const struct ip_nat_multi_range_compat *mr = targinfo; - - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_LOCAL_OUT); - - ct = ip_conntrack_get(*pskb, &ctinfo); - - /* Connection must be valid and new. */ - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - - if (hooknum == NF_IP_LOCAL_OUT - && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle((*pskb)->nh.iph->daddr, - mr->range[0].min_ip); - - return ip_nat_setup_info(ct, &mr->range[0], hooknum); -} - -static int ipt_snat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct ip_nat_multi_range_compat *mr = targinfo; - - /* Must be a valid range */ - if (mr->rangesize != 1) { - printk("SNAT: multiple ranges no longer supported\n"); - return 0; - } - return 1; -} - -static int ipt_dnat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct ip_nat_multi_range_compat *mr = targinfo; - - /* Must be a valid range */ - if (mr->rangesize != 1) { - printk("DNAT: multiple ranges no longer supported\n"); - return 0; - } - if (mr->range[0].flags & IP_NAT_RANGE_PROTO_RANDOM) { - printk("DNAT: port randomization not supported\n"); - return 0; - } - return 1; -} - -inline unsigned int -alloc_null_binding(struct ip_conntrack *conntrack, - struct ip_nat_info *info, - unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - Use reply in case it's already been mangled (eg local packet). - */ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip - : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); - struct ip_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; - - DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, - NIPQUAD(ip)); - return ip_nat_setup_info(conntrack, &range, hooknum); -} - -unsigned int -alloc_null_binding_confirmed(struct ip_conntrack *conntrack, - struct ip_nat_info *info, - unsigned int hooknum) -{ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip - : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); - u_int16_t all - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all - : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all); - struct ip_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } }; - - DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n", - conntrack, NIPQUAD(ip)); - return ip_nat_setup_info(conntrack, &range, hooknum); -} - -int ip_nat_rule_find(struct sk_buff **pskb, - unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct ip_conntrack *ct, - struct ip_nat_info *info) -{ - int ret; - - ret = ipt_do_table(pskb, hooknum, in, out, &nat_table); - - if (ret == NF_ACCEPT) { - if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - /* NUL mapping */ - ret = alloc_null_binding(ct, info, hooknum); - } - return ret; -} - -static struct xt_target ipt_snat_reg = { - .name = "SNAT", - .family = AF_INET, - .target = ipt_snat_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), - .table = "nat", - .hooks = 1 << NF_IP_POST_ROUTING, - .checkentry = ipt_snat_checkentry, -}; - -static struct xt_target ipt_dnat_reg = { - .name = "DNAT", - .family = AF_INET, - .target = ipt_dnat_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), - .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), - .checkentry = ipt_dnat_checkentry, -}; - -int __init ip_nat_rule_init(void) -{ - int ret; - - ret = ipt_register_table(&nat_table, &nat_initial_table.repl); - if (ret != 0) - return ret; - ret = xt_register_target(&ipt_snat_reg); - if (ret != 0) - goto unregister_table; - - ret = xt_register_target(&ipt_dnat_reg); - if (ret != 0) - goto unregister_snat; - - return ret; - - unregister_snat: - xt_unregister_target(&ipt_snat_reg); - unregister_table: - xt_unregister_table(&nat_table); - - return ret; -} - -void ip_nat_rule_cleanup(void) -{ - xt_unregister_target(&ipt_dnat_reg); - xt_unregister_target(&ipt_snat_reg); - ipt_unregister_table(&nat_table); -} diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c deleted file mode 100644 index 325c5a9dc2e..00000000000 --- a/net/ipv4/netfilter/ip_nat_sip.c +++ /dev/null @@ -1,282 +0,0 @@ -/* SIP extension for UDP NAT alteration. - * - * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> - * based on RR's ip_nat_ftp.c and other modules. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_sip.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); -MODULE_DESCRIPTION("SIP NAT helper"); - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -struct addr_map { - struct { - char src[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - char dst[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - unsigned int srclen, srciplen; - unsigned int dstlen, dstiplen; - } addr[IP_CT_DIR_MAX]; -}; - -static void addr_map_init(struct ip_conntrack *ct, struct addr_map *map) -{ - struct ip_conntrack_tuple *t; - enum ip_conntrack_dir dir; - unsigned int n; - - for (dir = 0; dir < IP_CT_DIR_MAX; dir++) { - t = &ct->tuplehash[dir].tuple; - - n = sprintf(map->addr[dir].src, "%u.%u.%u.%u", - NIPQUAD(t->src.ip)); - map->addr[dir].srciplen = n; - n += sprintf(map->addr[dir].src + n, ":%u", - ntohs(t->src.u.udp.port)); - map->addr[dir].srclen = n; - - n = sprintf(map->addr[dir].dst, "%u.%u.%u.%u", - NIPQUAD(t->dst.ip)); - map->addr[dir].dstiplen = n; - n += sprintf(map->addr[dir].dst + n, ":%u", - ntohs(t->dst.u.udp.port)); - map->addr[dir].dstlen = n; - } -} - -static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, const char **dptr, size_t dlen, - enum sip_header_pos pos, struct addr_map *map) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned int matchlen, matchoff, addrlen; - char *addr; - - if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0) - return 1; - - if ((matchlen == map->addr[dir].srciplen || - matchlen == map->addr[dir].srclen) && - memcmp(*dptr + matchoff, map->addr[dir].src, matchlen) == 0) { - addr = map->addr[!dir].dst; - addrlen = map->addr[!dir].dstlen; - } else if ((matchlen == map->addr[dir].dstiplen || - matchlen == map->addr[dir].dstlen) && - memcmp(*dptr + matchoff, map->addr[dir].dst, matchlen) == 0) { - addr = map->addr[!dir].src; - addrlen = map->addr[!dir].srclen; - } else - return 1; - - if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - matchoff, matchlen, addr, addrlen)) - return 0; - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - return 1; - -} - -static unsigned int ip_nat_sip(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char **dptr) -{ - enum sip_header_pos pos; - struct addr_map map; - int dataoff, datalen; - - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - datalen = (*pskb)->len - dataoff; - if (datalen < sizeof("SIP/2.0") - 1) - return NF_DROP; - - addr_map_init(ct, &map); - - /* Basic rules: requests and responses. */ - if (strncmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) != 0) { - /* 10.2: Constructing the REGISTER Request: - * - * The "userinfo" and "@" components of the SIP URI MUST NOT - * be present. - */ - if (datalen >= sizeof("REGISTER") - 1 && - strncmp(*dptr, "REGISTER", sizeof("REGISTER") - 1) == 0) - pos = POS_REG_REQ_URI; - else - pos = POS_REQ_URI; - - if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, pos, &map)) - return NF_DROP; - } - - if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_FROM, &map) || - !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_TO, &map) || - !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_VIA, &map) || - !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_CONTACT, &map)) - return NF_DROP; - return NF_ACCEPT; -} - -static unsigned int mangle_sip_packet(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char **dptr, size_t dlen, - char *buffer, int bufflen, - enum sip_header_pos pos) -{ - unsigned int matchlen, matchoff; - - if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0) - return 0; - - if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - matchoff, matchlen, buffer, bufflen)) - return 0; - - /* We need to reload this. Thanks Patrick. */ - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - return 1; -} - -static int mangle_content_len(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char *dptr) -{ - unsigned int dataoff, matchoff, matchlen; - char buffer[sizeof("65536")]; - int bufflen; - - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - - /* Get actual SDP lenght */ - if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, - &matchlen, POS_SDP_HEADER) > 0) { - - /* since ct_sip_get_info() give us a pointer passing 'v=' - we need to add 2 bytes in this count. */ - int c_len = (*pskb)->len - dataoff - matchoff + 2; - - /* Now, update SDP lenght */ - if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, - &matchlen, POS_CONTENT) > 0) { - - bufflen = sprintf(buffer, "%u", c_len); - - return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - matchoff, matchlen, - buffer, bufflen); - } - } - return 0; -} - -static unsigned int mangle_sdp(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - __be32 newip, u_int16_t port, - const char *dptr) -{ - char buffer[sizeof("nnn.nnn.nnn.nnn")]; - unsigned int dataoff, bufflen; - - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - - /* Mangle owner and contact info. */ - bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); - if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, - buffer, bufflen, POS_OWNER)) - return 0; - - if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, - buffer, bufflen, POS_CONNECTION)) - return 0; - - /* Mangle media port. */ - bufflen = sprintf(buffer, "%u", port); - if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, - buffer, bufflen, POS_MEDIA)) - return 0; - - return mangle_content_len(pskb, ctinfo, ct, dptr); -} - -/* So, this packet has hit the connection tracking matching code. - Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_sdp(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp, - const char *dptr) -{ - struct ip_conntrack *ct = exp->master; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - __be32 newip; - u_int16_t port; - - DEBUGP("ip_nat_sdp():\n"); - - /* Connection will come from reply */ - newip = ct->tuplehash[!dir].tuple.dst.ip; - - exp->tuple.dst.ip = newip; - exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; - exp->dir = !dir; - - /* When you see the packet, we need to NAT it the same as the - this one. */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { - exp->tuple.dst.u.udp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { - ip_conntrack_unexpect_related(exp); - return NF_DROP; - } - return NF_ACCEPT; -} - -static void __exit fini(void) -{ - rcu_assign_pointer(ip_nat_sip_hook, NULL); - rcu_assign_pointer(ip_nat_sdp_hook, NULL); - synchronize_rcu(); -} - -static int __init init(void) -{ - BUG_ON(rcu_dereference(ip_nat_sip_hook)); - BUG_ON(rcu_dereference(ip_nat_sdp_hook)); - rcu_assign_pointer(ip_nat_sip_hook, ip_nat_sip); - rcu_assign_pointer(ip_nat_sdp_hook, ip_nat_sdp); - return 0; -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c deleted file mode 100644 index e41d0efae51..00000000000 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ /dev/null @@ -1,1333 +0,0 @@ -/* - * ip_nat_snmp_basic.c - * - * Basic SNMP Application Layer Gateway - * - * This IP NAT module is intended for use with SNMP network - * discovery and monitoring applications where target networks use - * conflicting private address realms. - * - * Static NAT is used to remap the networks from the view of the network - * management system at the IP layer, and this module remaps some application - * layer addresses to match. - * - * The simplest form of ALG is performed, where only tagged IP addresses - * are modified. The module does not need to be MIB aware and only scans - * messages at the ASN.1/BER level. - * - * Currently, only SNMPv1 and SNMPv2 are supported. - * - * More information on ALG and associated issues can be found in - * RFC 2962 - * - * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory - * McLean & Jochen Friedrich, stripped down for use in the kernel. - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Author: James Morris <jmorris@intercode.com.au> - * - * Updates: - * 2000-08-06: Convert to new helper API (Harald Welte). - * - */ -#include <linux/in.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/moduleparam.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/udp.h> -#include <asm/uaccess.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); - -#define SNMP_PORT 161 -#define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)n) - -static int debug; -static DEFINE_SPINLOCK(snmp_lock); - -/* - * Application layer address mapping mimics the NAT mapping, but - * only for the first octet in this case (a more flexible system - * can be implemented if needed). - */ -struct oct1_map -{ - u_int8_t from; - u_int8_t to; -}; - - -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. - */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -/* - * ASN.1 context. - */ -struct asn1_ctx -{ - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr -{ - unsigned char *data; - unsigned int len; -}; - -static void asn1_open(struct asn1_ctx *ctx, - unsigned char *buf, - unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) -{ - unsigned char ch; - - *tag = 0; - - do - { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char asn1_length_decode(struct asn1_ctx *ctx, - unsigned int *def, - unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = (unsigned char) (ch & 0x7F); - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - return 1; -} - -static unsigned char asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned int def, len; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - def = len = 0; - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - unsigned char ch; - - if (eoc == 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } - return 1; - } -} - -static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned char **octets, - unsigned int *len) -{ - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) { - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} - -static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, - unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long **oid, - unsigned int *len) -{ - unsigned long subid; - unsigned int size; - unsigned long *optr; - - size = eoc - ctx->pointer + 1; - *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) { - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr [0] = 0; - optr [1] = subid; - } else if (subid < 80) { - optr [0] = 1; - optr [1] = subid - 40; - } else { - optr [0] = 2; - optr [1] = subid - 80; - } - - *len = 2; - optr += 2; - - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } - - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } - } - return 1; -} - -/***************************************************************************** - * - * SNMP decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* SNMP Versions */ -#define SNMP_V1 0 -#define SNMP_V2C 1 -#define SNMP_V2 2 -#define SNMP_V3 3 - -/* Default Sizes */ -#define SNMP_SIZE_COMM 256 -#define SNMP_SIZE_OBJECTID 128 -#define SNMP_SIZE_BUFCHR 256 -#define SNMP_SIZE_BUFINT 128 -#define SNMP_SIZE_SMALLOBJECTID 16 - -/* Requests */ -#define SNMP_PDU_GET 0 -#define SNMP_PDU_NEXT 1 -#define SNMP_PDU_RESPONSE 2 -#define SNMP_PDU_SET 3 -#define SNMP_PDU_TRAP1 4 -#define SNMP_PDU_BULK 5 -#define SNMP_PDU_INFORM 6 -#define SNMP_PDU_TRAP2 7 - -/* Errors */ -#define SNMP_NOERROR 0 -#define SNMP_TOOBIG 1 -#define SNMP_NOSUCHNAME 2 -#define SNMP_BADVALUE 3 -#define SNMP_READONLY 4 -#define SNMP_GENERROR 5 -#define SNMP_NOACCESS 6 -#define SNMP_WRONGTYPE 7 -#define SNMP_WRONGLENGTH 8 -#define SNMP_WRONGENCODING 9 -#define SNMP_WRONGVALUE 10 -#define SNMP_NOCREATION 11 -#define SNMP_INCONSISTENTVALUE 12 -#define SNMP_RESOURCEUNAVAILABLE 13 -#define SNMP_COMMITFAILED 14 -#define SNMP_UNDOFAILED 15 -#define SNMP_AUTHORIZATIONERROR 16 -#define SNMP_NOTWRITABLE 17 -#define SNMP_INCONSISTENTNAME 18 - -/* General SNMP V1 Traps */ -#define SNMP_TRAP_COLDSTART 0 -#define SNMP_TRAP_WARMSTART 1 -#define SNMP_TRAP_LINKDOWN 2 -#define SNMP_TRAP_LINKUP 3 -#define SNMP_TRAP_AUTFAILURE 4 -#define SNMP_TRAP_EQPNEIGHBORLOSS 5 -#define SNMP_TRAP_ENTSPECIFIC 6 - -/* SNMPv1 Types */ -#define SNMP_NULL 0 -#define SNMP_INTEGER 1 /* l */ -#define SNMP_OCTETSTR 2 /* c */ -#define SNMP_DISPLAYSTR 2 /* c */ -#define SNMP_OBJECTID 3 /* ul */ -#define SNMP_IPADDR 4 /* uc */ -#define SNMP_COUNTER 5 /* ul */ -#define SNMP_GAUGE 6 /* ul */ -#define SNMP_TIMETICKS 7 /* ul */ -#define SNMP_OPAQUE 8 /* c */ - -/* Additional SNMPv2 Types */ -#define SNMP_UINTEGER 5 /* ul */ -#define SNMP_BITSTR 9 /* uc */ -#define SNMP_NSAP 10 /* uc */ -#define SNMP_COUNTER64 11 /* ul */ -#define SNMP_NOSUCHOBJECT 12 -#define SNMP_NOSUCHINSTANCE 13 -#define SNMP_ENDOFMIBVIEW 14 - -union snmp_syntax -{ - unsigned char uc[0]; /* 8 bit unsigned */ - char c[0]; /* 8 bit signed */ - unsigned long ul[0]; /* 32 bit unsigned */ - long l[0]; /* 32 bit signed */ -}; - -struct snmp_object -{ - unsigned long *id; - unsigned int id_len; - unsigned short type; - unsigned int syntax_len; - union snmp_syntax syntax; -}; - -struct snmp_request -{ - unsigned long id; - unsigned int error_status; - unsigned int error_index; -}; - -struct snmp_v1_trap -{ - unsigned long *id; - unsigned int id_len; - unsigned long ip_address; /* pointer */ - unsigned int general; - unsigned int specific; - unsigned long time; -}; - -/* SNMP types */ -#define SNMP_IPA 0 -#define SNMP_CNT 1 -#define SNMP_GGE 2 -#define SNMP_TIT 3 -#define SNMP_OPQ 4 -#define SNMP_C64 6 - -/* SNMP errors */ -#define SERR_NSO 0 -#define SERR_NSI 1 -#define SERR_EOM 2 - -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check); -struct snmp_cnv -{ - unsigned int class; - unsigned int tag; - int syntax; -}; - -static struct snmp_cnv snmp_conv [] = -{ - {ASN1_UNI, ASN1_NUL, SNMP_NULL}, - {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, - {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, - {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, - {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, - {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, - {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ - {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ - {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, - {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, - - /* SNMPv2 data types and errors */ - {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, - {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, - {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, - {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, - {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, - {0, 0, -1} -}; - -static unsigned char snmp_tag_cls2syntax(unsigned int tag, - unsigned int cls, - unsigned short *syntax) -{ - struct snmp_cnv *cnv; - - cnv = snmp_conv; - - while (cnv->syntax != -1) { - if (cnv->tag == tag && cnv->class == cls) { - *syntax = cnv->syntax; - return 1; - } - cnv++; - } - return 0; -} - -static unsigned char snmp_object_decode(struct asn1_ctx *ctx, - struct snmp_object **obj) -{ - unsigned int cls, con, tag, len, idlen; - unsigned short type; - unsigned char *eoc, *end, *p; - unsigned long *lp, *id; - unsigned long ul; - long l; - - *obj = NULL; - id = NULL; - - if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &id, &idlen)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { - kfree(id); - return 0; - } - - if (con != ASN1_PRI) { - kfree(id); - return 0; - } - - type = 0; - if (!snmp_tag_cls2syntax(tag, cls, &type)) { - kfree(id); - return 0; - } - - l = 0; - switch (type) { - case SNMP_INTEGER: - len = sizeof(long); - if (!asn1_long_decode(ctx, end, &l)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, - GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - (*obj)->syntax.l[0] = l; - break; - case SNMP_OCTETSTR: - case SNMP_OPAQUE: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, - GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.c, p, len); - kfree(p); - break; - case SNMP_NULL: - case SNMP_NOSUCHOBJECT: - case SNMP_NOSUCHINSTANCE: - case SNMP_ENDOFMIBVIEW: - len = 0; - *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - if (!asn1_null_decode(ctx, end)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - break; - case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { - kfree(id); - return 0; - } - len *= sizeof(unsigned long); - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(lp); - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.ul, lp, len); - kfree(lp); - break; - case SNMP_IPADDR: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - if (len != 4) { - kfree(p); - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.uc, p, len); - kfree(p); - break; - case SNMP_COUNTER: - case SNMP_GAUGE: - case SNMP_TIMETICKS: - len = sizeof(unsigned long); - if (!asn1_ulong_decode(ctx, end, &ul)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - (*obj)->syntax.ul[0] = ul; - break; - default: - kfree(id); - return 0; - } - - (*obj)->syntax_len = len; - (*obj)->type = type; - (*obj)->id = id; - (*obj)->id_len = idlen; - - if (!asn1_eoc_decode(ctx, eoc)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - return 1; -} - -static unsigned char snmp_request_decode(struct asn1_ctx *ctx, - struct snmp_request *request) -{ - unsigned int cls, con, tag; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_ulong_decode(ctx, end, &request->id)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_status)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_index)) - return 0; - - return 1; -} - -/* - * Fast checksum update for possibly oddly-aligned UDP byte, from the - * code example in the draft. - */ -static void fast_csum(__sum16 *csum, - const unsigned char *optr, - const unsigned char *nptr, - int offset) -{ - unsigned char s[4]; - - if (offset & 1) { - s[0] = s[2] = 0; - s[1] = ~*optr; - s[3] = *nptr; - } else { - s[1] = s[3] = 0; - s[0] = ~*optr; - s[2] = *nptr; - } - - *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); -} - -/* - * Mangle IP address. - * - begin points to the start of the snmp messgae - * - addr points to the start of the address - */ -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check) -{ - if (map->from == NOCT1(addr)) { - u_int32_t old; - - if (debug) - memcpy(&old, (unsigned char *)addr, sizeof(old)); - - *addr = map->to; - - /* Update UDP checksum if being used */ - if (*check) { - fast_csum(check, - &map->from, &map->to, addr - begin); - } - - if (debug) - printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to " - "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr)); - } -} - -static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, - struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned int cls, con, tag, len; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_id_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) - goto err_id_free; - - if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) - goto err_id_free; - - /* IPv4 only */ - if (len != 4) - goto err_addr_free; - - mangle_address(ctx->begin, ctx->pointer - 4, map, check); - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->general)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->specific)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) - goto err_addr_free; - - if (!asn1_ulong_decode(ctx, end, &trap->time)) - goto err_addr_free; - - return 1; - -err_addr_free: - kfree((unsigned long *)trap->ip_address); - -err_id_free: - kfree(trap->id); - - return 0; -} - -/***************************************************************************** - * - * Misc. routines - * - *****************************************************************************/ - -static void hex_dump(unsigned char *buf, size_t len) -{ - size_t i; - - for (i = 0; i < len; i++) { - if (i && !(i % 16)) - printk("\n"); - printk("%02x ", *(buf + i)); - } - printk("\n"); -} - -/* - * Parse and mangle SNMP message according to mapping. - * (And this is the fucking 'basic' method). - */ -static int snmp_parse_mangle(unsigned char *msg, - u_int16_t len, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned char *eoc, *end; - unsigned int cls, con, tag, vers, pdutype; - struct asn1_ctx ctx; - struct asn1_octstr comm; - struct snmp_object **obj; - - if (debug > 1) - hex_dump(msg, len); - - asn1_open(&ctx, msg, len); - - /* - * Start of SNMP message. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - /* - * Version 1 or 2 handled. - */ - if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - if (!asn1_uint_decode (&ctx, end, &vers)) - return 0; - if (debug > 1) - printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); - if (vers > 1) - return 1; - - /* - * Community. - */ - if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) - return 0; - if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) - return 0; - if (debug > 1) { - unsigned int i; - - printk(KERN_DEBUG "bsalg: community: "); - for (i = 0; i < comm.len; i++) - printk("%c", comm.data[i]); - printk("\n"); - } - kfree(comm.data); - - /* - * PDU type - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) - return 0; - if (cls != ASN1_CTX || con != ASN1_CON) - return 0; - if (debug > 1) { - unsigned char *pdus[] = { - [SNMP_PDU_GET] = "get", - [SNMP_PDU_NEXT] = "get-next", - [SNMP_PDU_RESPONSE] = "response", - [SNMP_PDU_SET] = "set", - [SNMP_PDU_TRAP1] = "trapv1", - [SNMP_PDU_BULK] = "bulk", - [SNMP_PDU_INFORM] = "inform", - [SNMP_PDU_TRAP2] = "trapv2" - }; - - if (pdutype > SNMP_PDU_TRAP2) - printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); - else - printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); - } - if (pdutype != SNMP_PDU_RESPONSE && - pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) - return 1; - - /* - * Request header or v1 trap - */ - if (pdutype == SNMP_PDU_TRAP1) { - struct snmp_v1_trap trap; - unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); - - if (ret) { - kfree(trap.id); - kfree((unsigned long *)trap.ip_address); - } else - return ret; - - } else { - struct snmp_request req; - - if (!snmp_request_decode(&ctx, &req)) - return 0; - - if (debug > 1) - printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " - "error_index=%u\n", req.id, req.error_status, - req.error_index); - } - - /* - * Loop through objects, look for IP addresses to mangle. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (obj == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); - return 0; - } - - while (!asn1_eoc_decode(&ctx, eoc)) { - unsigned int i; - - if (!snmp_object_decode(&ctx, obj)) { - if (*obj) { - kfree((*obj)->id); - kfree(*obj); - } - kfree(obj); - return 0; - } - - if (debug > 1) { - printk(KERN_DEBUG "bsalg: object: "); - for (i = 0; i < (*obj)->id_len; i++) { - if (i > 0) - printk("."); - printk("%lu", (*obj)->id[i]); - } - printk(": type=%u\n", (*obj)->type); - - } - - if ((*obj)->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4 , map, check); - - kfree((*obj)->id); - kfree(*obj); - } - kfree(obj); - - if (!asn1_eoc_decode(&ctx, eoc)) - return 0; - - return 1; -} - -/***************************************************************************** - * - * NAT routines. - * - *****************************************************************************/ - -/* - * SNMP translation routine. - */ -static int snmp_translate(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - u_int16_t udplen = ntohs(udph->len); - u_int16_t paylen = udplen - sizeof(struct udphdr); - int dir = CTINFO2DIR(ctinfo); - struct oct1_map map; - - /* - * Determine mappping for application layer addresses based - * on NAT manipulations for the packet. - */ - if (dir == IP_CT_DIR_ORIGINAL) { - /* SNAT traps */ - map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip); - map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip); - } else { - /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); - map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip); - } - - if (map.from == map.to) - return NF_ACCEPT; - - if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), - paylen, &map, &udph->check)) { - if (net_ratelimit()) - printk(KERN_WARNING "bsalg: parser failed\n"); - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We don't actually set up expectations, just adjust internal IP - * addresses if this is being NATted */ -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - int dir = CTINFO2DIR(ctinfo); - unsigned int ret; - struct iphdr *iph = (*pskb)->nh.iph; - struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); - - /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) - return NF_ACCEPT; - if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* No NAT? */ - if (!(ct->status & IPS_NAT_MASK)) - return NF_ACCEPT; - - /* - * Make sure the packet length is ok. So far, we were only guaranteed - * to have a valid length IP header plus 8 bytes, which means we have - * enough room for a UDP header. Just verify the UDP length field so we - * can mess around with the payload. - */ - if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) { - if (net_ratelimit()) - printk(KERN_WARNING "SNMP: dropping malformed packet " - "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - return NF_DROP; - } - - if (!skb_make_writable(pskb, (*pskb)->len)) - return NF_DROP; - - spin_lock_bh(&snmp_lock); - ret = snmp_translate(ct, ctinfo, pskb); - spin_unlock_bh(&snmp_lock); - return ret; -} - -static struct ip_conntrack_helper snmp_helper = { - .max_expected = 0, - .timeout = 180, - .me = THIS_MODULE, - .help = help, - .name = "snmp", - - .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_PORT)}}}, - .dst = {.protonum = IPPROTO_UDP}, - }, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}, - }, -}; - -static struct ip_conntrack_helper snmp_trap_helper = { - .max_expected = 0, - .timeout = 180, - .me = THIS_MODULE, - .help = help, - .name = "snmp_trap", - - .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_TRAP_PORT)}}}, - .dst = {.protonum = IPPROTO_UDP}, - }, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}, - }, -}; - -/***************************************************************************** - * - * Module stuff. - * - *****************************************************************************/ - -static int __init ip_nat_snmp_basic_init(void) -{ - int ret = 0; - - ret = ip_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; - ret = ip_conntrack_helper_register(&snmp_trap_helper); - if (ret < 0) { - ip_conntrack_helper_unregister(&snmp_helper); - return ret; - } - return ret; -} - -static void __exit ip_nat_snmp_basic_fini(void) -{ - ip_conntrack_helper_unregister(&snmp_helper); - ip_conntrack_helper_unregister(&snmp_trap_helper); -} - -module_init(ip_nat_snmp_basic_init); -module_exit(ip_nat_snmp_basic_fini); - -module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c deleted file mode 100644 index 6bcfdf6dfcc..00000000000 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ /dev/null @@ -1,388 +0,0 @@ -/* This file contains all the functions required for the standalone - ip_nat module. - - These are not required by the compatibility layer. -*/ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> - * - new API and handling of conntrack/nat helpers - * - now capable of multiple expectations for one master - * */ - -#include <linux/types.h> -#include <linux/icmp.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/spinlock.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -#ifdef CONFIG_XFRM -static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) -{ - struct ip_conntrack *ct; - struct ip_conntrack_tuple *t; - enum ip_conntrack_info ctinfo; - enum ip_conntrack_dir dir; - unsigned long statusbit; - - ct = ip_conntrack_get(skb, &ctinfo); - if (ct == NULL) - return; - dir = CTINFO2DIR(ctinfo); - t = &ct->tuplehash[dir].tuple; - - if (dir == IP_CT_DIR_ORIGINAL) - statusbit = IPS_DST_NAT; - else - statusbit = IPS_SRC_NAT; - - if (ct->status & statusbit) { - fl->fl4_dst = t->dst.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP) - fl->fl_ip_dport = t->dst.u.tcp.port; - } - - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - fl->fl4_src = t->src.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP) - fl->fl_ip_sport = t->src.u.tcp.port; - } -} -#endif - -static unsigned int -ip_nat_fn(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - struct ip_nat_info *info; - /* maniptype == SRC for postrouting. */ - enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); - - /* We never see fragments: conntrack defrags on pre-routing - and local-out, and ip_nat_out protects post-routing. */ - IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off - & htons(IP_MF|IP_OFFSET))); - - ct = ip_conntrack_get(*pskb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - have dropped it. Hence it's the user's responsibilty to - packet filter it out, or implement conntrack/NAT for that - protocol. 8) --RR */ - if (!ct) { - /* Exception: ICMP redirect to new connection (not in - hash table yet). We must not let this through, in - case we're doing NAT to the same network. */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { - struct icmphdr _hdr, *hp; - - hp = skb_header_pointer(*pskb, - (*pskb)->nh.iph->ihl*4, - sizeof(_hdr), &_hdr); - if (hp != NULL && - hp->type == ICMP_REDIRECT) - return NF_DROP; - } - return NF_ACCEPT; - } - - /* Don't try to NAT if this packet is not conntracked */ - if (ct == &ip_conntrack_untracked) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED+IP_CT_IS_REPLY: - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { - if (!ip_nat_icmp_reply_translation(ct, ctinfo, - hooknum, pskb)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - info = &ct->nat.info; - - /* Seen it before? This can happen for loopback, retrans, - or local packets.. */ - if (!ip_nat_initialized(ct, maniptype)) { - unsigned int ret; - - if (unlikely(is_confirmed(ct))) - /* NAT module was loaded late */ - ret = alloc_null_binding_confirmed(ct, info, - hooknum); - else if (hooknum == NF_IP_LOCAL_IN) - /* LOCAL_IN hook doesn't have a chain! */ - ret = alloc_null_binding(ct, info, hooknum); - else - ret = ip_nat_rule_find(pskb, hooknum, - in, out, ct, - info); - - if (ret != NF_ACCEPT) { - return ret; - } - } else - DEBUGP("Already setup manip %s for ct %p\n", - maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - break; - - default: - /* ESTABLISHED */ - IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED - || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); - info = &ct->nat.info; - } - - IP_NF_ASSERT(info); - return ip_nat_packet(ct, ctinfo, hooknum, pskb); -} - -static unsigned int -ip_nat_in(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - unsigned int ret; - __be32 daddr = (*pskb)->nh.iph->daddr; - - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN - && daddr != (*pskb)->nh.iph->daddr) { - dst_release((*pskb)->dst); - (*pskb)->dst = NULL; - } - return ret; -} - -static unsigned int -ip_nat_out(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#ifdef CONFIG_XFRM - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN - && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip - || ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all - ) - return ip_xfrm_me_harder(pskb) == 0 ? ret : NF_DROP; - } -#endif - return ret; -} - -static unsigned int -ip_nat_local_fn(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - - /* root is playing with raw sockets. */ - if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN - && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.ip != - ct->tuplehash[!dir].tuple.src.ip) { - if (ip_route_me_harder(pskb, RTN_UNSPEC)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (ip_xfrm_me_harder(pskb)) - ret = NF_DROP; -#endif - - } - return ret; -} - -static unsigned int -ip_nat_adjust(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { - DEBUGP("ip_nat_standalone: adjusting sequence number\n"); - if (!ip_nat_seq_adjust(pskb, ct, ctinfo)) - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We must be after connection tracking and before packet filtering. */ - -static struct nf_hook_ops ip_nat_ops[] = { - /* Before packet filtering, change destination */ - { - .hook = ip_nat_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = ip_nat_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC, - }, - /* After conntrack, adjust sequence number */ - { - .hook = ip_nat_adjust, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SEQ_ADJUST, - }, - /* Before packet filtering, change destination */ - { - .hook = ip_nat_local_fn, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = ip_nat_fn, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SRC, - }, - /* After conntrack, adjust sequence number */ - { - .hook = ip_nat_adjust, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SEQ_ADJUST, - }, -}; - -static int __init ip_nat_standalone_init(void) -{ - int ret = 0; - - need_conntrack(); - -#ifdef CONFIG_XFRM - BUG_ON(ip_nat_decode_session != NULL); - ip_nat_decode_session = nat_decode_session; -#endif - ret = ip_nat_rule_init(); - if (ret < 0) { - printk("ip_nat_init: can't setup rules.\n"); - goto cleanup_decode_session; - } - ret = nf_register_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops)); - if (ret < 0) { - printk("ip_nat_init: can't register hooks.\n"); - goto cleanup_rule_init; - } - return ret; - - cleanup_rule_init: - ip_nat_rule_cleanup(); - cleanup_decode_session: -#ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; - synchronize_net(); -#endif - return ret; -} - -static void __exit ip_nat_standalone_fini(void) -{ - nf_unregister_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops)); - ip_nat_rule_cleanup(); -#ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; - synchronize_net(); -#endif -} - -module_init(ip_nat_standalone_init); -module_exit(ip_nat_standalone_fini); - -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c deleted file mode 100644 index 604793536fc..00000000000 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ /dev/null @@ -1,70 +0,0 @@ -/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Version: 0.0.7 - * - * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> - * - Port to newnat API - * - * This module currently supports DNAT: - * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y - * - * and SNAT: - * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x } - * - * It has not been tested with - * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip - * If you do test this please let me know if it works or not. - * - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/moduleparam.h> - -MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); -MODULE_DESCRIPTION("tftp NAT helper"); -MODULE_LICENSE("GPL"); - -static unsigned int help(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp) -{ - struct ip_conntrack *ct = exp->master; - - exp->saved_proto.udp.port - = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; - exp->dir = IP_CT_DIR_REPLY; - exp->expectfn = ip_nat_follow_master; - if (ip_conntrack_expect_related(exp) != 0) - return NF_DROP; - return NF_ACCEPT; -} - -static void __exit ip_nat_tftp_fini(void) -{ - rcu_assign_pointer(ip_nat_tftp_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_tftp_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_tftp_hook)); - rcu_assign_pointer(ip_nat_tftp_hook, help); - return 0; -} - -module_init(ip_nat_tftp_init); -module_exit(ip_nat_tftp_fini); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index a14798a850d..702d94db19b 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -8,18 +8,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). - * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). - * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian - * Zander). - * 2000-08-01: Added Nick Williams' MAC support. - * 2002-06-25: Code cleanup. - * 2005-01-10: Added /proc counter for dropped packets; fixed so - * packets aren't delivered to user space if they're going - * to be dropped. - * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) - * */ #include <linux/module.h> #include <linux/skbuff.h> @@ -191,12 +179,13 @@ ipq_flush(int verdict) static struct sk_buff * ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) { - unsigned char *old_tail; + sk_buff_data_t old_tail; size_t size = 0; size_t data_len = 0; struct sk_buff *skb; struct ipq_packet_msg *pmsg; struct nlmsghdr *nlh; + struct timeval tv; read_lock_bh(&queue_lock); @@ -234,15 +223,16 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) if (!skb) goto nlmsg_failure; - old_tail= skb->tail; + old_tail = skb->tail; nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); pmsg = NLMSG_DATA(nlh); memset(pmsg, 0, sizeof(*pmsg)); pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->tstamp.off_sec; - pmsg->timestamp_usec = entry->skb->tstamp.off_usec; + tv = ktime_to_timeval(entry->skb->tstamp); + pmsg->timestamp_sec = tv.tv_sec; + pmsg->timestamp_usec = tv.tv_usec; pmsg->mark = entry->skb->mark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; @@ -378,7 +368,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; - memcpy(e->skb->data, v->payload, v->data_len); + skb_copy_to_linear_data(e->skb, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; @@ -495,7 +485,7 @@ ipq_rcv_skb(struct sk_buff *skb) if (skblen < sizeof(*nlh)) return; - nlh = (struct nlmsghdr *)skb->data; + nlh = nlmsg_hdr(skb); nlmsglen = nlh->nlmsg_len; if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) return; @@ -678,7 +668,7 @@ static int __init ip_queue_init(void) netlink_register_notifier(&ipq_nl_notifier); ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, - THIS_MODULE); + NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 50cc4b92e28..e3f83bf160d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -7,12 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> - * - increase module usage count as soon as we have rules inside - * a table - * 08 Oct 2005 Harald Welte <lafore@netfilter.org> - * - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables" */ #include <linux/cache.h> #include <linux/capability.h> @@ -198,7 +192,7 @@ int do_match(struct ipt_entry_match *m, { /* Stop iteration if it doesn't match */ if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, - offset, skb->nh.iph->ihl*4, hotdrop)) + offset, ip_hdrlen(skb), hotdrop)) return 1; else return 0; @@ -231,7 +225,7 @@ ipt_do_table(struct sk_buff **pskb, struct xt_table_info *private; /* Initialization */ - ip = (*pskb)->nh.iph; + ip = ip_hdr(*pskb); datalen = (*pskb)->len - ip->ihl * 4; indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; @@ -320,7 +314,7 @@ ipt_do_table(struct sk_buff **pskb, = 0x57acc001; #endif /* Target might have changed stuff. */ - ip = (*pskb)->nh.iph; + ip = ip_hdr(*pskb); datalen = (*pskb)->len - ip->ihl * 4; if (verdict == IPT_CONTINUE) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e965b333c99..40e27342139 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -21,15 +21,12 @@ #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> - -#include <net/checksum.h> - #include <linux/netfilter_arp.h> - #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <net/netfilter/nf_conntrack_compat.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/checksum.h> #define CLUSTERIP_VERSION "0.8" @@ -240,7 +237,7 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) static inline u_int32_t clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); unsigned long hashval; u_int16_t sport, dport; u_int16_t *ports; @@ -310,15 +307,16 @@ target(struct sk_buff **pskb, const void *targinfo) { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; - u_int32_t *mark, hash; + u_int32_t hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ - mark = nf_ct_get_mark((*pskb), &ctinfo); - if (mark == NULL) { + ct = nf_ct_get(*pskb, &ctinfo); + if (ct == NULL) { printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be @@ -328,7 +326,7 @@ target(struct sk_buff **pskb, /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP + if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) return XT_CONTINUE; @@ -341,7 +339,7 @@ target(struct sk_buff **pskb, switch (ctinfo) { case IP_CT_NEW: - *mark = hash; + ct->mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -358,7 +356,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%u ", hash, *mark); + DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; @@ -411,12 +409,10 @@ checkentry(const char *tablename, "has invalid config pointer!\n"); return 0; } - clusterip_config_entry_get(cipinfo->config); } else { /* Case B: This is a new rule referring to an existing * clusterip config. */ cipinfo->config = config; - clusterip_config_entry_get(cipinfo->config); } } else { /* Case C: This is a completely new clusterip config */ @@ -523,7 +519,7 @@ arp_mangle(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct arphdr *arp = (*pskb)->nh.arph; + struct arphdr *arp = arp_hdr(*pskb); struct arp_payload *payload; struct clusterip_config *c; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 4f565633631..918ca92e534 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -5,14 +5,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp */ #include <linux/in.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <net/ip.h> #include <linux/tcp.h> #include <net/checksum.h> @@ -29,13 +28,13 @@ MODULE_DESCRIPTION("iptables ECN modification module"); static inline int set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) { - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; if (!skb_make_writable(pskb, sizeof(struct iphdr))) return 0; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); oldtos = iph->tos; iph->tos &= ~IPT_ECN_IP_MASK; iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); @@ -52,7 +51,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) __be16 oldval; /* Not enought header? */ - tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + tcph = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_tcph), &_tcph); if (!tcph) return 0; @@ -63,9 +62,9 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr))) return 1; - if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph))) return 0; - tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + tcph = (void *)ip_hdr(*pskb) + ip_hdrlen(*pskb); oldval = ((__be16 *)tcph)[6]; if (einfo->operation & IPT_ECN_OP_SET_ECE) @@ -93,7 +92,7 @@ target(struct sk_buff **pskb, return NF_DROP; if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) - && (*pskb)->nh.iph->protocol == IPPROTO_TCP) + && ip_hdr(*pskb)->protocol == IPPROTO_TCP) if (!set_ect_tcp(pskb, einfo)) return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index d9c37fd9422..a42c5cd968b 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -399,9 +399,9 @@ ipt_log_packet(unsigned int pf, /* MAC logging for input chain only. */ printk("MAC="); if (skb->dev && skb->dev->hard_header_len - && skb->mac.raw != (void*)skb->nh.iph) { + && skb->mac_header != skb->network_header) { int i; - unsigned char *p = skb->mac.raw; + const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) printk("%02x%c", *p, i==skb->dev->hard_header_len - 1 @@ -477,14 +477,10 @@ static int __init ipt_log_init(void) ret = xt_register_target(&ipt_log_reg); if (ret < 0) return ret; - if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { - printk(KERN_WARNING "ipt_LOG: not logging via system console " - "since somebody else already registered for PF_INET\n"); - /* we cannot make module load fail here, since otherwise - * iptables userspace would abort */ - } - - return 0; + ret = nf_log_register(PF_INET, &ipt_log_logger); + if (ret < 0 && ret != -EEXIST) + xt_unregister_target(&ipt_log_reg); + return ret; } static void __exit ipt_log_fini(void) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index b5955f3a3f8..d4f2d777533 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -19,12 +19,8 @@ #include <net/ip.h> #include <net/checksum.h> #include <net/route.h> -#include <linux/netfilter_ipv4.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif +#include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); @@ -48,7 +44,7 @@ masquerade_check(const char *tablename, void *targinfo, unsigned int hook_mask) { - const struct ip_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { DEBUGP("masquerade_check: bad MAP_IPS.\n"); @@ -69,33 +65,26 @@ masquerade_target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { -#ifdef CONFIG_NF_NAT_NEEDED + struct nf_conn *ct; struct nf_conn_nat *nat; -#endif - struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; - struct ip_nat_range newrange; - const struct ip_nat_multi_range_compat *mr; + struct nf_nat_range newrange; + const struct nf_nat_multi_range_compat *mr; struct rtable *rt; __be32 newsrc; - IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING); - ct = ip_conntrack_get(*pskb, &ctinfo); -#ifdef CONFIG_NF_NAT_NEEDED + ct = nf_ct_get(*pskb, &ctinfo); nat = nfct_nat(ct); -#endif - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ -#ifdef CONFIG_NF_NAT_NEEDED if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) -#else - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == 0) -#endif return NF_ACCEPT; mr = targinfo; @@ -107,40 +96,30 @@ masquerade_target(struct sk_buff **pskb, } write_lock_bh(&masq_lock); -#ifdef CONFIG_NF_NAT_NEEDED nat->masq_index = out->ifindex; -#else - ct->nat.masq_index = out->ifindex; -#endif write_unlock_bh(&masq_lock); /* Transfer from original range. */ - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, newsrc, newsrc, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static inline int -device_cmp(struct ip_conntrack *i, void *ifindex) +device_cmp(struct nf_conn *i, void *ifindex) { - int ret; -#ifdef CONFIG_NF_NAT_NEEDED struct nf_conn_nat *nat = nfct_nat(i); + int ret; if (!nat) return 0; -#endif read_lock_bh(&masq_lock); -#ifdef CONFIG_NF_NAT_NEEDED ret = (nat->masq_index == (int)(long)ifindex); -#else - ret = (i->nat.masq_index == (int)(long)ifindex); -#endif read_unlock_bh(&masq_lock); return ret; @@ -156,9 +135,9 @@ static int masq_device_event(struct notifier_block *this, /* Device was downed. Search entire table for conntracks which were associated with that device, and forget them. */ - IP_NF_ASSERT(dev->ifindex != 0); + NF_CT_ASSERT(dev->ifindex != 0); - ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); } return NOTIFY_DONE; @@ -174,9 +153,9 @@ static int masq_inet_event(struct notifier_block *this, /* IP address was deleted. Search entire table for conntracks which were associated with that device, and forget them. */ - IP_NF_ASSERT(dev->ifindex != 0); + NF_CT_ASSERT(dev->ifindex != 0); - ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); } return NOTIFY_DONE; @@ -194,7 +173,7 @@ static struct xt_target masquerade = { .name = "MASQUERADE", .family = AF_INET, .target = masquerade_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), + .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", .hooks = 1 << NF_IP_POST_ROUTING, .checkentry = masquerade_check, diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index fd7aaa347cd..068c69bce30 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -16,11 +16,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif #define MODULENAME "NETMAP" MODULE_LICENSE("GPL"); @@ -40,7 +36,7 @@ check(const char *tablename, void *targinfo, unsigned int hook_mask) { - const struct ip_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { DEBUGP(MODULENAME":check: bad MAP_IPS.\n"); @@ -61,39 +57,39 @@ target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ip_conntrack *ct; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; - const struct ip_nat_multi_range_compat *mr = targinfo; - struct ip_nat_range newrange; + const struct nf_nat_multi_range_compat *mr = targinfo; + struct nf_nat_range newrange; - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_OUT); - ct = ip_conntrack_get(*pskb, &ctinfo); + ct = nf_ct_get(*pskb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) - new_ip = (*pskb)->nh.iph->daddr & ~netmask; + new_ip = ip_hdr(*pskb)->daddr & ~netmask; else - new_ip = (*pskb)->nh.iph->saddr & ~netmask; + new_ip = ip_hdr(*pskb)->saddr & ~netmask; new_ip |= mr->range[0].min_ip & netmask; - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, new_ip, new_ip, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static struct xt_target target_module = { .name = MODULENAME, .family = AF_INET, .target = target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), + .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_LOCAL_OUT), diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index c2b6b80670f..68cc76a198e 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -19,11 +19,7 @@ #include <net/checksum.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -43,7 +39,7 @@ redirect_check(const char *tablename, void *targinfo, unsigned int hook_mask) { - const struct ip_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { DEBUGP("redirect_check: bad MAP_IPS.\n"); @@ -64,17 +60,17 @@ redirect_target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ip_conntrack *ct; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - const struct ip_nat_multi_range_compat *mr = targinfo; - struct ip_nat_range newrange; + const struct nf_nat_multi_range_compat *mr = targinfo; + struct nf_nat_range newrange; - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT); - ct = ip_conntrack_get(*pskb, &ctinfo); - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + ct = nf_ct_get(*pskb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); /* Local packets: make them go to loopback */ if (hooknum == NF_IP_LOCAL_OUT) @@ -96,20 +92,20 @@ redirect_target(struct sk_buff **pskb, } /* Transfer from original range. */ - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, newdst, newdst, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static struct xt_target redirect_reg = { .name = "REDIRECT", .family = AF_INET, .target = redirect_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), + .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), .checkentry = redirect_check, diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 80f739e2182..9041e0741f6 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -1,7 +1,5 @@ /* * This is a module which is used for rejecting packets. - * Added support for customized reject packets (Jozsef Kadlecsik). - * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812] */ /* (C) 1999-2001 Paul `Rusty' Russell @@ -43,7 +41,7 @@ MODULE_DESCRIPTION("iptables REJECT target module"); static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - struct iphdr *iph = oldskb->nh.iph; + struct iphdr *niph; struct tcphdr _otcph, *oth, *tcph; __be16 tmp_port; __be32 tmp_addr; @@ -51,10 +49,10 @@ static void send_reset(struct sk_buff *oldskb, int hook) unsigned int addr_type; /* IP header checks: fragment. */ - if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return; - oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4, + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), sizeof(_otcph), &_otcph); if (oth == NULL) return; @@ -64,7 +62,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) return; /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, iph->ihl * 4, IPPROTO_TCP)) + if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) return; /* We need a linear, writeable skb. We also need to expand @@ -84,20 +82,21 @@ static void send_reset(struct sk_buff *oldskb, int hook) skb_shinfo(nskb)->gso_segs = 0; skb_shinfo(nskb)->gso_type = 0; - tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); + tcph = (struct tcphdr *)(skb_network_header(nskb) + ip_hdrlen(nskb)); /* Swap source and dest */ - tmp_addr = nskb->nh.iph->saddr; - nskb->nh.iph->saddr = nskb->nh.iph->daddr; - nskb->nh.iph->daddr = tmp_addr; + niph = ip_hdr(nskb); + tmp_addr = niph->saddr; + niph->saddr = niph->daddr; + niph->daddr = tmp_addr; tmp_port = tcph->source; tcph->source = tcph->dest; tcph->dest = tmp_port; /* Truncate to length (no data) */ tcph->doff = sizeof(struct tcphdr)/4; - skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr)); - nskb->nh.iph->tot_len = htons(nskb->len); + skb_trim(nskb, ip_hdrlen(nskb) + sizeof(struct tcphdr)); + niph->tot_len = htons(nskb->len); if (tcph->ack) { needs_ack = 0; @@ -105,9 +104,9 @@ static void send_reset(struct sk_buff *oldskb, int hook) tcph->ack_seq = 0; } else { needs_ack = 1; - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin - + oldskb->len - oldskb->nh.iph->ihl*4 - - (oth->doff<<2)); + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - ip_hdrlen(oldskb) - + (oth->doff << 2)); tcph->seq = 0; } @@ -122,14 +121,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* Adjust TCP checksum */ tcph->check = 0; tcph->check = tcp_v4_check(sizeof(struct tcphdr), - nskb->nh.iph->saddr, - nskb->nh.iph->daddr, + niph->saddr, niph->daddr, csum_partial((char *)tcph, sizeof(struct tcphdr), 0)); /* Set DF, id = 0 */ - nskb->nh.iph->frag_off = htons(IP_DF); - nskb->nh.iph->id = 0; + niph->frag_off = htons(IP_DF); + niph->id = 0; addr_type = RTN_UNSPEC; if (hook != NF_IP_FORWARD @@ -145,12 +143,11 @@ static void send_reset(struct sk_buff *oldskb, int hook) nskb->ip_summed = CHECKSUM_NONE; /* Adjust IP TTL */ - nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); + niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); /* Adjust IP checksum */ - nskb->nh.iph->check = 0; - nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph, - nskb->nh.iph->ihl); + niph->check = 0; + niph->check = ip_fast_csum(skb_network_header(nskb), niph->ihl); /* "Never happens" */ if (nskb->len > dst_mtu(nskb->dst)) @@ -182,7 +179,7 @@ static unsigned int reject(struct sk_buff **pskb, /* Our naive response construction doesn't deal with IP options, and probably shouldn't try. */ - if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr)) + if (ip_hdrlen(*pskb) != sizeof(struct iphdr)) return NF_DROP; /* WARNING: This code causes reentry within iptables. diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c index bd4404e5c68..511e5ff8493 100644 --- a/net/ipv4/netfilter/ipt_SAME.c +++ b/net/ipv4/netfilter/ipt_SAME.c @@ -7,21 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 010320 Martin Josefsson <gandalf@wlug.westbo.se> - * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things. - * 010728 Martin Josefsson <gandalf@wlug.westbo.se> - * * added --nodst to not include destination-ip in new source - * calculations. - * * added some more sanity-checks. - * 010729 Martin Josefsson <gandalf@wlug.westbo.se> - * * fixed a buggy if-statement in same_check(), should have - * used ntohl() but didn't. - * * added support for multiple ranges. IPT_SAME_MAX_RANGE is - * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h - * and is currently set to 10. - * * added support for 1-address range, nice to have now that - * we have multiple ranges. */ #include <linux/types.h> #include <linux/ip.h> @@ -35,11 +20,7 @@ #include <net/checksum.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif #include <linux/netfilter_ipv4/ipt_SAME.h> MODULE_LICENSE("GPL"); @@ -138,17 +119,17 @@ same_target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ip_conntrack *ct; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t tmpip, aindex; __be32 new_ip; const struct ipt_same_info *same = targinfo; - struct ip_nat_range newrange; - const struct ip_conntrack_tuple *t; + struct nf_nat_range newrange; + const struct nf_conntrack_tuple *t; - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || + NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING); - ct = ip_conntrack_get(*pskb, &ctinfo); + ct = nf_ct_get(*pskb, &ctinfo); t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; @@ -157,17 +138,10 @@ same_target(struct sk_buff **pskb, Here we calculate the index in same->iparray which holds the ipaddress we should use */ -#ifdef CONFIG_NF_NAT_NEEDED tmpip = ntohl(t->src.u3.ip); if (!(same->info & IPT_SAME_NODST)) tmpip += ntohl(t->dst.u3.ip); -#else - tmpip = ntohl(t->src.ip); - - if (!(same->info & IPT_SAME_NODST)) - tmpip += ntohl(t->dst.ip); -#endif aindex = tmpip % same->ipnum; new_ip = htonl(same->iparray[aindex]); @@ -178,13 +152,13 @@ same_target(struct sk_buff **pskb, NIPQUAD(new_ip)); /* Transfer from original range. */ - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { same->range[0].flags, new_ip, new_ip, /* FIXME: Use ports from correct range! */ same->range[0].min, same->range[0].max }); /* Hand modified range to generic setup. */ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static struct xt_target same_reg = { diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index cedf9f7d9d6..0ad02f24983 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -29,13 +29,13 @@ target(struct sk_buff **pskb, const void *targinfo) { const struct ipt_tos_target_info *tosinfo = targinfo; - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { __u8 oldtos; if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); oldtos = iph->tos; iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos; nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 64be31c22ba..a991ec7bd4e 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -32,7 +32,7 @@ ipt_ttl_target(struct sk_buff **pskb, if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); switch (info->mode) { case IPT_TTL_SET: diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index a26404dbe21..23b607b33b3 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -2,20 +2,6 @@ * netfilter module for userspace packet logging daemons * * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> - * - * 2000/09/22 ulog-cprange feature added - * 2001/01/04 in-kernel queue as proposed by Sebastian Zander - * <zander@fokus.gmd.de> - * 2001/01/30 per-rule nlgroup conflicts with global queue. - * nlgroup now global (sysctl) - * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at - * module loadtime -HW - * 2002/07/07 remove broken nflog_rcv() function -HW - * 2002/08/29 fix shifted/unshifted nlgroup bug -HW - * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen> - * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT - * resulting in bogus 'error during NLMSG_PUT' messages. - * * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * @@ -42,8 +28,6 @@ * flushtimeout: * Specify, after how many hundredths of a second the queue should be * flushed even if it is not full yet. - * - * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp */ #include <linux/module.h> @@ -61,6 +45,7 @@ #include <linux/netfilter_ipv4/ipt_ULOG.h> #include <net/sock.h> #include <linux/bitops.h> +#include <asm/unaligned.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); @@ -186,6 +171,7 @@ static void ipt_ulog_packet(unsigned int hooknum, ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; + struct timeval tv; /* ffs == find first bit set, necessary because userspace * is already shifting groupnumber, but we need unshifted. @@ -231,14 +217,15 @@ static void ipt_ulog_packet(unsigned int hooknum, pm = NLMSG_DATA(nlh); /* We might not have a timestamp, get one */ - if (skb->tstamp.off_sec == 0) + if (skb->tstamp.tv64 == 0) __net_timestamp((struct sk_buff *)skb); /* copy hook, prefix, timestamp, payload, etc. */ pm->data_len = copy_len; - pm->timestamp_sec = skb->tstamp.off_sec; - pm->timestamp_usec = skb->tstamp.off_usec; - pm->mark = skb->mark; + tv = ktime_to_timeval(skb->tstamp); + put_unaligned(tv.tv_sec, &pm->timestamp_sec); + put_unaligned(tv.tv_usec, &pm->timestamp_usec); + put_unaligned(skb->mark, &pm->mark); pm->hook = hooknum; if (prefix != NULL) strncpy(pm->prefix, prefix, sizeof(pm->prefix)); @@ -248,9 +235,9 @@ static void ipt_ulog_packet(unsigned int hooknum, *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 - && skb->mac.raw != (void *) skb->nh.iph + && skb->mac_header != skb->network_header && in->hard_header_len <= ULOG_MAC_LEN) { - memcpy(pm->mac, skb->mac.raw, in->hard_header_len); + memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); pm->mac_len = in->hard_header_len; } else pm->mac_len = 0; @@ -362,12 +349,52 @@ static int ipt_ulog_checkentry(const char *tablename, return 1; } +#ifdef CONFIG_COMPAT +struct compat_ipt_ulog_info { + compat_uint_t nl_group; + compat_size_t copy_range; + compat_size_t qthreshold; + char prefix[ULOG_PREFIX_LEN]; +}; + +static void compat_from_user(void *dst, void *src) +{ + struct compat_ipt_ulog_info *cl = src; + struct ipt_ulog_info l = { + .nl_group = cl->nl_group, + .copy_range = cl->copy_range, + .qthreshold = cl->qthreshold, + }; + + memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); + memcpy(dst, &l, sizeof(l)); +} + +static int compat_to_user(void __user *dst, void *src) +{ + struct ipt_ulog_info *l = src; + struct compat_ipt_ulog_info cl = { + .nl_group = l->nl_group, + .copy_range = l->copy_range, + .qthreshold = l->qthreshold, + }; + + memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); + return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + static struct xt_target ipt_ulog_reg = { .name = "ULOG", .family = AF_INET, .target = ipt_ulog_target, .targetsize = sizeof(struct ipt_ulog_info), .checkentry = ipt_ulog_checkentry, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_ipt_ulog_info), + .compat_from_user = compat_from_user, + .compat_to_user = compat_to_user, +#endif .me = THIS_MODULE, }; @@ -389,14 +416,11 @@ static int __init ipt_ulog_init(void) } /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - init_timer(&ulog_buffers[i].timer); - ulog_buffers[i].timer.function = ulog_timer; - ulog_buffers[i].timer.data = i; - } + for (i = 0; i < ULOG_MAXNLGROUPS; i++) + setup_timer(&ulog_buffers[i].timer, ulog_timer, i); nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - THIS_MODULE); + NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index cfa0472617f..a652a145155 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -33,7 +33,7 @@ static int match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ipt_addrtype_info *info = matchinfo; - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); int ret = 1; if (info->source) diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 37508b2cfea..26218122f86 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -1,7 +1,5 @@ /* IP tables module for matching the value of the IPv4 and TCP ECN bits * - * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp - * * (C) 2002 by Harald Welte <laforge@gnumonks.org> * * This program is free software; you can redistribute it and/or modify @@ -11,6 +9,7 @@ #include <linux/in.h> #include <linux/ip.h> +#include <net/ip.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/tcp.h> @@ -26,7 +25,7 @@ MODULE_LICENSE("GPL"); static inline int match_ip(const struct sk_buff *skb, const struct ipt_ecn_info *einfo) { - return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect); + return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; } static inline int match_tcp(const struct sk_buff *skb, @@ -38,8 +37,7 @@ static inline int match_tcp(const struct sk_buff *skb, /* In practice, TCP match does this, so can't fail. But let's * be good citizens. */ - th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) { *hotdrop = 0; return 0; @@ -80,7 +78,7 @@ static int match(const struct sk_buff *skb, return 0; if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { - if (skb->nh.iph->protocol != IPPROTO_TCP) + if (ip_hdr(skb)->protocol != IPPROTO_TCP) return 0; if (!match_tcp(skb, info, hotdrop)) return 0; diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c index bc5d5e6091e..33af9e94088 100644 --- a/net/ipv4/netfilter/ipt_iprange.c +++ b/net/ipv4/netfilter/ipt_iprange.c @@ -32,7 +32,7 @@ match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ipt_iprange_info *info = matchinfo; - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); if (info->flags & IPRANGE_SRC) { if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index aecb9c48e15..15a9e8bbb7c 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -183,11 +183,11 @@ ipt_recent_match(const struct sk_buff *skb, int ret = info->invert; if (info->side == IPT_RECENT_DEST) - addr = skb->nh.iph->daddr; + addr = ip_hdr(skb)->daddr; else - addr = skb->nh.iph->saddr; + addr = ip_hdr(skb)->saddr; - ttl = skb->nh.iph->ttl; + ttl = ip_hdr(skb)->ttl; /* use TTL as seen before forwarding */ if (out && !skb->sk) ttl++; diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c index 5d33b51d49d..d314844af12 100644 --- a/net/ipv4/netfilter/ipt_tos.c +++ b/net/ipv4/netfilter/ipt_tos.c @@ -30,7 +30,7 @@ match(const struct sk_buff *skb, { const struct ipt_tos_info *info = matchinfo; - return (skb->nh.iph->tos == info->tos) ^ info->invert; + return (ip_hdr(skb)->tos == info->tos) ^ info->invert; } static struct xt_match tos_match = { diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index 1eca9f40037..ab02d9e3139 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -1,7 +1,5 @@ /* IP tables module for matching the value of the TTL * - * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp - * * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> * * This program is free software; you can redistribute it and/or modify @@ -26,19 +24,20 @@ static int match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ipt_ttl_info *info = matchinfo; + const u8 ttl = ip_hdr(skb)->ttl; switch (info->mode) { case IPT_TTL_EQ: - return (skb->nh.iph->ttl == info->ttl); + return (ttl == info->ttl); break; case IPT_TTL_NE: - return (!(skb->nh.iph->ttl == info->ttl)); + return (!(ttl == info->ttl)); break; case IPT_TTL_LT: - return (skb->nh.iph->ttl < info->ttl); + return (ttl < info->ttl); break; case IPT_TTL_GT: - return (skb->nh.iph->ttl > info->ttl); + return (ttl > info->ttl); break; default: printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index d1d61e97b97..42728909eba 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -102,7 +103,7 @@ ipt_local_out_hook(unsigned int hook, { /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 98b66ef0c71..9278802f274 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -7,8 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * Extended to all five netfilter hooks by Brad Chapman & Harald Welte */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -17,6 +15,7 @@ #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> +#include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -130,13 +129,14 @@ ipt_local_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { unsigned int ret; + const struct iphdr *iph; u_int8_t tos; __be32 saddr, daddr; u_int32_t mark; /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; @@ -144,19 +144,23 @@ ipt_local_hook(unsigned int hook, /* Save things which could affect route */ mark = (*pskb)->mark; - saddr = (*pskb)->nh.iph->saddr; - daddr = (*pskb)->nh.iph->daddr; - tos = (*pskb)->nh.iph->tos; + iph = ip_hdr(*pskb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; ret = ipt_do_table(pskb, hook, in, out, &packet_mangler); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE - && ((*pskb)->nh.iph->saddr != saddr - || (*pskb)->nh.iph->daddr != daddr - || (*pskb)->mark != mark - || (*pskb)->nh.iph->tos != tos)) - if (ip_route_me_harder(pskb, RTN_UNSPEC)) - ret = NF_DROP; + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + iph = ip_hdr(*pskb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + (*pskb)->mark != mark || + iph->tos != tos) + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; + } return ret; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8f3e92d20df..0654eaae70c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -4,14 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - move L3 protocol dependent part to this file. - * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - add get_features() to support various size of conntrack - * structures. - * - * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c */ #include <linux/types.h> @@ -87,7 +79,7 @@ nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) local_bh_enable(); if (skb) - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); return skb; } @@ -97,16 +89,16 @@ ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, u_int8_t *protonum) { /* Never happen */ - if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (ip_hdr(*pskb)->frag_off & htons(IP_OFFSET)) { if (net_ratelimit()) { printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", - (*pskb)->nh.iph->protocol, hooknum); + ip_hdr(*pskb)->protocol, hooknum); } return -NF_DROP; } - *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; - *protonum = (*pskb)->nh.iph->protocol; + *dataoff = skb_network_offset(*pskb) + ip_hdrlen(*pskb); + *protonum = ip_hdr(*pskb)->protocol; return NF_ACCEPT; } @@ -152,9 +144,8 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, return NF_ACCEPT; return help->helper->help(pskb, - (*pskb)->nh.raw - (*pskb)->data - + (*pskb)->nh.iph->ihl*4, - ct, ctinfo); + skb_network_offset(*pskb) + ip_hdrlen(*pskb), + ct, ctinfo); } static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, @@ -171,7 +162,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #endif /* Gather fragments. */ - if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET)) { *pskb = nf_ct_ipv4_gather_frags(*pskb, hooknum == NF_IP_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN : @@ -199,7 +190,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, { /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 5fd1e5363c1..f4fc657c198 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -4,11 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - enable working with Layer 3 protocol independent connection tracking. - * - * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c */ #include <linux/types.h> @@ -158,7 +153,7 @@ icmp_error_message(struct sk_buff *skb, NF_CT_ASSERT(skb->nfct == NULL); /* Not enough header? */ - inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + inside = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_in), &_in); if (inside == NULL) return -NF_ACCEPT; @@ -172,7 +167,7 @@ icmp_error_message(struct sk_buff *skb, /* rcu_read_lock()ed by nf_hook_slow */ innerproto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); + dataoff = ip_hdrlen(skb) + sizeof(inside->icmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, inside->ip.protocol, &origtuple, @@ -227,7 +222,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, struct icmphdr _ih, *icmph; /* Not enough header? */ - icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 452e9d32668..ea02f00d2da 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -431,7 +431,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, } *inside; struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple inner, target; - int hdrlen = (*pskb)->nh.iph->ihl * 4; + int hdrlen = ip_hdrlen(*pskb); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned long statusbit; enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); @@ -439,7 +439,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) return 0; - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + inside = (void *)(*pskb)->data + ip_hdrlen(*pskb); /* We're actually going to mangle it beyond trivial checksum adjustment, so make sure the current checksum is correct. */ @@ -469,9 +469,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); if (!nf_ct_get_tuple(*pskb, - (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr), - (*pskb)->nh.iph->ihl*4 + - sizeof(struct icmphdr) + inside->ip.ihl*4, + ip_hdrlen(*pskb) + sizeof(struct icmphdr), + (ip_hdrlen(*pskb) + + sizeof(struct icmphdr) + inside->ip.ihl * 4), (u_int16_t)AF_INET, inside->ip.protocol, &inner, l3proto, l4proto)) @@ -483,14 +483,14 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, packet: PREROUTING (DST manip), routing produces ICMP, goes through POSTROUTING (which must correct the DST manip). */ if (!manip_pkt(inside->ip.protocol, pskb, - (*pskb)->nh.iph->ihl*4 + sizeof(inside->icmp), + ip_hdrlen(*pskb) + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt inner. */ - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + inside = (void *)(*pskb)->data + ip_hdrlen(*pskb); inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen, diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9cbf3f9be13..fcebc968d37 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -33,7 +33,7 @@ static int set_addr(struct sk_buff **pskb, unsigned int addroff, __be32 ip, __be16 port) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ip_conntrack_get(*pskb, &ctinfo); + struct nf_conn *ct = nf_ct_get(*pskb, &ctinfo); struct { __be32 ip; __be16 port; @@ -44,7 +44,7 @@ static int set_addr(struct sk_buff **pskb, buf.port = port; addroff += dataoff; - if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) { + if (ip_hdr(*pskb)->protocol == IPPROTO_TCP) { if (!nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { @@ -55,11 +55,11 @@ static int set_addr(struct sk_buff **pskb, } /* Relocate data pointer */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, + th = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_tcph), &_tcph); if (th == NULL) return -1; - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + + *data = (*pskb)->data + ip_hdrlen(*pskb) + th->doff * 4 + dataoff; } else { if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo, @@ -73,8 +73,8 @@ static int set_addr(struct sk_buff **pskb, /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + - sizeof(struct udphdr); + *data = ((*pskb)->data + ip_hdrlen(*pskb) + + sizeof(struct udphdr)); } return 0; @@ -383,7 +383,7 @@ static int nat_h245(struct sk_buff **pskb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct ip_nat_range range; + struct nf_nat_range range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 49a90c39ffc..15b6e5ce3a0 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -87,12 +87,13 @@ static void mangle_contents(struct sk_buff *skb, unsigned char *data; BUG_ON(skb_is_nonlinear(skb)); - data = (unsigned char *)skb->nh.iph + dataoff; + data = skb_network_header(skb) + dataoff; /* move post-replacement */ memmove(data + match_offset + rep_len, data + match_offset + match_len, - skb->tail - (data + match_offset + match_len)); + skb->tail - (skb->network_header + dataoff + + match_offset + match_len)); /* insert data from buffer */ memcpy(data + match_offset, rep_buffer, rep_len); @@ -111,8 +112,8 @@ static void mangle_contents(struct sk_buff *skb, } /* fix IP hdr checksum information */ - skb->nh.iph->tot_len = htons(skb->len); - ip_send_check(skb->nh.iph); + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); } /* Unusual, but possible case. */ @@ -152,6 +153,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, const char *rep_buffer, unsigned int rep_len) { + struct rtable *rt = (struct rtable *)(*pskb)->dst; struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -166,7 +168,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, SKB_LINEAR_ASSERT(*pskb); - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); tcph = (void *)iph + iph->ihl*4; oldlen = (*pskb)->len - iph->ihl*4; @@ -175,11 +177,22 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, datalen = (*pskb)->len - iph->ihl*4; if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - tcph->check = 0; - tcph->check = tcp_v4_check(datalen, - iph->saddr, iph->daddr, - csum_partial((char *)tcph, - datalen, 0)); + if (!(rt->rt_flags & RTCF_LOCAL) && + (*pskb)->dev->features & NETIF_F_ALL_CSUM) { + (*pskb)->ip_summed = CHECKSUM_PARTIAL; + (*pskb)->csum_start = skb_headroom(*pskb) + + skb_network_offset(*pskb) + + iph->ihl * 4; + (*pskb)->csum_offset = offsetof(struct tcphdr, check); + tcph->check = ~tcp_v4_check(datalen, + iph->saddr, iph->daddr, 0); + } else { + tcph->check = 0; + tcph->check = tcp_v4_check(datalen, + iph->saddr, iph->daddr, + csum_partial((char *)tcph, + datalen, 0)); + } } else nf_proto_csum_replace2(&tcph->check, *pskb, htons(oldlen), htons(datalen), 1); @@ -190,7 +203,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, (int)rep_len - (int)match_len, ct, ctinfo); /* Tell TCP window tracking about seq change */ - nf_conntrack_tcp_update(*pskb, (*pskb)->nh.iph->ihl*4, + nf_conntrack_tcp_update(*pskb, ip_hdrlen(*pskb), ct, CTINFO2DIR(ctinfo)); } return 1; @@ -216,12 +229,13 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb, const char *rep_buffer, unsigned int rep_len) { + struct rtable *rt = (struct rtable *)(*pskb)->dst; struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; /* UDP helpers might accidentally mangle the wrong packet */ - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + match_offset + match_len) return 0; @@ -234,7 +248,7 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb, !enlarge_skb(pskb, rep_len - match_len)) return 0; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); udph = (void *)iph + iph->ihl*4; oldlen = (*pskb)->len - iph->ihl*4; @@ -250,13 +264,25 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb, return 1; if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - udph->check = 0; - udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - csum_partial((char *)udph, - datalen, 0)); - if (!udph->check) - udph->check = CSUM_MANGLED_0; + if (!(rt->rt_flags & RTCF_LOCAL) && + (*pskb)->dev->features & NETIF_F_ALL_CSUM) { + (*pskb)->ip_summed = CHECKSUM_PARTIAL; + (*pskb)->csum_start = skb_headroom(*pskb) + + skb_network_offset(*pskb) + + iph->ihl * 4; + (*pskb)->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + 0); + } else { + udph->check = 0; + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + csum_partial((char *)udph, + datalen, 0)); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } } else nf_proto_csum_replace2(&udph->check, *pskb, htons(oldlen), htons(datalen), 1); @@ -318,8 +344,8 @@ nf_nat_sack_adjust(struct sk_buff **pskb, unsigned int dir, optoff, optend; struct nf_conn_nat *nat = nfct_nat(ct); - optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); - optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; + optoff = ip_hdrlen(*pskb) + sizeof(struct tcphdr); + optend = ip_hdrlen(*pskb) + tcph->doff * 4; if (!skb_make_writable(pskb, optend)) return 0; @@ -371,10 +397,10 @@ nf_nat_seq_adjust(struct sk_buff **pskb, this_way = &nat->info.seq[dir]; other_way = &nat->info.seq[!dir]; - if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph))) return 0; - tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + tcph = (void *)(*pskb)->data + ip_hdrlen(*pskb); if (after(ntohl(tcph->seq), this_way->correction_pos)) newseq = htonl(ntohl(tcph->seq) + this_way->offset_after); else @@ -399,7 +425,7 @@ nf_nat_seq_adjust(struct sk_buff **pskb, if (!nf_nat_sack_adjust(pskb, tcph, ct, ctinfo)) return 0; - nf_conntrack_tcp_update(*pskb, (*pskb)->nh.iph->ihl*4, ct, dir); + nf_conntrack_tcp_update(*pskb, ip_hdrlen(*pskb), ct, dir); return 1; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 7ba341c22ea..a66888749ce 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -53,7 +53,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_tuple t; struct nf_ct_pptp_master *ct_pptp_info; struct nf_nat_pptp *nat_pptp_info; - struct ip_nat_range range; + struct nf_nat_range range; ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 147a4370cf0..2a283397a8b 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -191,7 +191,7 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb, if (hooknum == NF_IP_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle((*pskb)->nh.iph->daddr, + warn_if_extra_mangle(ip_hdr(*pskb)->daddr, mr->range[0].min_ip); return nf_nat_setup_info(ct, &mr->range[0], hooknum); diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index b12cd7c314c..bfd88e4e068 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <net/ip.h> #include <linux/udp.h> #include <net/netfilter/nf_nat.h> @@ -92,7 +93,7 @@ static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo, matchoff, matchlen, addr, addrlen)) return 0; - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + *dptr = (*pskb)->data + ip_hdrlen(*pskb) + sizeof(struct udphdr); return 1; } @@ -106,7 +107,7 @@ static unsigned int ip_nat_sip(struct sk_buff **pskb, struct addr_map map; int dataoff, datalen; - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); datalen = (*pskb)->len - dataoff; if (datalen < sizeof("SIP/2.0") - 1) return NF_DROP; @@ -155,7 +156,7 @@ static unsigned int mangle_sip_packet(struct sk_buff **pskb, return 0; /* We need to reload this. Thanks Patrick. */ - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + *dptr = (*pskb)->data + ip_hdrlen(*pskb) + sizeof(struct udphdr); return 1; } @@ -168,7 +169,7 @@ static int mangle_content_len(struct sk_buff **pskb, char buffer[sizeof("65536")]; int bufflen; - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); /* Get actual SDP lenght */ if (ct_sip_get_info(ct, dptr, (*pskb)->len - dataoff, &matchoff, @@ -200,7 +201,7 @@ static unsigned int mangle_sdp(struct sk_buff **pskb, char buffer[sizeof("nnn.nnn.nnn.nnn")]; unsigned int dataoff, bufflen; - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); /* Mangle owner and contact info. */ bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ce5c4939a6e..6e88505d616 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -38,10 +38,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: James Morris <jmorris@intercode.com.au> - * - * Updates: - * 2000-08-06: Convert to new helper API (Harald Welte). - * */ #include <linux/module.h> #include <linux/moduleparam.h> @@ -1194,7 +1190,7 @@ static int snmp_translate(struct nf_conn *ct, enum ip_conntrack_info ctinfo, struct sk_buff **pskb) { - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); u_int16_t udplen = ntohs(udph->len); u_int16_t paylen = udplen - sizeof(struct udphdr); @@ -1235,7 +1231,7 @@ static int help(struct sk_buff **pskb, unsigned int protoff, { int dir = CTINFO2DIR(ctinfo); unsigned int ret; - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); /* SNMP replies and originating SNMP traps get mangled */ diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 15aa3db8cb3..64bbed2ba78 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -86,8 +86,7 @@ nf_nat_fn(unsigned int hooknum, /* We never see fragments: conntrack defrags on pre-routing and local-out, and nf_nat_out protects post-routing. */ - NF_CT_ASSERT(!((*pskb)->nh.iph->frag_off - & htons(IP_MF|IP_OFFSET))); + NF_CT_ASSERT(!(ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET))); ct = nf_ct_get(*pskb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would @@ -98,11 +97,10 @@ nf_nat_fn(unsigned int hooknum, /* Exception: ICMP redirect to new connection (not in hash table yet). We must not let this through, in case we're doing NAT to the same network. */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP) { struct icmphdr _hdr, *hp; - hp = skb_header_pointer(*pskb, - (*pskb)->nh.iph->ihl*4, + hp = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_hdr), &_hdr); if (hp != NULL && hp->type == ICMP_REDIRECT) @@ -122,7 +120,7 @@ nf_nat_fn(unsigned int hooknum, switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(ct, ctinfo, hooknum, pskb)) return NF_DROP; @@ -177,11 +175,11 @@ nf_nat_in(unsigned int hooknum, int (*okfn)(struct sk_buff *)) { unsigned int ret; - __be32 daddr = (*pskb)->nh.iph->daddr; + __be32 daddr = ip_hdr(*pskb)->daddr; ret = nf_nat_fn(hooknum, pskb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && - daddr != (*pskb)->nh.iph->daddr) { + daddr != ip_hdr(*pskb)->daddr) { dst_release((*pskb)->dst); (*pskb)->dst = NULL; } @@ -203,7 +201,7 @@ nf_nat_out(unsigned int hooknum, /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) || - (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + ip_hdrlen(*pskb) < sizeof(struct iphdr)) return NF_ACCEPT; ret = nf_nat_fn(hooknum, pskb, in, out, okfn); @@ -236,7 +234,7 @@ nf_nat_local_fn(unsigned int hooknum, /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) || - (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + ip_hdrlen(*pskb) < sizeof(struct iphdr)) return NF_ACCEPT; ret = nf_nat_fn(hooknum, pskb, in, out, okfn); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ae68a691e8c..37ab5802ca0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -87,19 +87,6 @@ static const struct file_operations sockstat_seq_fops = { .release = single_release, }; -static unsigned long -fold_field(void *mib[], int offt) -{ - unsigned long res = 0; - int i; - - for_each_possible_cpu(i) { - res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); - res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); - } - return res; -} - /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), @@ -266,8 +253,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) ip_statistics, - snmp4_ipstats_list[i].entry)); + snmp_fold_field((void **)ip_statistics, + snmp4_ipstats_list[i].entry)); seq_puts(seq, "\nIcmp:"); for (i = 0; snmp4_icmp_list[i].name != NULL; i++) @@ -276,8 +263,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIcmp:"); for (i = 0; snmp4_icmp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) icmp_statistics, - snmp4_icmp_list[i].entry)); + snmp_fold_field((void **)icmp_statistics, + snmp4_icmp_list[i].entry)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name != NULL; i++) @@ -288,12 +275,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - fold_field((void **) tcp_statistics, - snmp4_tcp_list[i].entry)); + snmp_fold_field((void **)tcp_statistics, + snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - fold_field((void **) tcp_statistics, - snmp4_tcp_list[i].entry)); + snmp_fold_field((void **)tcp_statistics, + snmp4_tcp_list[i].entry)); } seq_puts(seq, "\nUdp:"); @@ -303,8 +290,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) udp_statistics, - snmp4_udp_list[i].entry)); + snmp_fold_field((void **)udp_statistics, + snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); @@ -314,8 +301,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) udplite_statistics, - snmp4_udp_list[i].entry) ); + snmp_fold_field((void **)udplite_statistics, + snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); return 0; @@ -348,8 +335,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) net_statistics, - snmp4_net_list[i].entry)); + snmp_fold_field((void **)net_statistics, + snmp4_net_list[i].entry)); seq_putc(seq, '\n'); return 0; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index da70fef82c9..971ab9356e5 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -45,7 +45,7 @@ #include <net/ipip.h> #include <linux/igmp.h> -struct net_protocol *inet_protos[MAX_INET_PROTOS]; +struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; static DEFINE_SPINLOCK(inet_proto_lock); /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 87e9c161810..24d7c9f3191 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -132,7 +132,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct icmphdr))) return 1; - type = skb->h.icmph->type; + type = icmp_hdr(skb)->type; if (type < 32) { __u32 data = raw_sk(sk)->filter.data; @@ -184,8 +184,8 @@ out: void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int err = 0; int harderr = 0; @@ -256,7 +256,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) } nf_reset(skb); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, skb->data - skb_network_header(skb)); raw_rcv_skb(sk, skb); return 0; @@ -291,11 +291,13 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; - skb->h.raw = skb->nh.raw; + skb->transport_header = skb->network_header; err = memcpy_fromiovecend((void *)iph, from, 0, length); if (err) goto error_fault; @@ -613,7 +615,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } @@ -887,7 +889,7 @@ static int raw_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations raw_seq_ops = { +static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 37e0d4d5cf9..cb76e3c725a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -82,7 +82,6 @@ #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> @@ -104,6 +103,7 @@ #include <net/xfrm.h> #include <net/ip_mp_alg.h> #include <net/netevent.h> +#include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -364,7 +364,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations rt_cache_seq_ops = { +static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, @@ -470,7 +470,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations rt_cpu_seq_ops = { +static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, @@ -1519,7 +1519,7 @@ static void ipv4_link_failure(struct sk_buff *skb) static int ip_rt_bug(struct sk_buff *skb) { printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", - NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), + NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), skb->dev ? skb->dev->name : "?"); kfree_skb(skb); return 0; @@ -1698,9 +1698,9 @@ static void ip_handle_martian_source(struct net_device *dev, printk(KERN_WARNING "martian source %u.%u.%u.%u from " "%u.%u.%u.%u, on dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); - if (dev->hard_header_len && skb->mac.raw) { + if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; - unsigned char *p = skb->mac.raw; + const unsigned char *p = skb_mac_header(skb); printk(KERN_WARNING "ll header: "); for (i = 0; i < dev->hard_header_len; i++, p++) { printk("%02x", *p); @@ -2134,7 +2134,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, - skb->nh.iph->protocol); + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) @@ -2396,7 +2396,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(oldflp->fl4_src); - if (dev_out == NULL) + if ((dev_out == NULL) && !(sysctl_ip_nonlocal_bind)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2407,7 +2407,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) of another iface. --ANK */ - if (oldflp->oif == 0 + if (dev_out && oldflp->oif == 0 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface @@ -2683,7 +2683,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, id = rt->peer->ip_id_count; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; - tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; + tsage = get_seconds() - rt->peer->tcp_ts_stamp; } } @@ -2721,7 +2721,7 @@ nla_put_failure: return -EMSGSIZE; } -int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; @@ -2747,10 +2747,11 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) /* Reserve room for dummy headers, this skb can pass through good chunk of routing engine. */ - skb->mac.raw = skb->nh.raw = skb->data; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ - skb->nh.iph->protocol = IPPROTO_ICMP; + ip_hdr(skb)->protocol = IPPROTO_ICMP; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; @@ -3193,6 +3194,8 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + return rc; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 33016cc90f0..2da1be0589a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -125,10 +125,11 @@ static __u16 const msstab[] = { __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) { struct tcp_sock *tp = tcp_sk(sk); + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tp->last_synq_overflow = jiffies; /* XXX sort msstab[] by probability? Binary search? */ @@ -138,9 +139,8 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr, - skb->h.th->source, skb->h.th->dest, - ntohl(skb->h.th->seq), + return secure_tcp_syn_cookie(iph->saddr, iph->daddr, + th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } @@ -157,14 +157,13 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) */ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) { - __u32 seq; - __u32 mssind; - - seq = ntohl(skb->h.th->seq)-1; - mssind = check_tcp_syn_cookie(cookie, - skb->nh.iph->saddr, skb->nh.iph->daddr, - skb->h.th->source, skb->h.th->dest, - seq, jiffies / (HZ * 60), COUNTER_TRIES); + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), + COUNTER_TRIES); return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; } @@ -191,14 +190,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); - __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; + const struct tcphdr *th = tcp_hdr(skb); + __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; int mss; struct rtable *rt; __u8 rcv_wscale; - if (!sysctl_tcp_syncookies || !skb->h.th->ack) + if (!sysctl_tcp_syncookies || !th->ack) goto out; if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || @@ -220,12 +220,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } ireq = inet_rsk(req); treq = tcp_rsk(req); - treq->rcv_isn = ntohl(skb->h.th->seq) - 1; + treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; - ireq->rmt_port = skb->h.th->source; - ireq->loc_addr = skb->nh.iph->daddr; - ireq->rmt_addr = skb->nh.iph->saddr; + ireq->rmt_port = th->source; + ireq->loc_addr = ip_hdr(skb)->daddr; + ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->opt = NULL; /* We throwed the options of the initial SYN away, so we hope @@ -261,8 +261,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, .tos = RT_CONN_FLAGS(sk) } }, .proto = IPPROTO_TCP, .uli_u = { .ports = - { .sport = skb->h.th->dest, - .dport = skb->h.th->source } } }; + { .sport = th->dest, + .dport = th->source } } }; security_req_classify_flow(req, &fl); if (ip_route_output_key(&rt, &fl)) { reqsk_free(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0aa304711a9..6817d6485df 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -647,6 +647,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_TCP_FRTO_RESPONSE, + .procname = "tcp_frto_response", + .data = &sysctl_tcp_frto_response, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_TCP_LOW_LATENCY, .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -803,6 +811,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_allowed_congestion_control, .strategy = &strategy_allowed_congestion_control, }, + { + .ctl_name = NET_TCP_MAX_SSTHRESH, + .procname = "tcp_max_ssthresh", + .data = &sysctl_tcp_max_ssthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3834b10b511..d6e48866817 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -297,7 +297,7 @@ EXPORT_SYMBOL(tcp_sockets_allocated); * All the sk_stream_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ -int tcp_memory_pressure; +int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); @@ -425,7 +425,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* Subtract 1, if FIN is in queue. */ if (answ && !skb_queue_empty(&sk->sk_receive_queue)) answ -= - ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin; + tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; } else answ = tp->urg_seq - tp->copied_seq; release_sock(sk); @@ -444,7 +444,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) break; default: return -ENOIOCTLCMD; - }; + } return put_user(answ, (int __user *)arg); } @@ -460,9 +460,9 @@ static inline int forced_push(struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static inline void skb_entail(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); skb->csum = 0; @@ -470,10 +470,8 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, tcb->flags = TCPCB_FLAG_ACK; tcb->sacked = 0; skb_header_release(skb); - __skb_queue_tail(&sk->sk_write_queue, skb); + tcp_add_write_queue_tail(sk, skb); sk_charge_skb(sk, skb); - if (!sk->sk_send_head) - sk->sk_send_head = skb; if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } @@ -488,15 +486,17 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, } } -static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags, - int mss_now, int nonagle) +static inline void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle) { - if (sk->sk_send_head) { - struct sk_buff *skb = sk->sk_write_queue.prev; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_send_head(sk)) { + struct sk_buff *skb = tcp_write_queue_tail(sk); if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags, skb); - __tcp_push_pending_frames(sk, tp, mss_now, + __tcp_push_pending_frames(sk, mss_now, (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); } } @@ -526,13 +526,13 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse goto do_error; while (psize > 0) { - struct sk_buff *skb = sk->sk_write_queue.prev; + struct sk_buff *skb = tcp_write_queue_tail(sk); struct page *page = pages[poffset / PAGE_SIZE]; int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -542,7 +542,7 @@ new_segment: if (!skb) goto wait_for_memory; - skb_entail(sk, tp, skb); + skb_entail(sk, skb); copy = size_goal; } @@ -588,8 +588,8 @@ new_segment: if (forced_push(tp)) { tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); - } else if (skb == sk->sk_send_head) + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; @@ -597,7 +597,7 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -608,7 +608,7 @@ wait_for_memory: out: if (copied) - tcp_push(sk, tp, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle); return copied; do_error: @@ -639,8 +639,9 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int select_size(struct sock *sk, struct tcp_sock *tp) +static inline int select_size(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { @@ -704,9 +705,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, while (seglen > 0) { int copy; - skb = sk->sk_write_queue.prev; + skb = tcp_write_queue_tail(sk); - if (!sk->sk_send_head || + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: @@ -716,7 +717,7 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), + skb = sk_stream_alloc_pskb(sk, select_size(sk), 0, sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -727,7 +728,7 @@ new_segment: if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_PARTIAL; - skb_entail(sk, tp, skb); + skb_entail(sk, skb); copy = size_goal; } @@ -832,8 +833,8 @@ new_segment: if (forced_push(tp)) { tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); - } else if (skb == sk->sk_send_head) + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; @@ -841,7 +842,7 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -853,16 +854,18 @@ wait_for_memory: out: if (copied) - tcp_push(sk, tp, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle); TCP_CHECK_TIMER(sk); release_sock(sk); return copied; do_fault: if (!skb->len) { - if (sk->sk_send_head == skb) - sk->sk_send_head = NULL; - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); + /* It is the one place in all of TCP, except connection + * reset, where we can be unlinking the send_head. + */ + tcp_check_send_head(sk, skb); sk_stream_free_skb(sk, skb); } @@ -1016,9 +1019,9 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) skb_queue_walk(&sk->sk_receive_queue, skb) { offset = seq - TCP_SKB_CB(skb)->seq; - if (skb->h.th->syn) + if (tcp_hdr(skb)->syn) offset--; - if (offset < skb->len || skb->h.th->fin) { + if (offset < skb->len || tcp_hdr(skb)->fin) { *off = offset; return skb; } @@ -1070,7 +1073,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset != skb->len) break; } - if (skb->h.th->fin) { + if (tcp_hdr(skb)->fin) { sk_eat_skb(sk, skb, 0); ++seq; break; @@ -1174,11 +1177,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; } offset = *seq - TCP_SKB_CB(skb)->seq; - if (skb->h.th->syn) + if (tcp_hdr(skb)->syn) offset--; if (offset < skb->len) goto found_ok_skb; - if (skb->h.th->fin) + if (tcp_hdr(skb)->fin) goto found_fin_ok; BUG_TRAP(flags & MSG_PEEK); skb = skb->next; @@ -1389,12 +1392,12 @@ do_prequeue: skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { tp->urg_data = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); } if (used + offset < skb->len) continue; - if (skb->h.th->fin) + if (tcp_hdr(skb)->fin) goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); @@ -1563,21 +1566,19 @@ void tcp_close(struct sock *sk, long timeout) */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - skb->h.th->fin; + tcp_hdr(skb)->fin; data_was_unread += len; __kfree_skb(skb); } sk_stream_mem_reclaim(sk); - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ @@ -1732,7 +1733,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); - sk_stream_writequeue_purge(sk); + tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); #ifdef CONFIG_NET_DMA __skb_queue_purge(&sk->sk_async_wait_queue); @@ -1758,7 +1759,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); - sk->sk_send_head = NULL; + tcp_init_send_head(sk); tp->rx_opt.saw_tstamp = 0; tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); @@ -1830,7 +1831,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, * for currently queued segments. */ tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); } else { tp->nonagle &= ~TCP_NAGLE_OFF; } @@ -1854,7 +1855,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle&TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); } break; @@ -1954,7 +1955,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, default: err = -ENOPROTOOPT; break; - }; + } + release_sock(sk); return err; } @@ -2124,7 +2126,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; default: return -ENOPROTOOPT; - }; + } if (put_user(len, optlen)) return -EFAULT; @@ -2170,7 +2172,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) if (!pskb_may_pull(skb, sizeof(*th))) goto out; - th = skb->h.th; + th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; @@ -2210,7 +2212,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) delta = htonl(oldlen + (thlen + len)); skb = segs; - th = skb->h.th; + th = tcp_hdr(skb); seq = ntohl(th->seq); do { @@ -2219,23 +2221,25 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); + th->check = + csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); seq += len; skb = skb->next; - th = skb->h.th; + th = tcp_hdr(skb); th->seq = htonl(seq); th->cwr = 0; } while (skb->next); - delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len); + delta = htonl(oldlen + (skb->tail - skb->transport_header) + + skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); + th->check = csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); out: return segs; @@ -2372,6 +2376,23 @@ void __tcp_put_md5sig_pool(void) EXPORT_SYMBOL(__tcp_put_md5sig_pool); #endif +void tcp_done(struct sock *sk) +{ + if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + + tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_done); + extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5730333cd0a..281c9f91325 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -206,7 +206,7 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 5c8caf4a124..86b26539e54 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -12,6 +12,8 @@ #include <linux/list.h> #include <net/tcp.h> +int sysctl_tcp_max_ssthresh = 0; + static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); @@ -77,18 +79,19 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) - return; + /* if no choice made yet assign the current value set as default */ + if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + icsk->icsk_ca_ops = ca; + break; + } - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - icsk->icsk_ca_ops = ca; - break; + /* fallback to next available */ } - + rcu_read_unlock(); } - rcu_read_unlock(); if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -123,7 +126,7 @@ int tcp_set_default_congestion_control(const char *name) #endif if (ca) { - ca->non_restricted = 1; /* default is always allowed */ + ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ list_move(&ca->list, &tcp_cong_list); ret = 0; } @@ -178,7 +181,7 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (!ca->non_restricted) + if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", @@ -209,16 +212,16 @@ int tcp_set_allowed_congestion_control(char *val) } } - /* pass 2 clear */ + /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) - ca->non_restricted = 0; + ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) - ca->non_restricted = 1; + ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); @@ -236,6 +239,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) rcu_read_lock(); ca = tcp_ca_find(name); + /* no change asking for existing value */ if (ca == icsk->icsk_ca_ops) goto out; @@ -252,7 +256,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (!ca) err = -ENOENT; - else if (!(ca->non_restricted || capable(CAP_NET_ADMIN))) + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) err = -EPERM; else if (!try_module_get(ca->owner)) @@ -261,7 +265,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) else { tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; - if (icsk->icsk_ca_ops->init) + + if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); } out: @@ -271,10 +276,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) /* - * Linear increase during slow start + * Slow start (exponential increase) with + * RFC3742 Limited Slow Start (fast linear increase) support. */ void tcp_slow_start(struct tcp_sock *tp) { + int cnt = 0; + if (sysctl_tcp_abc) { /* RFC3465: Slow Start * TCP sender SHOULD increase cwnd by the number of @@ -283,17 +291,25 @@ void tcp_slow_start(struct tcp_sock *tp) */ if (tp->bytes_acked < tp->mss_cache) return; - - /* We MAY increase by 2 if discovered delayed ack */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } } + + if (sysctl_tcp_max_ssthresh > 0 && + tp->snd_cwnd > sysctl_tcp_max_ssthresh) + cnt += sysctl_tcp_max_ssthresh>>1; + else + cnt += tp->snd_cwnd; + + /* RFC3465: We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) + cnt <<= 1; tp->bytes_acked = 0; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; + tp->snd_cwnd_cnt += cnt; + while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd_cnt -= tp->snd_cwnd; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } } EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -355,8 +371,8 @@ u32 tcp_reno_min_cwnd(const struct sock *sk) EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); struct tcp_congestion_ops tcp_reno = { + .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", - .non_restricted = 1, .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 9a582fb4ef9..14224487b16 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -1,5 +1,5 @@ /* - * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * TCP CUBIC: Binary Increase Congestion control for TCP v2.1 * * This is from the implementation of CUBIC TCP in * Injong Rhee, Lisong Xu. @@ -51,8 +51,6 @@ MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_ module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); -#include <asm/div64.h> - /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ @@ -93,50 +91,51 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* 64bit divisor, dividend and result. dynamic precision */ -static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) -{ - u_int32_t d = divisor; - - if (divisor > 0xffffffffULL) { - unsigned int shift = fls(divisor >> 32); - - d = divisor >> shift; - dividend >>= shift; - } - - /* avoid 64 bit division if possible */ - if (dividend >> 32) - do_div(dividend, d); - else - dividend = (uint32_t) dividend / d; - - return dividend; -} - -/* - * calculate the cubic root of x using Newton-Raphson +/* calculate the cubic root of x using a table lookup followed by one + * Newton-Raphson iteration. + * Avg err ~= 0.195% */ static u32 cubic_root(u64 a) { - u32 x, x1; - - /* Initial estimate is based on: - * cbrt(x) = exp(log(x) / 3) + u32 x, b, shift; + /* + * cbrt(x) MSB values for x MSB values in [0..63]. + * Precomputed then refined by hand - Willy Tarreau + * + * For x in [0..63], + * v = cbrt(x << 18) - 1 + * cbrt(x) = (v[x] + 10) >> 6 */ - x = 1u << (fls64(a)/3); + static const u8 v[] = { + /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, + /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, + /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, + /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, + /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, + /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, + /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, + /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, + }; + + b = fls64(a); + if (b < 7) { + /* a in [0..63] */ + return ((u32)v[(u32)a] + 35) >> 6; + } + + b = ((b * 84) >> 8) - 1; + shift = (a >> (b * 3)); + + x = ((u32)(((u32)v[shift] + 10) << b)) >> 6; /* - * Iteration based on: + * Newton-Raphson iteration * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ - do { - x1 = x; - x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; - } while (abs(x1 - x) > 1); - + x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1))); + x = ((x * 341) >> 10); return x; } @@ -215,7 +214,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) if (ca->delay_min > 0) { /* max increment = Smax * rtt / 0.1 */ min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); - if (ca->cnt < min_cnt) + + /* use concave growth when the target is above the origin */ + if (ca->cnt < min_cnt && t >= ca->bic_K) ca->cnt = min_cnt; } @@ -333,7 +334,7 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -401,4 +402,4 @@ module_exit(cubictcp_unregister); MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CUBIC TCP"); -MODULE_VERSION("2.0"); +MODULE_VERSION("2.1"); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1020eb48d8d..4ba4a7ae0a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -98,7 +98,7 @@ static inline void measure_rtt(struct sock *sk) } } -static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 59e691d26f6..e5be3511722 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -144,7 +144,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->snd_cwnd_cents += odd; /* check when fractions goes >=128 and increase cwnd by 1. */ - while(ca->snd_cwnd_cents >= 128) { + while (ca->snd_cwnd_cents >= 128) { tp->snd_cwnd++; ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c new file mode 100644 index 00000000000..4adc47c5535 --- /dev/null +++ b/net/ipv4/tcp_illinois.c @@ -0,0 +1,356 @@ +/* + * TCP Illinois congestion control. + * Home page: + * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + * + * The algorithm is described in: + * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm + * for High-Speed Networks" + * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf + * + * Implemented from description in paper and ns-2 simulation. + * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> +#include <asm/div64.h> +#include <net/tcp.h> + +#define ALPHA_SHIFT 7 +#define ALPHA_SCALE (1u<<ALPHA_SHIFT) +#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ +#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ +#define ALPHA_BASE ALPHA_SCALE /* 1.0 */ +#define U32_MAX ((u32)~0U) +#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ + +#define BETA_SHIFT 6 +#define BETA_SCALE (1u<<BETA_SHIFT) +#define BETA_MIN (BETA_SCALE/8) /* 0.125 */ +#define BETA_MAX (BETA_SCALE/2) /* 0.5 */ +#define BETA_BASE BETA_MAX + +static int win_thresh __read_mostly = 15; +module_param(win_thresh, int, 0); +MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing"); + +static int theta __read_mostly = 5; +module_param(theta, int, 0); +MODULE_PARM_DESC(theta, "# of fast RTT's before full growth"); + +/* TCP Illinois Parameters */ +struct illinois { + u64 sum_rtt; /* sum of rtt's measured within last rtt */ + u16 cnt_rtt; /* # of rtts measured within last rtt */ + u32 base_rtt; /* min of all rtt in usec */ + u32 max_rtt; /* max of all rtt in usec */ + u32 end_seq; /* right edge of current RTT */ + u32 alpha; /* Additive increase */ + u32 beta; /* Muliplicative decrease */ + u16 acked; /* # packets acked by current ACK */ + u8 rtt_above; /* average rtt has gone above threshold */ + u8 rtt_low; /* # of rtts measurements below threshold */ +}; + +static void rtt_reset(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + ca->end_seq = tp->snd_nxt; + ca->cnt_rtt = 0; + ca->sum_rtt = 0; + + /* TODO: age max_rtt? */ +} + +static void tcp_illinois_init(struct sock *sk) +{ + struct illinois *ca = inet_csk_ca(sk); + + ca->alpha = ALPHA_MAX; + ca->beta = BETA_BASE; + ca->base_rtt = 0x7fffffff; + ca->max_rtt = 0; + + ca->acked = 0; + ca->rtt_low = 0; + ca->rtt_above = 0; + + rtt_reset(sk); +} + +/* Measure RTT for each ack. */ +static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) +{ + struct illinois *ca = inet_csk_ca(sk); + u32 rtt; + + ca->acked = pkts_acked; + + rtt = ktime_to_us(net_timedelta(last)); + + /* ignore bogus values, this prevents wraparound in alpha math */ + if (rtt > RTT_MAX) + rtt = RTT_MAX; + + /* keep track of minimum RTT seen so far */ + if (ca->base_rtt > rtt) + ca->base_rtt = rtt; + + /* and max */ + if (ca->max_rtt < rtt) + ca->max_rtt = rtt; + + ++ca->cnt_rtt; + ca->sum_rtt += rtt; +} + +/* Maximum queuing delay */ +static inline u32 max_delay(const struct illinois *ca) +{ + return ca->max_rtt - ca->base_rtt; +} + +/* Average queuing delay */ +static inline u32 avg_delay(const struct illinois *ca) +{ + u64 t = ca->sum_rtt; + + do_div(t, ca->cnt_rtt); + return t - ca->base_rtt; +} + +/* + * Compute value of alpha used for additive increase. + * If small window then use 1.0, equivalent to Reno. + * + * For larger windows, adjust based on average delay. + * A. If average delay is at minimum (we are uncongested), + * then use large alpha (10.0) to increase faster. + * B. If average delay is at maximum (getting congested) + * then use small alpha (0.3) + * + * The result is a convex window growth curve. + */ +static u32 alpha(struct illinois *ca, u32 da, u32 dm) +{ + u32 d1 = dm / 100; /* Low threshold */ + + if (da <= d1) { + /* If never got out of low delay zone, then use max */ + if (!ca->rtt_above) + return ALPHA_MAX; + + /* Wait for 5 good RTT's before allowing alpha to go alpha max. + * This prevents one good RTT from causing sudden window increase. + */ + if (++ca->rtt_low < theta) + return ca->alpha; + + ca->rtt_low = 0; + ca->rtt_above = 0; + return ALPHA_MAX; + } + + ca->rtt_above = 1; + + /* + * Based on: + * + * (dm - d1) amin amax + * k1 = ------------------- + * amax - amin + * + * (dm - d1) amin + * k2 = ---------------- - d1 + * amax - amin + * + * k1 + * alpha = ---------- + * k2 + da + */ + + dm -= d1; + da -= d1; + return (dm * ALPHA_MAX) / + (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN); +} + +/* + * Beta used for multiplicative decrease. + * For small window sizes returns same value as Reno (0.5) + * + * If delay is small (10% of max) then beta = 1/8 + * If delay is up to 80% of max then beta = 1/2 + * In between is a linear function + */ +static u32 beta(u32 da, u32 dm) +{ + u32 d2, d3; + + d2 = dm / 10; + if (da <= d2) + return BETA_MIN; + + d3 = (8 * dm) / 10; + if (da >= d3 || d3 <= d2) + return BETA_MAX; + + /* + * Based on: + * + * bmin d3 - bmax d2 + * k3 = ------------------- + * d3 - d2 + * + * bmax - bmin + * k4 = ------------- + * d3 - d2 + * + * b = k3 + k4 da + */ + return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da) + / (d3 - d2); +} + +/* Update alpha and beta values once per RTT */ +static void update_params(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + if (tp->snd_cwnd < win_thresh) { + ca->alpha = ALPHA_BASE; + ca->beta = BETA_BASE; + } else if (ca->cnt_rtt > 0) { + u32 dm = max_delay(ca); + u32 da = avg_delay(ca); + + ca->alpha = alpha(ca, da, dm); + ca->beta = beta(da, dm); + } + + rtt_reset(sk); +} + +/* + * In case of loss, reset to default values + */ +static void tcp_illinois_state(struct sock *sk, u8 new_state) +{ + struct illinois *ca = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + ca->alpha = ALPHA_BASE; + ca->beta = BETA_BASE; + ca->rtt_low = 0; + ca->rtt_above = 0; + rtt_reset(sk); + } +} + +/* + * Increase window in response to successful acknowledgment. + */ +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + if (after(ack, ca->end_seq)) + update_params(sk); + + /* RFC2861 only increase cwnd if fully utilized */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* In slow start */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + else { + u32 delta; + + /* snd_cwnd_cnt is # of packets since last cwnd increment */ + tp->snd_cwnd_cnt += ca->acked; + ca->acked = 1; + + /* This is close approximation of: + * tp->snd_cwnd += alpha/tp->snd_cwnd + */ + delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; + if (delta >= tp->snd_cwnd) { + tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, + (u32) tp->snd_cwnd_clamp); + tp->snd_cwnd_cnt = 0; + } + } +} + +static u32 tcp_illinois_ssthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + /* Multiplicative decrease */ + return max((tp->snd_cwnd * ca->beta) >> BETA_SHIFT, 2U); +} + + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_illinois_info(struct sock *sk, u32 ext, + struct sk_buff *skb) +{ + const struct illinois *ca = inet_csk_ca(sk); + + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info info = { + .tcpv_enabled = 1, + .tcpv_rttcnt = ca->cnt_rtt, + .tcpv_minrtt = ca->base_rtt, + }; + u64 t = ca->sum_rtt; + + do_div(t, ca->cnt_rtt); + info.tcpv_rtt = t; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops tcp_illinois = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_illinois_init, + .ssthresh = tcp_illinois_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = tcp_illinois_cong_avoid, + .set_state = tcp_illinois_state, + .get_info = tcp_illinois_info, + .pkts_acked = tcp_illinois_acked, + + .owner = THIS_MODULE, + .name = "illinois", +}; + +static int __init tcp_illinois_register(void) +{ + BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_illinois); +} + +static void __exit tcp_illinois_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_illinois); +} + +module_init(tcp_illinois_register); +module_exit(tcp_illinois_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Shao Liu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Illinois"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a14191687a..7641b2761a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -86,6 +86,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; @@ -100,6 +101,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -110,6 +112,8 @@ int sysctl_tcp_abc __read_mostly; #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define IsSackFrto() (sysctl_tcp_frto == 0x2) + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) /* Adapt the MSS value used to make delayed ack decision to the @@ -136,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * * "len" is invariant segment length, including TCP header. */ - len += skb->data - skb->h.raw; + len += skb->data - skb_transport_header(skb); if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. @@ -144,7 +148,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && - !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. @@ -231,9 +235,9 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, - const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; @@ -248,9 +252,11 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && @@ -263,7 +269,7 @@ static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2*tp->advmss; else - incr = __tcp_grow_window(sk, tp, skb); + incr = __tcp_grow_window(sk, skb); if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); @@ -326,8 +332,9 @@ static void tcp_init_buffer_space(struct sock *sk) } /* 5. Recalculate window clamp after socket hit its memory bounds. */ -static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +static void tcp_clamp_window(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; @@ -499,8 +506,9 @@ new_measure: * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; @@ -541,7 +549,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ TCP_ECN_check_ce(tp, skb); if (skb->len >= 128) - tcp_grow_window(sk, tp, skb); + tcp_grow_window(sk, skb); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -574,7 +582,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if(m == 0) + if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ @@ -759,15 +767,17 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) } /* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk) +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + if (set_ssthresh) + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -934,7 +944,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; @@ -1038,7 +1049,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_skb = tp->fastpath_skb_hint; cached_fack_count = tp->fastpath_cnt_hint; if (!cached_skb) { - cached_skb = sk->sk_write_queue.next; + cached_skb = tcp_write_queue_head(sk); cached_fack_count = 0; } @@ -1055,10 +1066,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + if (skb == tcp_send_head(sk)) + break; + cached_skb = skb; cached_fack_count = fack_count; if (i == first_sack_index) { @@ -1159,6 +1173,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* clear lost hint */ tp->retransmit_skb_hint = NULL; } + /* SACK enhanced F-RTO detection. + * Set flag if and only if non-rexmitted + * segments below frto_highmark are + * SACKed (RFC4138; Appendix B). + * Clearing correct due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) { + flag &= ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked & TCPCB_RETRANS)) + flag |= FLAG_ONLY_ORIG_SACKED; + } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; @@ -1195,7 +1221,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) @@ -1224,7 +1252,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 @@ -1236,9 +1265,49 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* RTO occurred, but do not yet enter loss state. Instead, transmit two new - * segments to see from the next ACKs whether any data was really missing. - * If the RTO was spurious, new ACKs should arrive. +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) + */ +int tcp_use_frto(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!sysctl_tcp_frto) + return 0; + + if (IsSackFrto()) + return 1; + + /* Avoid expensive walking of rexmit queue if possible */ + if (tp->retrans_out > 1) + return 0; + + skb = tcp_write_queue_head(sk); + skb = tcp_write_queue_next(sk, skb); /* Skips head */ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + return 0; + /* Short-circuit when first non-SACKed skb has been checked */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + break; + } + return 1; +} + +/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO + * recovery a bit and use heuristics in tcp_process_frto() to detect if + * the RTO was spurious. Only clear SACKED_RETRANS of the head here to + * keep retrans_out counting accurate (with SACK F-RTO, other than head + * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS + * bits are handled if the Loss state is really to be entered (in + * tcp_enter_frto_loss). + * + * Do like tcp_enter_loss() would; when RTO expires the second time it + * does: + * "Reduce ssthresh if it has not yet been made inside this window." */ void tcp_enter_frto(struct sock *sk) { @@ -1246,39 +1315,69 @@ void tcp_enter_frto(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - tp->frto_counter = 1; - - if (icsk->icsk_ca_state <= TCP_CA_Disorder || + if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || tp->snd_una == tp->high_seq || - (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && + !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + /* Our state is too optimistic in ssthresh() call because cwnd + * is not reduced until tcp_enter_frto_loss() when previous FRTO + * recovery has not yet completed. Pattern would be this: RTO, + * Cumulative ACK, RTO (2xRTO for the same segment does not end + * up here twice). + * RFC4138 should be more specific on what to do, even though + * RTO is quite unlikely to occur after the first Cumulative ACK + * due to back-off and complexity of triggering events ... + */ + if (tp->frto_counter) { + u32 stored_cwnd; + stored_cwnd = tp->snd_cwnd; + tp->snd_cwnd = 2; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = stored_cwnd; + } else { + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + } + /* ... in theory, cong.control module could do "any tricks" in + * ssthresh(), which means that ca_state, lost bits and lost_out + * counter would have to be faked before the call occurs. We + * consider that too expensive, unlikely and hacky, so modules + * using these in ssthresh() must deal these incompatibility + * issues if they receives CA_EVENT_FRTO and frto_counter != 0 + */ tcp_ca_event(sk, CA_EVENT_FRTO); } - /* Have to clear retransmission markers here to keep the bookkeeping - * in shape, even though we are not yet in Loss state. - * If something was really lost, it is eventually caught up - * in tcp_enter_frto_loss. - */ - tp->retrans_out = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - sk_stream_for_retrans_queue(skb, sk) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); } tcp_sync_left_out(tp); - tcp_set_ca_state(sk, TCP_CA_Open); - tp->frto_highmark = tp->snd_nxt; + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in tp->frto_counter case. + */ + if (IsSackFrto() && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } + tcp_set_ca_state(sk, TCP_CA_Disorder); + tp->high_seq = tp->snd_nxt; + tp->frto_counter = 1; } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission. */ -static void tcp_enter_frto_loss(struct sock *sk) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1287,10 +1386,23 @@ static void tcp_enter_frto_loss(struct sock *sk) tp->sacked_out = 0; tp->lost_out = 0; tp->fackets_out = 0; + tp->retrans_out = 0; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + /* + * Count the retransmission made on RTO correctly (only when + * waiting for the first ACK and did not get it)... + */ + if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + tp->retrans_out += tcp_skb_pcount(skb); + /* ...enter this if branch just for the first segment */ + flag |= FLAG_DATA_ACKED; + } else { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { /* Do not mark those segments lost that were @@ -1308,7 +1420,7 @@ static void tcp_enter_frto_loss(struct sock *sk) } tcp_sync_left_out(tp); - tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -1366,7 +1478,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (!how) tp->undo_marker = tp->snd_una; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; @@ -1401,14 +1515,14 @@ static int tcp_check_sack_reneging(struct sock *sk) * receiver _host_ is heavily congested (or buggy). * Do processing similar to RTO timeout. */ - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + if ((skb = tcp_write_queue_head(sk)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; @@ -1426,10 +1540,12 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } -static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +static inline int tcp_head_timedout(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + return tp->packets_out && - tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1525,10 +1641,15 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +static int tcp_time_to_recover(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; + /* Do not perform any recovery during FRTO algorithm */ + if (tp->frto_counter) + return 0; + /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; @@ -1540,7 +1661,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk, tp)) + if (tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -1549,7 +1670,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) packets_out = tp->packets_out; if (packets_out <= tp->reordering && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && - !tcp_may_send_now(sk, tp)) { + !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. */ @@ -1589,8 +1710,10 @@ static void tcp_add_reno_sack(struct sock *sk) /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); + if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ if (acked-1 >= tp->sacked_out) @@ -1609,9 +1732,10 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) } /* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, +static void tcp_mark_head_lost(struct sock *sk, int packets, u32 high_seq) { + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt; @@ -1620,11 +1744,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; } else { - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -1638,12 +1764,11 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* clear xmit_retransmit_queue hints * if this is beyond hint */ - if(tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { - + if (tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; - } + } } tcp_sync_left_out(tp); @@ -1651,15 +1776,17 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +static void tcp_update_scoreboard(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (IsFack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + tcp_mark_head_lost(sk, lost, tp->high_seq); } else { - tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + tcp_mark_head_lost(sk, 1, tp->high_seq); } /* New heuristics: it is possible only after we switched @@ -1667,13 +1794,15 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint - : sk->sk_write_queue.next; + : tcp_write_queue_head(sk); - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (!tcp_skb_timedout(sk, skb)) break; @@ -1745,9 +1874,11 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp) /* Undo procedures. */ #if FASTRETRANS_DEBUG > 1 -static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +static void DBGUNDO(struct sock *sk, const char *msg) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), @@ -1793,13 +1924,15 @@ static inline int tcp_may_undo(struct tcp_sock *tp) } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_recovery(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); @@ -1819,10 +1952,12 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +static void tcp_try_undo_dsack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->undo_marker && !tp->undo_retrans) { - DBGUNDO(sk, tp, "D-SACK"); + DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); @@ -1831,9 +1966,9 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) /* Undo during fast recovery after partial ACK. */ -static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, - int acked) +static int tcp_try_undo_partial(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ int failed = IsReno(tp) || tp->fackets_out>tp->reordering; @@ -1846,7 +1981,7 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - DBGUNDO(sk, tp, "Hoe"); + DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); @@ -1860,17 +1995,21 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, } /* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_loss(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } clear_all_retrans_hints(tp); - DBGUNDO(sk, tp, "partial loss"); + DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); @@ -1892,15 +2031,17 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +static void tcp_try_to_open(struct sock *sk, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + tp->left_out = tp->sacked_out; if (tp->retrans_out == 0) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -1987,7 +2128,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -1997,14 +2138,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - if (!sysctl_tcp_frto) - BUG_TRAP(tp->retrans_out == 0); + BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { case TCP_CA_Loss: icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; break; @@ -2018,7 +2158,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Disorder: - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. */ @@ -2031,7 +2171,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, case TCP_CA_Recovery: if (IsReno(tp)) tcp_reset_reno_sack(tp); - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); break; @@ -2047,14 +2187,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, acked); - is_dupack = tcp_try_undo_partial(sk, tp, acked); + tcp_remove_reno_sacks(sk, acked); + is_dupack = tcp_try_undo_partial(sk, acked); } break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; - if (!tcp_try_undo_loss(sk, tp)) { + if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; @@ -2071,10 +2211,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } if (icsk->icsk_ca_state == TCP_CA_Disorder) - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk, tp)) { - tcp_try_to_open(sk, tp, flag); + if (!tcp_time_to_recover(sk)) { + tcp_try_to_open(sk, flag); return; } @@ -2113,8 +2253,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_set_ca_state(sk, TCP_CA_Recovery); } - if (is_dupack || tcp_head_timedout(sk, tp)) - tcp_update_scoreboard(sk, tp); + if (is_dupack || tcp_head_timedout(sk)) + tcp_update_scoreboard(sk); tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -2190,8 +2330,10 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { @@ -2255,14 +2397,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static u32 tcp_usrtt(struct timeval *tv) -{ - struct timeval now; - - do_gettimeofday(&now); - return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); -} - /* Remove acknowledged frames from the retransmission queue. */ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { @@ -2273,12 +2407,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; __s32 seq_rtt = -1; u32 pkts_acked = 0; - void (*rtt_sample)(struct sock *sk, u32 usrtt) - = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; + ktime_t last_ackt = ktime_set(0,0); - while ((skb = skb_peek(&sk->sk_write_queue)) && - skb != sk->sk_send_head) { + while ((skb = tcp_write_queue_head(sk)) && + skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; @@ -2318,13 +2450,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (sacked) { if (sacked & TCPCB_RETRANS) { - if(sacked & TCPCB_SACKED_RETRANS) + if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2337,23 +2469,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk, tp); - if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) - (*rtt_sample)(sk, tcp_usrtt(&tv)); + tcp_ack_packets_out(sk); - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, last_ackt); } #if FASTRETRANS_DEBUG > 0 @@ -2390,7 +2523,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); @@ -2433,13 +2566,14 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb, u32 ack, u32 ack_seq) +static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, + u32 ack_seq) { + struct tcp_sock *tp = tcp_sk(sk); int flag = 0; - u32 nwin = ntohs(skb->h.th->window); + u32 nwin = ntohs(tcp_hdr(skb)->window); - if (likely(!skb->h.th->syn)) + if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { @@ -2453,7 +2587,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, * fast path is recovered for sending TCP. */ tp->pred_flags = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; @@ -2467,39 +2601,139 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, return flag; } -static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) +/* A very conservative spurious RTO response algorithm: reduce cwnd and + * continue in congestion avoidance. + */ +static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_cnt = 0; + tcp_moderate_cwnd(tp); +} + +/* A conservative spurious RTO response algorithm: reduce cwnd using + * rate halving and continue in congestion avoidance. + */ +static void tcp_ratehalving_spur_to_response(struct sock *sk) +{ + tcp_enter_cwr(sk, 0); +} + +static void tcp_undo_spur_to_response(struct sock *sk, int flag) +{ + if (flag&FLAG_ECE) + tcp_ratehalving_spur_to_response(sk); + else + tcp_undo_cwr(sk, 1); +} + +/* F-RTO spurious RTO detection algorithm (RFC4138) + * + * F-RTO affects during two new ACKs following RTO (well, almost, see inline + * comments). State (ACK number) is kept in frto_counter. When ACK advances + * window (but not to or beyond highest sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response + * algorithm is not part of the F-RTO detection algorithm + * given in RFC4138 but can be selected separately). + * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. + * + * Rationale: if the RTO was spurious, new ACKs should arrive from the + * original window even after we transmit two new data segments. + * + * SACK version: + * on first step, wait until first cumulative ACK arrives, then move to + * the second step. In second step, the next ACK decides. + * + * F-RTO is implemented (mainly) in four functions: + * - tcp_use_frto() is used to determine if TCP is can use F-RTO + * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is + * called when tcp_use_frto() showed green light + * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm + * - tcp_enter_frto_loss() is called if there is not enough evidence + * to prove that the RTO is indeed spurious. It transfers the control + * from F-RTO to the conventional RTO recovery + */ +static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_sync_left_out(tp); - if (tp->snd_una == prior_snd_una || - !before(tp->snd_una, tp->frto_highmark)) { - /* RTO was caused by loss, start retransmitting in - * go-back-N slow start + /* Duplicate the behavior from Loss state (fastretrans_alert) */ + if (flag&FLAG_DATA_ACKED) + inet_csk(sk)->icsk_retransmits = 0; + + if (!before(tp->snd_una, tp->frto_highmark)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); + return 1; + } + + if (!IsSackFrto() || IsReno(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate */ - tcp_enter_frto_loss(sk); - return; + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return 1; + + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. */ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter >= 2) && + (!(flag&FLAG_FORWARD_PROGRESS) || + ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } } if (tp->frto_counter == 1) { - /* First ACK after RTO advances the window: allow two new - * segments out. - */ + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + tp->frto_counter = 2; + return 1; } else { - /* Also the second ACK after RTO advances the window. - * The RTO was likely spurious. Reduce cwnd and continue - * in congestion avoidance - */ - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tcp_moderate_cwnd(tp); + switch (sysctl_tcp_frto_response) { + case 2: + tcp_undo_spur_to_response(sk, flag); + break; + case 1: + tcp_conservative_spur_to_response(tp); + break; + default: + tcp_ratehalving_spur_to_response(sk); + break; + } + tp->frto_counter = 0; } - - /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavior is back to normal. - */ - tp->frto_counter = (tp->frto_counter + 1) % 3; + return 0; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -2513,6 +2747,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 prior_in_flight; s32 seq_rtt; int prior_packets; + int frto_cwnd = 0; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -2549,12 +2784,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) else NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; tcp_ca_event(sk, CA_EVENT_SLOW_ACK); @@ -2575,15 +2810,16 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una); + frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && + tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } @@ -2599,7 +2835,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (sk->sk_send_head) + if (tcp_send_head(sk)) tcp_ack_probe(sk); return 1; @@ -2620,13 +2856,13 @@ uninteresting_ack: void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) { unsigned char *ptr; - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; - while(length>0) { + while (length > 0) { int opcode=*ptr++; int opsize; @@ -2642,9 +2878,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, return; if (opsize > length) return; /* don't parse partial options */ - switch(opcode) { + switch (opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn && !estab) { + if (opsize==TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) @@ -2654,12 +2890,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (opsize==TCPOLEN_WINDOW && th->syn && !estab) if (sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *) ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if(net_ratelimit()) + if (net_ratelimit()) printk(KERN_INFO "tcp_parse_options: Illegal window " "scaling value %d >14 received.\n", snd_wscale); @@ -2669,7 +2905,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { + if (opsize==TCPOLEN_TIMESTAMP) { if ((estab && opt_rx->tstamp_ok) || (!estab && sysctl_tcp_timestamps)) { opt_rx->saw_tstamp = 1; @@ -2679,7 +2915,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { if (sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); @@ -2688,7 +2924,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; @@ -2701,10 +2937,11 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - }; + } + ptr+=opsize-2; length-=opsize; - }; + } } } @@ -2737,7 +2974,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = xtime.tv_sec; + tp->rx_opt.ts_recent_stamp = get_seconds(); } static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -2750,8 +2987,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } } @@ -2782,7 +3019,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2803,7 +3040,7 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff * { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && !tcp_disordered_ack(sk, skb)); } @@ -2910,7 +3147,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", __FUNCTION__, sk->sk_state); break; - }; + } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. @@ -3009,7 +3246,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) */ tp->rx_opt.num_sacks--; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + for (i=this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i+1]; continue; } @@ -3062,7 +3299,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.num_sacks--; sp--; } - for(; this_sack > 0; this_sack--, sp--) + for (; this_sack > 0; this_sack--, sp--) *sp = *(sp-1); new_sack: @@ -3088,7 +3325,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for(this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks; ) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3144,8 +3381,8 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); + if (tcp_hdr(skb)->fin) + tcp_fin(skb, sk, tcp_hdr(skb)); } } @@ -3153,7 +3390,7 @@ static int tcp_prune_queue(struct sock *sk); static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; @@ -3210,9 +3447,9 @@ queue_and_out: __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->len) - tcp_event_data_recv(sk, tp, skb); - if(th->fin) + if (skb->len) + tcp_event_data_recv(sk, skb); + if (th->fin) tcp_fin(skb, sk, th); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -3228,7 +3465,7 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (eaten > 0) __kfree_skb(skb); @@ -3392,7 +3629,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!skb->h.th->syn && !skb->h.th->fin && + if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start) || (skb->next != tail && @@ -3403,7 +3640,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start = TCP_SKB_CB(skb)->end_seq; skb = skb->next; } - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -3419,11 +3656,14 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); + skb_set_network_header(nskb, (skb_network_header(skb) - + skb->head)); + skb_set_transport_header(nskb, (skb_transport_header(skb) - + skb->head)); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); - nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); - nskb->h.raw = nskb->head + (skb->h.raw-skb->head); - nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); @@ -3449,7 +3689,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || + tcp_hdr(skb)->syn || + tcp_hdr(skb)->fin) return; } } @@ -3514,7 +3756,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - tcp_clamp_window(sk, tp); + tcp_clamp_window(sk); else if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -3583,8 +3825,10 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + /* If the user specified a specific send buffer setting, do * not modify it. */ @@ -3616,7 +3860,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_should_expand_sndbuf(sk, tp)) { + if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, @@ -3640,9 +3884,9 @@ static void tcp_check_space(struct sock *sk) } } -static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk) { - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); tcp_check_space(sk); } @@ -3790,7 +4034,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int err; local_bh_enable(); - if (skb->ip_summed==CHECKSUM_UNNECESSARY) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); else err = skb_copy_and_csum_datagram_iovec(skb, hlen, @@ -3822,7 +4066,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { - return skb->ip_summed != CHECKSUM_UNNECESSARY && + return !skb_csum_unnecessary(skb) && __tcp_checksum_complete_user(sk, skb); } @@ -3840,7 +4084,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = get_softnet_dma(); - if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); @@ -3856,7 +4100,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen tcp_rcv_space_adjust(sk); if ((tp->ucopy.len == 0) || - (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; sk->sk_data_ready(sk, 0); @@ -3976,7 +4220,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -4047,12 +4291,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -4109,7 +4353,7 @@ slow_path: goto discard; } - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4124,7 +4368,7 @@ slow_path: } step5: - if(th->ack) + if (th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); tcp_rcv_rtt_measure_ts(sk, skb); @@ -4135,7 +4379,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return 0; @@ -4412,13 +4656,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_LISTEN: - if(th->ack) + if (th->ack) return 1; - if(th->rst) + if (th->rst) goto discard; - if(th->syn) { + if (th->syn) { if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; @@ -4452,7 +4696,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } @@ -4474,7 +4718,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } /* step 2: check RST bit */ - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4497,7 +4741,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); - switch(sk->sk_state) { + switch (sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { tp->copied_seq = tp->rcv_nxt; @@ -4644,7 +4888,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0ba74bbe7d3..5a3e7f839fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -88,7 +88,7 @@ int sysctl_tcp_low_latency __read_mostly; #define ICMP_MIN_LENGTH 8 /* Socket used for sending RSTs */ -static struct socket *tcp_socket; +static struct socket *tcp_socket __read_mostly; void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); @@ -125,10 +125,10 @@ void tcp_unhash(struct sock *sk) static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) { - return secure_tcp_sequence_number(skb->nh.iph->daddr, - skb->nh.iph->saddr, - skb->h.th->dest, - skb->h.th->source); + return secure_tcp_sequence_number(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -149,7 +149,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) */ if (tcptw->tw_ts_recent_stamp && (twp == NULL || (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; @@ -224,7 +224,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * when trying new connection. */ if (peer != NULL && - peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } @@ -354,8 +354,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; __u32 seq; int err; @@ -499,11 +499,12 @@ out: void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { th->check = ~tcp_v4_check(len, inet->saddr, inet->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { th->check = tcp_v4_check(len, inet->saddr, inet->daddr, @@ -515,17 +516,18 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) int tcp_v4_gso_send_check(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; struct tcphdr *th; if (!pskb_may_pull(skb, sizeof(*th))) return -EINVAL; - iph = skb->nh.iph; - th = skb->h.th; + iph = ip_hdr(skb); + th = tcp_hdr(skb); th->check = 0; th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; return 0; @@ -546,7 +548,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb) static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; #ifdef CONFIG_TCP_MD5SIG @@ -585,7 +587,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len = sizeof(rep.th); #ifdef CONFIG_TCP_MD5SIG - key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL; + key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -597,14 +599,14 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1], key, - skb->nh.iph->daddr, - skb->nh.iph->saddr, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, &rep.th, IPPROTO_TCP, arg.iov[0].iov_len); } #endif - arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, - skb->nh.iph->saddr, /* XXX */ + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; @@ -622,7 +624,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) @@ -670,7 +672,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, * skb->sk) holds true, but we program defensively. */ if (!twsk && skb->sk) { - key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr); + key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr); } else if (twsk && twsk->tw_md5_keylen) { tw_key.key = twsk->tw_md5_key; tw_key.keylen = twsk->tw_md5_keylen; @@ -690,14 +692,14 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset], key, - skb->nh.iph->daddr, - skb->nh.iph->saddr, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, &rep.th, IPPROTO_TCP, arg.iov[0].iov_len); } #endif - arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, - skb->nh.iph->saddr, /* XXX */ + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; @@ -745,7 +747,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, skb = tcp_make_synack(sk, dst, req); if (skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); th->check = tcp_v4_check(skb->len, ireq->loc_addr, @@ -781,7 +783,7 @@ static void syn_flood_warning(struct sk_buff *skb) warntime = jiffies; printk(KERN_INFO "possible SYN flooding on port %d. Sending cookies.\n", - ntohs(skb->h.th->dest)); + ntohs(tcp_hdr(skb)->dest)); } } #endif @@ -1133,8 +1135,8 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) */ __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; - struct iphdr *iph = skb->nh.iph; - struct tcphdr *th = skb->h.th; + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); int length = (th->doff << 2) - sizeof(struct tcphdr); int genhash; unsigned char *ptr; @@ -1251,8 +1253,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) struct inet_request_sock *ireq; struct tcp_options_received tmp_opt; struct request_sock *req; - __be32 saddr = skb->nh.iph->saddr; - __be32 daddr = skb->nh.iph->daddr; + __be32 saddr = ip_hdr(skb)->saddr; + __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES @@ -1327,7 +1329,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->rmt_addr = saddr; ireq->opt = tcp_v4_save_options(sk, skb); if (!want_cookie) - TCP_ECN_create_request(req, skb->h.th); + TCP_ECN_create_request(req, tcp_hdr(skb)); if (want_cookie) { #ifdef CONFIG_SYN_COOKIES @@ -1351,7 +1353,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { - if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1375,7 +1377,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " "request from %u.%u.%u.%u/%u\n", NIPQUAD(saddr), - ntohs(skb->h.th->source)); + ntohs(tcp_hdr(skb)->source)); dst_release(dst); goto drop_and_free; } @@ -1439,7 +1441,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->opt = ireq->opt; ireq->opt = NULL; newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = skb->nh.iph->ttl; + newinet->mc_ttl = ip_hdr(skb)->ttl; inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; @@ -1481,8 +1483,8 @@ exit: static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; - struct iphdr *iph = skb->nh.iph; + struct tcphdr *th = tcp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ @@ -1491,9 +1493,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); - nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, - th->source, skb->nh.iph->daddr, - th->dest, inet_iif(skb)); + nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source, + iph->daddr, th->dest, inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1513,15 +1514,17 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) { + const struct iphdr *iph = ip_hdr(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v4_check(skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) { + if (!tcp_v4_check(skb->len, iph->saddr, + iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; } } - skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len, IPPROTO_TCP, 0); if (skb->len <= 76) { @@ -1555,7 +1558,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ TCP_CHECK_TIMER(sk); - if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) { + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } @@ -1563,7 +1566,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1581,7 +1584,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } TCP_CHECK_TIMER(sk); - if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) { + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } @@ -1610,6 +1613,7 @@ csum_err: int tcp_v4_rcv(struct sk_buff *skb) { + const struct iphdr *iph; struct tcphdr *th; struct sock *sk; int ret; @@ -1623,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; - th = skb->h.th; + th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) goto bad_packet; @@ -1634,23 +1638,21 @@ int tcp_v4_rcv(struct sk_buff *skb) * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ - if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb))) + if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) goto bad_packet; - th = skb->h.th; + th = tcp_hdr(skb); + iph = ip_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; + TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, th->dest, - inet_iif(skb)); - + sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source, + iph->daddr, th->dest, inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1724,8 +1726,7 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, - skb->nh.iph->daddr, - th->dest, + iph->daddr, th->dest, inet_iif(skb)); if (sk2) { inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); @@ -1770,7 +1771,7 @@ int tcp_v4_remember_stamp(struct sock *sk) if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; @@ -1791,7 +1792,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; @@ -1890,7 +1891,7 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); /* Cleanup up the write buffer. */ - sk_stream_writequeue_purge(sk); + tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); @@ -2293,13 +2294,13 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, req); } -static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) +static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i) { int timer_active; unsigned long timer_expires; - struct tcp_sock *tp = tcp_sk(sp); - const struct inet_connection_sock *icsk = inet_csk(sp); - struct inet_sock *inet = inet_sk(sp); + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); __be32 dest = inet->daddr; __be32 src = inet->rcv_saddr; __u16 destp = ntohs(inet->dport); @@ -2311,9 +2312,9 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; - } else if (timer_pending(&sp->sk_timer)) { + } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = sk->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -2321,17 +2322,17 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5d %8d %lu %d %p %u %u %u %u %d", - i, src, srcp, dest, destp, sp->sk_state, + i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, - sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog : + sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sp), + sock_i_uid(sk), icsk->icsk_probes_out, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, + sock_i_ino(sk), + atomic_read(&sk->sk_refcnt), sk, icsk->icsk_rto, icsk->icsk_ack.ato, (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index f0ebaf0e21c..43294ad9f63 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -218,7 +218,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) * 3. calc smoothed OWD (SOWD). * Most ideas come from the original TCP-LP implementation. */ -static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) { struct lp *lp = inet_csk_ca(sk); s64 mowd = tcp_lp_owd_calculator(sk); @@ -261,11 +261,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) * newReno in increase case. * We work it out by following the idea from TCP-LP's paper directly */ -static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + /* calc inference */ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); @@ -312,11 +314,11 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) } static struct tcp_congestion_ops tcp_lp = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_lp_rtt_sample, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6b5c64f3c92..a12b08fca5a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -149,7 +149,7 @@ kill_with_rst: tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent_stamp = get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -208,7 +208,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent_stamp = get_seconds(); } inet_twsk_put(tw); @@ -246,7 +246,7 @@ kill: if (paws_reject) NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); - if(!th->rst) { + if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate @@ -324,7 +324,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tcp_alloc_md5sig_pool() == NULL) BUG(); } - } while(0); + } while (0); #endif /* Linkage updates. */ @@ -387,8 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; - newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; + newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; tcp_prequeue_init(newtp); @@ -422,10 +422,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->rcv_wup = treq->rcv_isn + 1; newtp->write_seq = treq->snt_isn + 1; newtp->pushed_seq = newtp->write_seq; - newtp->copied_seq = treq->rcv_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -440,7 +438,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { + if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { if (sysctl_tcp_fack) newtp->rx_opt.sack_ok |= 2; } @@ -455,12 +453,13 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } - newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale; + newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << + newtp->rx_opt.snd_wscale); newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = xtime.tv_sec; + newtp->rx_opt.ts_recent_stamp = get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; @@ -490,7 +489,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock *req, struct request_sock **prev) { - struct tcphdr *th = skb->h.th; + const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -506,7 +505,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); paws_reject = tcp_paws_check(&tmp_opt, th->rst); } } @@ -712,8 +711,8 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); - + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), + skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent, 0); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dc151139b5a..0faacf9c419 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,14 +62,13 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -static void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct sk_buff *skb) { - sk->sk_send_head = skb->next; - if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) - sk->sk_send_head = NULL; + struct tcp_sock *tp = tcp_sk(sk); + + tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); + tcp_packets_out_inc(sk, skb); } /* SND.NXT, if window was not shrunk. @@ -78,8 +77,10 @@ static void update_send_head(struct sock *sk, struct tcp_sock *tp, * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ -static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp) +static inline __u32 tcp_acceptable_seq(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) return tp->snd_nxt; else @@ -238,7 +239,7 @@ static u16 tcp_select_window(struct sock *sk) u32 new_win = __tcp_select_window(sk); /* Never shrink the offered window */ - if(new_win < cur_win) { + if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero @@ -289,10 +290,12 @@ static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK))); - for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + + for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } + if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks--; @@ -337,7 +340,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, */ *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); if (ts) { - if(sack) + if (sack) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | @@ -349,7 +352,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); /* TSVAL */ *ptr++ = htonl(ts_recent); /* TSECR */ - } else if(sack) + } else if (sack) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | @@ -406,7 +409,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* If congestion control is doing timestamping, we must * take such a timestamp before we potentially clone/copy. */ - if (icsk->icsk_ca_ops->rtt_sample) + if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) __net_timestamp(skb); if (likely(clone_it)) { @@ -430,7 +433,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, sysctl_flags = 0; if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { + if (sysctl_tcp_timestamps) { tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } @@ -465,11 +468,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; #endif - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); /* Build TCP header and checksum it. */ + th = tcp_hdr(skb); th->source = inet->sport; th->dest = inet->dport; th->seq = htonl(tcb->seq); @@ -515,7 +519,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, md5 ? &md5_hash_location : #endif NULL); - TCP_ECN_send(sk, tp, skb, tcp_header_size); + TCP_ECN_send(sk, skb, tcp_header_size); } #ifdef CONFIG_TCP_MD5SIG @@ -524,7 +528,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tp->af_specific->calc_md5_hash(md5_hash_location, md5, sk, NULL, NULL, - skb->h.th, + tcp_hdr(skb), sk->sk_protocol, skb->len); } @@ -545,7 +549,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(err <= 0)) return err; - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); return net_xmit_eval(err); @@ -567,12 +571,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; skb_header_release(skb); - __skb_queue_tail(&sk->sk_write_queue, skb); + tcp_add_write_queue_tail(sk, skb); sk_charge_skb(sk, skb); - - /* Queue it, remembering where we must start sending. */ - if (sk->sk_send_head == NULL) - sk->sk_send_head = skb; } static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) @@ -705,7 +705,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff, &sk->sk_write_queue); + tcp_insert_write_queue_after(skb, buff, sk); return 0; } @@ -736,7 +736,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) } skb_shinfo(skb)->nr_frags = k; - skb->tail = skb->data; + skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; } @@ -930,8 +930,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out = tp->packets_out; if (packets_out >= tp->snd_cwnd) { @@ -943,7 +944,8 @@ static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) + if (sysctl_tcp_slow_start_after_idle && + (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); } } @@ -1033,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). + */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -1055,7 +1059,7 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, uns return !after(end_seq, tp->snd_una + tp->snd_wnd); } -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) +/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) * should be put on the wire right now. If so, it returns the number of * packets allowed by the congestion window. */ @@ -1078,15 +1082,10 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return cwnd_quota; } -static inline int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) +int tcp_may_send_now(struct sock *sk) { - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = tcp_send_head(sk); return (skb && tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), @@ -1142,7 +1141,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff, &sk->sk_write_queue); + tcp_insert_write_queue_after(skb, buff, sk); return 0; } @@ -1152,8 +1151,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; @@ -1248,10 +1248,10 @@ static int tcp_mtu_probe(struct sock *sk) /* Have enough data in the send queue to probe? */ len = 0; - if ((skb = sk->sk_send_head) == NULL) + if ((skb = tcp_send_head(sk)) == NULL) return -1; while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) - skb = skb->next; + skb = tcp_write_queue_next(sk, skb); if (len < probe_size) return -1; @@ -1278,9 +1278,9 @@ static int tcp_mtu_probe(struct sock *sk) return -1; sk_charge_skb(sk, nskb); - skb = sk->sk_send_head; - __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); - sk->sk_send_head = nskb; + skb = tcp_send_head(sk); + tcp_insert_write_queue_before(nskb, skb, sk); + tcp_advance_send_head(sk, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; @@ -1291,7 +1291,7 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; while (len < probe_size) { - next = skb->next; + next = tcp_write_queue_next(sk, skb); copy = min_t(int, skb->len, probe_size - len); if (nskb->ip_summed) @@ -1304,7 +1304,7 @@ static int tcp_mtu_probe(struct sock *sk) /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & @@ -1332,7 +1332,7 @@ static int tcp_mtu_probe(struct sock *sk) /* Decrement cwnd here because we are sending * effectively two packets. */ tp->snd_cwnd--; - update_send_head(sk, tp, nskb); + update_send_head(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; @@ -1376,7 +1376,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) sent_pkts = 1; } - while ((skb = sk->sk_send_head)) { + while ((skb = tcp_send_head(sk))) { unsigned int limit; tso_segs = tcp_init_tso_segs(sk, skb, mss_now); @@ -1395,7 +1395,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (tcp_tso_should_defer(sk, tp, skb)) + if (tcp_tso_should_defer(sk, skb)) break; } @@ -1424,31 +1424,31 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ - update_send_head(sk, tp, skb); + update_send_head(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts++; } if (likely(sent_pkts)) { - tcp_cwnd_validate(sk, tp); + tcp_cwnd_validate(sk); return 0; } - return !tp->packets_out && sk->sk_send_head; + return !tp->packets_out && tcp_send_head(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ -void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned int cur_mss, int nonagle) +void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + int nonagle) { - struct sk_buff *skb = sk->sk_send_head; + struct sk_buff *skb = tcp_send_head(sk); if (skb) { if (tcp_write_xmit(sk, cur_mss, nonagle)) - tcp_check_probe_timer(sk, tp); + tcp_check_probe_timer(sk); } } @@ -1458,7 +1458,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; + struct sk_buff *skb = tcp_send_head(sk); unsigned int tso_segs, cwnd_quota; BUG_ON(!skb || skb->len < mss_now); @@ -1492,8 +1492,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { - update_send_head(sk, tp, skb); - tcp_cwnd_validate(sk, tp); + update_send_head(sk, skb); + tcp_cwnd_validate(sk); return; } } @@ -1607,6 +1607,9 @@ u32 __tcp_select_window(struct sock *sk) */ if (window <= free_space - mss || window > free_space) window = (free_space/mss)*mss; + else if (mss == full_space && + free_space > window + full_space/2) + window = free_space; } return window; @@ -1616,7 +1619,7 @@ u32 __tcp_select_window(struct sock *sk) static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = skb->next; + struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); /* The first test we must make is that neither of these two * SKB's are still referenced by someone else. @@ -1626,7 +1629,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m u16 flags = TCP_SKB_CB(skb)->flags; /* Also punt if next skb has been SACK'd. */ - if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) return; /* Next skb is out of window. */ @@ -1648,9 +1651,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m clear_all_retrans_hints(tp); /* Ok. We will be able to collapse the packet. */ - __skb_unlink(next_skb, &sk->sk_write_queue); + tcp_unlink_write_queue(next_skb, sk); - memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + skb_copy_from_linear_data(next_skb, + skb_put(skb, next_skb_size), + next_skb_size); if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -1702,7 +1707,9 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk, 0); int lost = 0; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (skb->len > mss && !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { @@ -1784,13 +1791,13 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } /* Collapse two adjacent packets if worthwhile and we can. */ - if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && - (skb->len < (cur_mss >> 1)) && - (skb->next != sk->sk_send_head) && - (skb->next != (struct sk_buff *)&sk->sk_write_queue) && - (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && - (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) && - (sysctl_tcp_retrans_collapse != 0)) + if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (cur_mss >> 1)) && + (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && + (!tcp_skb_is_last(sk, skb)) && + (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && + (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) @@ -1800,9 +1807,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * retransmit when old data is attached. So strip it off * since it is cheap to do so and saves bytes on the network. */ - if(skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; skb_shinfo(skb)->gso_segs = 1; @@ -1868,15 +1875,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) skb = tp->retransmit_skb_hint; packet_cnt = tp->retransmit_cnt_hint; }else{ - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); packet_cnt = 0; } /* First pass: retransmit lost packets. */ if (tp->lost_out) { - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + if (skb == tcp_send_head(sk)) + break; /* we could do better than to assign each time */ tp->retransmit_skb_hint = skb; tp->retransmit_cnt_hint = packet_cnt; @@ -1902,8 +1911,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) else NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); - if (skb == - skb_peek(&sk->sk_write_queue)) + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); @@ -1933,18 +1941,20 @@ void tcp_xmit_retransmit_queue(struct sock *sk) * segments to send. */ - if (tcp_may_send_now(sk, tp)) + if (tcp_may_send_now(sk)) return; if (tp->forward_skb_hint) { skb = tp->forward_skb_hint; packet_cnt = tp->forward_cnt_hint; } else{ - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); packet_cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; tp->forward_cnt_hint = packet_cnt; tp->forward_skb_hint = skb; @@ -1969,7 +1979,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; } - if (skb == skb_peek(&sk->sk_write_queue)) + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); @@ -1985,7 +1995,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) void tcp_send_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); + struct sk_buff *skb = tcp_write_queue_tail(sk); int mss_now; /* Optimization, tack on the FIN if we have a queue of @@ -1994,7 +2004,7 @@ void tcp_send_fin(struct sock *sk) */ mss_now = tcp_current_mss(sk, 1); - if (sk->sk_send_head != NULL) { + if (tcp_send_head(sk) != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; @@ -2021,17 +2031,16 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended - * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ @@ -2051,7 +2060,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_shinfo(skb)->gso_type = 0; /* Send it off. */ - TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) @@ -2067,7 +2076,7 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff* skb; - skb = skb_peek(&sk->sk_write_queue); + skb = tcp_write_queue_head(sk); if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; @@ -2077,9 +2086,9 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) return -ENOMEM; - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); - __skb_queue_head(&sk->sk_write_queue, nskb); + __tcp_add_write_queue_head(sk, nskb); sk_stream_free_skb(sk, skb); sk_charge_skb(sk, nskb); skb = nskb; @@ -2129,8 +2138,10 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (md5) tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; #endif - skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + th = tcp_hdr(skb); memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; @@ -2184,7 +2195,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, tp->af_specific->calc_md5_hash(md5_hash_location, md5, NULL, dst, req, - skb->h.th, sk->sk_protocol, + tcp_hdr(skb), sk->sk_protocol, skb->len); } #endif @@ -2267,7 +2278,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; - TCP_ECN_send_syn(sk, tp, buff); + TCP_ECN_send_syn(sk, buff); TCP_SKB_CB(buff)->sacked = 0; skb_shinfo(buff)->gso_segs = 1; skb_shinfo(buff)->gso_size = 0; @@ -2281,7 +2292,7 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); - __skb_queue_tail(&sk->sk_write_queue, buff); + __tcp_add_write_queue_tail(sk, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); @@ -2359,7 +2370,6 @@ void tcp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; /* We are not putting this on the write queue, so @@ -2385,7 +2395,7 @@ void tcp_send_ack(struct sock *sk) skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk); TCP_SKB_CB(buff)->when = tcp_time_stamp; tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } @@ -2437,7 +2447,7 @@ int tcp_write_wakeup(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if ((skb = sk->sk_send_head) != NULL && + if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; unsigned int mss = tcp_current_mss(sk, 0); @@ -2463,7 +2473,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { - update_send_head(sk, tp, skb); + update_send_head(sk, skb); } return err; } else { @@ -2487,7 +2497,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk); - if (tp->packets_out || !sk->sk_send_head) { + if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 61f406f2729..3938d5dbdf2 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -26,6 +26,8 @@ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/kfifo.h> +#include <linux/ktime.h> +#include <linux/time.h> #include <linux/vmalloc.h> #include <net/tcp.h> @@ -34,43 +36,45 @@ MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>"); MODULE_DESCRIPTION("TCP cwnd snooper"); MODULE_LICENSE("GPL"); -static int port = 0; +static int port __read_mostly = 0; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); -static int bufsize = 64*1024; +static int bufsize __read_mostly = 64*1024; MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); module_param(bufsize, int, 0); +static int full __read_mostly; +MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); +module_param(full, int, 0); + static const char procname[] = "tcpprobe"; struct { - struct kfifo *fifo; - spinlock_t lock; + struct kfifo *fifo; + spinlock_t lock; wait_queue_head_t wait; - struct timeval tstart; + ktime_t start; + u32 lastcwnd; } tcpw; +/* + * Print to log with timestamps. + * FIXME: causes an extra copy + */ static void printl(const char *fmt, ...) { va_list args; int len; - struct timeval now; + struct timespec tv; char tbuf[256]; va_start(args, fmt); - do_gettimeofday(&now); + /* want monotonic time since start of tcp_probe */ + tv = ktime_to_timespec(ktime_sub(ktime_get(), tcpw.start)); - now.tv_sec -= tcpw.tstart.tv_sec; - now.tv_usec -= tcpw.tstart.tv_usec; - if (now.tv_usec < 0) { - --now.tv_sec; - now.tv_usec += 1000000; - } - - len = sprintf(tbuf, "%lu.%06lu ", - (unsigned long) now.tv_sec, - (unsigned long) now.tv_usec); + len = sprintf(tbuf, "%lu.%09lu ", + (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec); len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); @@ -78,38 +82,44 @@ static void printl(const char *fmt, ...) wake_up(&tcpw.wait); } -static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) +/* + * Hook inserted to be called before each receive packet. + * Note: arguments must match tcp_rcv_established()! + */ +static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); - if (port == 0 || ntohs(inet->dport) == port || - ntohs(inet->sport) == port) { + /* Only update if port matches */ + if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) + && (full || tp->snd_cwnd != tcpw.lastcwnd)) { printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", NIPQUAD(inet->saddr), ntohs(inet->sport), NIPQUAD(inet->daddr), ntohs(inet->dport), - size, tp->snd_nxt, tp->snd_una, + skb->len, tp->snd_nxt, tp->snd_una, tp->snd_cwnd, tcp_current_ssthresh(sk), - tp->snd_wnd); + tp->snd_wnd, tp->srtt >> 3); + tcpw.lastcwnd = tp->snd_cwnd; } jprobe_return(); return 0; } -static struct jprobe tcp_send_probe = { +static struct jprobe tcp_probe = { .kp = { - .symbol_name = "tcp_sendmsg", + .symbol_name = "tcp_rcv_established", }, - .entry = JPROBE_ENTRY(jtcp_sendmsg), + .entry = JPROBE_ENTRY(jtcp_rcv_established), }; static int tcpprobe_open(struct inode * inode, struct file * file) { kfifo_reset(tcpw.fifo); - do_gettimeofday(&tcpw.tstart); + tcpw.start = ktime_get(); return 0; } @@ -162,7 +172,7 @@ static __init int tcpprobe_init(void) if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) goto err0; - ret = register_jprobe(&tcp_send_probe); + ret = register_jprobe(&tcp_probe); if (ret) goto err1; @@ -180,7 +190,7 @@ static __exit void tcpprobe_exit(void) { kfifo_free(tcpw.fifo); proc_net_remove(procname); - unregister_jprobe(&tcp_send_probe); + unregister_jprobe(&tcp_probe); } module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a9243cfc1be..2ca97b20929 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -233,7 +233,7 @@ static void tcp_probe_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int max_probes; - if (tp->packets_out || !sk->sk_send_head) { + if (tp->packets_out || !tcp_send_head(sk)) { icsk->icsk_probes_out = 0; return; } @@ -284,7 +284,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue)); + BUG_TRAP(!tcp_write_queue_empty(sk)); if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { @@ -306,7 +306,7 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk, 0); - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); __sk_dst_reset(sk); goto out_reset_timer; } @@ -341,7 +341,7 @@ static void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk, 0); } - if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) { + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -482,7 +482,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || sk->sk_send_head) + if (tp->packets_out || tcp_send_head(sk)) goto resched; elapsed = tcp_time_stamp - tp->rcv_tstamp; diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 5c484dceb96..73e19cf7df2 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -38,6 +38,8 @@ #include <net/tcp.h> +#include "tcp_vegas.h" + /* Default values of the Vegas variables, in fixed-point representation * with V_PARAM_SHIFT bits to the right of the binary point. */ @@ -54,17 +56,6 @@ module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); -/* Vegas variables */ -struct vegas { - u32 beg_snd_nxt; /* right edge during last RTT */ - u32 beg_snd_una; /* left edge during last RTT */ - u32 beg_snd_cwnd; /* saves the size of the cwnd */ - u8 doing_vegas_now;/* if true, do vegas for this RTT */ - u16 cntRTT; /* # of RTTs measured within last RTT */ - u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ -}; - /* There are several situations when we must "re-start" Vegas: * * o when a connection is established @@ -81,7 +72,7 @@ struct vegas { * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ -static inline void vegas_enable(struct sock *sk) +static void vegas_enable(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); @@ -104,13 +95,14 @@ static inline void vegas_disable(struct sock *sk) vegas->doing_vegas_now = 0; } -static void tcp_vegas_init(struct sock *sk) +void tcp_vegas_init(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; vegas_enable(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_init); /* Do RTT sampling needed for Vegas. * Basically we: @@ -120,10 +112,13 @@ static void tcp_vegas_init(struct sock *sk) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct vegas *vegas = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = ktime_to_us(net_timedelta(last)) + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) @@ -135,8 +130,9 @@ static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) vegas->minRTT = min(vegas->minRTT, vrtt); vegas->cntRTT++; } +EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); -static void tcp_vegas_state(struct sock *sk, u8 ca_state) +void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) @@ -144,6 +140,7 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) else vegas_disable(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_state); /* * If the connection is idle and we are restarting, @@ -154,12 +151,13 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) * packets, _then_ we can make Vegas calculations * again. */ -static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_vegas_init(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) @@ -336,30 +334,29 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, } /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_vegas_get_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info *info; - - info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, - sizeof(*info))); - - info->tcpv_enabled = ca->doing_vegas_now; - info->tcpv_rttcnt = ca->cntRTT; - info->tcpv_rtt = ca->baseRTT; - info->tcpv_minrtt = ca->minRTT; - rtattr_failure: ; + struct tcpvegas_info info = { + .tcpv_enabled = ca->doing_vegas_now, + .tcpv_rttcnt = ca->cntRTT, + .tcpv_rtt = ca->baseRTT, + .tcpv_minrtt = ca->minRTT, + }; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } +EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_vegas_rtt_calc, + .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h new file mode 100644 index 00000000000..502fa818363 --- /dev/null +++ b/net/ipv4/tcp_vegas.h @@ -0,0 +1,24 @@ +/* + * TCP Vegas congestion control interface + */ +#ifndef __TCP_VEGAS_H +#define __TCP_VEGAS_H 1 + +/* Vegas variables */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +extern void tcp_vegas_init(struct sock *sk); +extern void tcp_vegas_state(struct sock *sk, u8 ca_state); +extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last); +extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); + +#endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index ce57bf302f6..9edb340f2f9 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -69,10 +69,13 @@ static void tcp_veno_init(struct sock *sk) } /* Do rtt sampling needed for Veno. */ -static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct veno *veno = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = ktime_to_us(net_timedelta(last)) + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) @@ -199,10 +202,11 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, - .rtt_sample = tcp_veno_rtt_calc, + .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 4e1b61032a9..e61e09dd513 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -100,7 +100,7 @@ static void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct westwood *w = inet_csk_ca(sk); if (cnt > 0) @@ -226,7 +226,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - switch(event) { + switch (event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); break; @@ -260,16 +260,13 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct rtattr *rta; - struct tcpvegas_info *info; - - rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); - info = RTA_DATA(rta); - info->tcpv_enabled = 1; - info->tcpv_rttcnt = 0; - info->tcpv_rtt = jiffies_to_usecs(ca->rtt); - info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); - rtattr_failure: ; + struct tcpvegas_info info = { + .tcpv_enabled = 1, + .tcpv_rtt = jiffies_to_usecs(ca->rtt), + .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), + }; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c new file mode 100644 index 00000000000..545ed237ab5 --- /dev/null +++ b/net/ipv4/tcp_yeah.c @@ -0,0 +1,268 @@ +/* + * + * YeAH TCP + * + * For further details look at: + * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * + */ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> + +#include <net/tcp.h> + +#include "tcp_vegas.h" + +#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck +#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt +#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss +#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion +#define TCP_YEAH_PHY 8 //lin maximum delta from base +#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss +#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count + +#define TCP_SCALABLE_AI_CNT 100U + +/* YeAH variables */ +struct yeah { + struct vegas vegas; /* must be first */ + + /* YeAH */ + u32 lastQ; + u32 doing_reno_now; + + u32 reno_count; + u32 fast_count; + + u32 pkts_acked; +}; + +static void tcp_yeah_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + tcp_vegas_init(sk); + + yeah->doing_reno_now = 0; + yeah->lastQ = 0; + + yeah->reno_count = 2; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. :) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); + +} + + +static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (icsk->icsk_ca_state == TCP_CA_Open) + yeah->pkts_acked = pkts_acked; + + tcp_vegas_pkts_acked(sk, pkts_acked, last); +} + +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + else if (!yeah->doing_reno_now) { + /* Scalable */ + + tp->snd_cwnd_cnt+=yeah->pkts_acked; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + + yeah->pkts_acked = 1; + + } else { + /* Reno */ + + if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + + /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) + * + * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up yeahly with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, yeah->vegas.beg_snd_nxt)) { + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (yeah->vegas.cntRTT > 2) { + u32 rtt, queue; + u64 bw; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = yeah->vegas.minRTT; + + /* Compute excess number of packets above bandwidth + * Avoid doing full 64 bit divide. + */ + bw = tp->snd_cwnd; + bw *= rtt - yeah->vegas.baseRTT; + do_div(bw, rtt); + queue = bw; + + if (queue > TCP_YEAH_ALPHA || + rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { + if (queue > TCP_YEAH_ALPHA + && tp->snd_cwnd > yeah->reno_count) { + u32 reduction = min(queue / TCP_YEAH_GAMMA , + tp->snd_cwnd >> TCP_YEAH_EPSILON); + + tp->snd_cwnd -= reduction; + + tp->snd_cwnd = max(tp->snd_cwnd, + yeah->reno_count); + + tp->snd_ssthresh = tp->snd_cwnd; + } + + if (yeah->reno_count <= 2) + yeah->reno_count = max(tp->snd_cwnd>>1, 2U); + else + yeah->reno_count++; + + yeah->doing_reno_now = min(yeah->doing_reno_now + 1, + 0xffffffU); + } else { + yeah->fast_count++; + + if (yeah->fast_count > TCP_YEAH_ZETA) { + yeah->reno_count = 2; + yeah->fast_count = 0; + } + + yeah->doing_reno_now = 0; + } + + yeah->lastQ = queue; + + } + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; + yeah->vegas.beg_snd_nxt = tp->snd_nxt; + yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Wipe the slate clean for the next RTT. */ + yeah->vegas.cntRTT = 0; + yeah->vegas.minRTT = 0x7fffffff; + } +} + +static u32 tcp_yeah_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + u32 reduction; + + if (yeah->doing_reno_now < TCP_YEAH_RHO) { + reduction = yeah->lastQ; + + reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + + reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + } else + reduction = max(tp->snd_cwnd>>1,2U); + + yeah->fast_count = 0; + yeah->reno_count = max(yeah->reno_count>>1, 2U); + + return tp->snd_cwnd - reduction; +} + +static struct tcp_congestion_ops tcp_yeah = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_yeah_init, + .ssthresh = tcp_yeah_ssthresh, + .cong_avoid = tcp_yeah_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + .pkts_acked = tcp_yeah_pkts_acked, + + .owner = THIS_MODULE, + .name = "yeah", +}; + +static int __init tcp_yeah_register(void) +{ + BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_yeah); + return 0; +} + +static void __exit tcp_yeah_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_yeah); +} + +module_init(tcp_yeah_register); +module_exit(tcp_yeah_unregister); + +MODULE_AUTHOR("Angelo P. Castellani"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("YeAH TCP"); diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h new file mode 100644 index 00000000000..ed3b7198f23 --- /dev/null +++ b/net/ipv4/tcp_yeah.h @@ -0,0 +1,7 @@ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> +#include <asm/div64.h> + +#include <net/tcp.h> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fc620a7c1db..113e0c4c8a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); static int udp_port_rover; -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +/* + * Note about this hash function : + * Typical use is probably daddr = 0, only dport is going to vary hash + */ +static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr) +{ + addr ^= addr >> 16; + addr ^= addr >> 8; + return port ^ addr; +} + +static inline int __udp_lib_port_inuse(unsigned int hash, int port, + __be32 daddr, struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; + struct inet_sock *inet; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_hash == num) + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { + if (sk->sk_hash != hash) + continue; + inet = inet_sk(sk); + if (inet->num != port) + continue; + if (inet->rcv_saddr == daddr) return 1; + } return 0; } @@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_node *node; struct hlist_head *head; struct sock *sk2; + unsigned int hash; int error = 1; write_lock_bh(&udp_hash_lock); @@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { int size; - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -175,12 +197,23 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ; } result = best; - for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { + for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; + i++, result += UDP_HTABLE_SIZE) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (! __udp_lib_lport_inuse(result, udptable)) + hash = hash_port_and_addr(result, 0); + if (__udp_lib_port_inuse(hash, result, + 0, udptable)) + continue; + if (!inet_sk(sk)->rcv_saddr) + break; + + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + if (! __udp_lib_port_inuse(hash, result, + inet_sk(sk)->rcv_saddr, udptable)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -188,21 +221,41 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: *port_rover = snum = result; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(snum, 0); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && - sk2 != sk && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (*saddr_comp)(sk, sk2) ) + if (sk2->sk_hash == hash && + sk2 != sk && + inet_sk(sk2)->num == snum && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) goto fail; + + if (inet_sk(sk)->rcv_saddr) { + hash = hash_port_and_addr(snum, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; + + sk_for_each(sk2, node, head) + if (sk2->sk_hash == hash && + sk2 != sk && + inet_sk(sk2)->num == snum && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == + sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) + goto fail; + } } inet_sk(sk)->num = snum; - sk->sk_hash = snum; + sk->sk_hash = hash; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -212,13 +265,13 @@ fail: return error; } -__inline__ int udp_get_port(struct sock *sk, unsigned short snum, +int udp_get_port(struct sock *sk, unsigned short snum, int (*scmp)(const struct sock *, const struct sock *)) { return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp); } -inline int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -241,63 +294,77 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, { struct sock *sk, *result = NULL; struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; + unsigned int hash, hashwild; + int score, best = -1, hport = ntohs(dport); + + hash = hash_port_and_addr(hport, daddr); + hashwild = hash_port_and_addr(hport, 0); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + +lookup: + + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if(score == 9) { - result = sk; - break; - } else if(score > badness) { - result = sk; - badness = score; - } + if (sk->sk_hash != hash || ipv6_only_sock(sk) || + inet->num != hport) + continue; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 9) { + result = sk; + goto found; + } else if (score > best) { + result = sk; + best = score; + } + } + + if (hash != hashwild) { + hash = hashwild; + goto lookup; } +found: if (result) sock_hold(result); read_unlock(&udp_hash_lock); return result; } -static inline struct sock *udp_v4_mcast_next(struct sock *sk, - __be16 loc_port, __be32 loc_addr, +static inline struct sock *udp_v4_mcast_next(struct sock *sk, unsigned int hnum, + int hport, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif) { struct hlist_node *node; struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); sk_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (s->sk_hash != hnum || + inet->num != hport || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || @@ -329,8 +396,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) struct inet_sock *inet; struct iphdr *iph = (struct iphdr*)skb->data; struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; int harderr; int err; @@ -390,7 +457,7 @@ out: sock_put(sk); } -__inline__ void udp_err(struct sk_buff *skb, u32 info) +void udp_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, udp_hash); } @@ -419,13 +486,14 @@ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, int len ) { unsigned int offset; - struct udphdr *uh = skb->h.uh; + struct udphdr *uh = udp_hdr(skb); __wsum csum = 0; if (skb_queue_len(&sk->sk_write_queue) == 1) { /* * Only one fragment on the socket. */ + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { @@ -434,7 +502,7 @@ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, * fragments on the socket so that all csums of sk_buffs * should be together */ - offset = skb->h.raw - skb->data; + offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); skb->ip_summed = CHECKSUM_NONE; @@ -469,7 +537,7 @@ static int udp_push_pending_frames(struct sock *sk) /* * Create a UDP header */ - uh = skb->h.uh; + uh = udp_hdr(skb); uh->source = fl->fl_ip_sport; uh->dest = fl->fl_ip_dport; uh->len = htons(up->len); @@ -765,38 +833,38 @@ out: int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { - switch(cmd) + switch (cmd) { + case SIOCOUTQ: { - case SIOCOUTQ: - { - int amount = atomic_read(&sk->sk_wmem_alloc); - return put_user(amount, (int __user *)arg); - } + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } - case SIOCINQ: - { - struct sk_buff *skb; - unsigned long amount; - - amount = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount - * of this packet since that is all - * that will be read. - */ - amount = skb->len - sizeof(struct udphdr); - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + case SIOCINQ: + { + struct sk_buff *skb; + unsigned long amount; + + amount = 0; + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount + * of this packet since that is all + * that will be read. + */ + amount = skb->len - sizeof(struct udphdr); } + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } - default: - return -ENOIOCTLCMD; + default: + return -ENOIOCTLCMD; } - return(0); + + return 0; } /* @@ -810,7 +878,9 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; - int copied, err, copy_only, is_udplite = IS_UDPLITE(sk); + unsigned int ulen, copied; + int err; + int is_udplite = IS_UDPLITE(sk); /* * Check any passed addresses @@ -826,28 +896,25 @@ try_again: if (!skb) goto out; - copied = skb->len - sizeof(struct udphdr); - if (copied > len) { - copied = len; + ulen = skb->len - sizeof(struct udphdr); + copied = len; + if (copied > ulen) + copied = ulen; + else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; - } /* - * Decide whether to checksum and/or copy data. - * - * UDP: checksum may have been computed in HW, - * (re-)compute it if message is truncated. - * UDP-Lite: always needs to checksum, no HW support. + * If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, or if we only want a partial + * coverage checksum (UDP-Lite), do it before the copy. */ - copy_only = (skb->ip_summed==CHECKSUM_UNNECESSARY); - if (is_udplite || (!copy_only && msg->msg_flags&MSG_TRUNC)) { - if (__udp_lib_checksum_complete(skb)) + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (udp_lib_checksum_complete(skb)) goto csum_copy_err; - copy_only = 1; } - if (copy_only) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied ); else { @@ -866,8 +933,8 @@ try_again: if (sin) { sin->sin_family = AF_INET; - sin->sin_port = skb->h.uh->source; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_port = udp_hdr(skb)->source; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (inet->cmsg_flags) @@ -875,7 +942,7 @@ try_again: err = copied; if (flags & MSG_TRUNC) - err = skb->len - sizeof(struct udphdr); + err = ulen; out_free: skb_free_datagram(sk, skb); @@ -949,7 +1016,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) return 1; /* Now we can get the pointers */ - uh = skb->h.uh; + uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; @@ -959,7 +1026,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return 0; - } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) { + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else @@ -990,7 +1057,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) return 0; /* Now we can update and verify the packet length... */ - iph = skb->nh.iph; + iph = ip_hdr(skb); iphlen = iph->ihl << 2; iph->tot_len = htons(ntohs(iph->tot_len) - len); if (skb->len < iphlen + len) { @@ -1002,7 +1069,8 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) * transport header to point to ESP. Keep UDP on the stack * for later. */ - skb->h.raw = skb_pull(skb, len); + __skb_pull(skb, len); + skb_reset_transport_header(skb); /* modify the protocol (it's ESP!) */ iph->protocol = IPPROTO_ESP; @@ -1095,10 +1163,9 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (__udp_lib_checksum_complete(skb)) + if (sk->sk_filter) { + if (udp_lib_checksum_complete(skb)) goto drop; - skb->ip_summed = CHECKSUM_UNNECESSARY; } if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { @@ -1128,33 +1195,49 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) { - struct sock *sk; + struct sock *sk, *skw, *sknext; int dif; + int hport = ntohs(uh->dest); + unsigned int hash = hash_port_and_addr(hport, daddr); + unsigned int hashwild = hash_port_and_addr(hport, 0); - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + read_lock(&udp_hash_lock); + + sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); + skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); + + sk = udp_v4_mcast_next(sk, hash, hport, daddr, uh->source, saddr, dif); + if (!sk) { + hash = hashwild; + sk = udp_v4_mcast_next(skw, hash, hport, daddr, uh->source, + saddr, dif); + } + if (sk) { do { struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, - uh->source, saddr, dif); - if(sknext) + sknext = udp_v4_mcast_next(sk_next(sk), hash, hport, + daddr, uh->source, saddr, dif); + if (!sknext && hash != hashwild) { + hash = hashwild; + sknext = udp_v4_mcast_next(skw, hash, hport, + daddr, uh->source, saddr, dif); + } + if (sknext) skb1 = skb_clone(skb, GFP_ATOMIC); - if(skb1) { + if (skb1) { int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ + /* + * we should probably re-process + * instead of dropping packets here. + */ kfree_skb(skb1); } sk = sknext; - } while(sknext); + } while (sknext); } else kfree_skb(skb); read_unlock(&udp_hash_lock); @@ -1166,25 +1249,37 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, * Otherwise, csum completion requires chacksumming packet body, * including udp header and folding it to skb->csum. */ -static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) +static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, + int proto) { + const struct iphdr *iph; + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + iph = ip_hdr(skb); if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, - skb->len, IPPROTO_UDP, skb->csum )) + if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + proto, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb->len, IPPROTO_UDP, 0); + if (!skb_csum_unnecessary(skb)) + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len, proto, 0); /* Probably, we should checksum udp header (it should be in cache * in any case) and data in tiny packets (< rx copybreak). */ - /* UDP = UDP-Lite with a non-partial checksum coverage */ - UDP_SKB_CB(skb)->partial_cov = 0; + return 0; } /* @@ -1192,14 +1287,14 @@ static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) */ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], - int is_udplite) + int proto) { struct sock *sk; - struct udphdr *uh = skb->h.uh; + struct udphdr *uh = udp_hdr(skb); unsigned short ulen; struct rtable *rt = (struct rtable*)skb->dst; - __be32 saddr = skb->nh.iph->saddr; - __be32 daddr = skb->nh.iph->daddr; + __be32 saddr = ip_hdr(skb)->saddr; + __be32 daddr = ip_hdr(skb)->daddr; /* * Validate the packet. @@ -1211,24 +1306,21 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (ulen > skb->len) goto short_packet; - if(! is_udplite ) { /* UDP validates ulen. */ - + if (proto == IPPROTO_UDP) { + /* UDP validates ulen. */ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) goto short_packet; - uh = skb->h.uh; - - udp4_csum_init(skb, uh); - - } else { /* UDP-Lite validates cscov. */ - if (udplite4_csum_init(skb, uh)) - goto csum_error; + uh = udp_hdr(skb); } - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + if (udp4_csum_init(skb, uh, proto)) + goto csum_error; + + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest, - skb->dev->ifindex, udptable ); + skb->dev->ifindex, udptable); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); @@ -1250,7 +1342,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite); + UDP_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1258,11 +1350,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], * don't wanna listen. Ignore it. */ kfree_skb(skb); - return(0); + return 0; short_packet: LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", - is_udplite? "-Lite" : "", + proto == IPPROTO_UDPLITE ? "-Lite" : "", NIPQUAD(saddr), ntohs(uh->source), ulen, @@ -1277,21 +1369,21 @@ csum_error: * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", - is_udplite? "-Lite" : "", + proto == IPPROTO_UDPLITE ? "-Lite" : "", NIPQUAD(saddr), ntohs(uh->source), NIPQUAD(daddr), ntohs(uh->dest), ulen); drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); - return(0); + return 0; } -__inline__ int udp_rcv(struct sk_buff *skb) +int udp_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, udp_hash, 0); + return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); } int udp_destroy_sock(struct sock *sk) @@ -1313,13 +1405,13 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int val; int err = 0; - if(optlen<sizeof(int)) + if (optlen<sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) return -EFAULT; - switch(optname) { + switch (optname) { case UDP_CORK: if (val != 0) { up->corkflag = 1; @@ -1373,7 +1465,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, default: err = -ENOPROTOOPT; break; - }; + } return err; } @@ -1404,15 +1496,15 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, len; - if(get_user(len,optlen)) + if (get_user(len,optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); - if(len < 0) + if (len < 0) return -EINVAL; - switch(optname) { + switch (optname) { case UDP_CORK: val = up->corkflag; break; @@ -1433,11 +1525,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, default: return -ENOPROTOOPT; - }; + } - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval, &val,len)) + if (copy_to_user(optval, &val,len)) return -EFAULT; return 0; } @@ -1486,15 +1578,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sk_buff *skb; spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL) { - if (udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); - __skb_unlink(skb, rcvq); - kfree_skb(skb); - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } + while ((skb = skb_peek(rcvq)) != NULL && + udp_lib_checksum_complete(skb)) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); + __skb_unlink(skb, rcvq); + kfree_skb(skb); } spin_unlock_bh(&rcvq->lock); @@ -1573,7 +1661,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) struct sock *sk = udp_get_first(seq); if (sk) - while(pos && (sk = udp_get_next(seq, sk)) != NULL) + while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index b28fe1edf98..f34fd686a8f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -31,7 +31,7 @@ static int udplite_v4_get_port(struct sock *sk, unsigned short snum) static int udplite_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, udplite_hash, 1); + return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); } static void udplite_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 78e80deb7e8..5ceca951d73 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 switch (nexthdr) { case IPPROTO_IPIP: case IPPROTO_IPV6: - *spi = skb->nh.iph->saddr; + *spi = ip_hdr(skb)->saddr; *seq = 0; return 0; } @@ -39,9 +39,9 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 #ifdef CONFIG_NETFILTER static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; - if (skb->dst == NULL) { + const struct iphdr *iph = ip_hdr(skb); + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) goto drop; @@ -55,18 +55,18 @@ drop: int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) { - int err; __be32 spi, seq; struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH]; struct xfrm_state *x; int xfrm_nr = 0; int decaps = 0; + int err = xfrm4_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq); - if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0) + if (err != 0) goto drop; do { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); if (xfrm_nr == XFRM_MAX_DEPTH) goto drop; @@ -113,7 +113,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) break; } - if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0) + err = xfrm_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq); + if (err < 0) goto drop; } while (!err); @@ -146,15 +147,15 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) return 0; } else { #ifdef CONFIG_NETFILTER - __skb_push(skb, skb->data - skb->nh.raw); - skb->nh.iph->tot_len = htons(skb->len); - ip_send_check(skb->nh.iph); + __skb_push(skb, skb->data - skb_network_header(skb)); + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; #else - return -skb->nh.iph->protocol; + return -ip_hdr(skb)->protocol; #endif } diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 89cf59ea7bb..a73e710740c 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -29,32 +29,34 @@ */ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph, *top_iph = NULL; + struct iphdr *iph, *top_iph; int hdrlen, optlen; - iph = skb->nh.iph; - skb->h.ipiph = iph; + iph = ip_hdr(skb); + skb->transport_header = skb->network_header; hdrlen = 0; optlen = iph->ihl * 4 - sizeof(*iph); if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - skb->nh.raw = skb_push(skb, x->props.header_len + hdrlen); - top_iph = skb->nh.iph; - hdrlen = iph->ihl * 4 - optlen; - skb->h.raw += hdrlen; + skb_push(skb, x->props.header_len - IPV4_BEET_PHMAXLEN + hdrlen); + skb_reset_network_header(skb); + top_iph = ip_hdr(skb); + skb->transport_header += sizeof(*iph) - hdrlen; - memmove(top_iph, iph, hdrlen); + memmove(top_iph, iph, sizeof(*iph)); if (unlikely(optlen)) { struct ip_beet_phdr *ph; BUG_ON(optlen < 0); - ph = (struct ip_beet_phdr *)skb->h.raw; + ph = (struct ip_beet_phdr *)skb_transport_header(skb); ph->padlen = 4 - (optlen & 4); - ph->hdrlen = (optlen + ph->padlen + sizeof(*ph)) / 8; + ph->hdrlen = optlen / 8; ph->nexthdr = top_iph->protocol; + if (ph->padlen) + memset(ph + 1, IPOPT_NOP, ph->padlen); top_iph->protocol = IPPROTO_BEETPH; top_iph->ihl = sizeof(struct iphdr) / 4; @@ -68,46 +70,45 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); int phlen = 0; int optlen = 0; - __u8 ph_nexthdr = 0, protocol = 0; + u8 ph_nexthdr = 0; int err = -EINVAL; - protocol = iph->protocol; - if (unlikely(iph->protocol == IPPROTO_BEETPH)) { - struct ip_beet_phdr *ph = (struct ip_beet_phdr*)(iph + 1); + struct ip_beet_phdr *ph; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; + ph = (struct ip_beet_phdr *)(ipip_hdr(skb) + 1); - phlen = ph->hdrlen * 8; - optlen = phlen - ph->padlen - sizeof(*ph); + phlen = sizeof(*ph) + ph->padlen; + optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); if (optlen < 0 || optlen & 3 || optlen > 250) goto out; - if (!pskb_may_pull(skb, phlen)) + if (!pskb_may_pull(skb, phlen + optlen)) goto out; + skb->len -= phlen + optlen; ph_nexthdr = ph->nexthdr; } - skb_push(skb, sizeof(*iph) - phlen + optlen); - memmove(skb->data, skb->nh.raw, sizeof(*iph)); - skb->nh.raw = skb->data; + skb_set_network_header(skb, phlen - sizeof(*iph)); + memmove(skb_network_header(skb), iph, sizeof(*iph)); + skb_set_transport_header(skb, phlen + optlen); + skb->data = skb_transport_header(skb); - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->ihl = (sizeof(*iph) + optlen) / 4; - iph->tot_len = htons(skb->len); + iph->tot_len = htons(skb->len + iph->ihl * 4); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; if (ph_nexthdr) iph->protocol = ph_nexthdr; - else - iph->protocol = protocol; iph->check = 0; - iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; out: return err; diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 92676b7e403..601047161ea 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -23,16 +23,13 @@ */ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph; - int ihl; + struct iphdr *iph = ip_hdr(skb); + int ihl = iph->ihl * 4; - iph = skb->nh.iph; - skb->h.ipiph = iph; - - ihl = iph->ihl * 4; - skb->h.raw += ihl; - - skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + skb->transport_header = skb->network_header + ihl; + skb_push(skb, x->props.header_len); + skb_reset_network_header(skb); + memmove(skb_network_header(skb), iph, ihl); return 0; } @@ -46,12 +43,15 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { - int ihl = skb->data - skb->h.raw; + int ihl = skb->data - skb_transport_header(skb); - if (skb->h.raw != skb->nh.raw) - skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); - skb->nh.iph->tot_len = htons(skb->len + ihl); - skb->h.raw = skb->data; + if (skb->transport_header != skb->network_header) { + memmove(skb_transport_header(skb), + skb_network_header(skb), ihl); + skb->network_header = skb->transport_header; + } + ip_hdr(skb)->tot_len = htons(skb->len + ihl); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ceb4376f572..a2f2e6a5ec5 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -16,8 +16,8 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { - struct iphdr *outer_iph = skb->nh.iph; - struct iphdr *inner_iph = skb->h.ipiph; + struct iphdr *outer_iph = ip_hdr(skb); + struct iphdr *inner_iph = ipip_hdr(skb); if (INET_ECN_is_ce(outer_iph->tos)) IP_ECN_set_ce(inner_iph); @@ -26,7 +26,7 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(skb->nh.ipv6h); + IP6_ECN_set_ce(ipv6_hdr(skb)); } /* Add encapsulation header. @@ -46,11 +46,12 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph, *top_iph; int flags; - iph = skb->nh.iph; - skb->h.ipiph = iph; + iph = ip_hdr(skb); + skb->transport_header = skb->network_header; - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; + skb_push(skb, x->props.header_len); + skb_reset_network_header(skb); + top_iph = ip_hdr(skb); top_iph->ihl = 5; top_iph->version = 4; @@ -90,10 +91,11 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); + const unsigned char *old_mac; int err = -EINVAL; - switch(iph->protocol){ + switch (iph->protocol){ case IPPROTO_IPIP: break; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -111,10 +113,10 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (iph->protocol == IPPROTO_IPIP) { if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); + ipv4_copy_dscp(iph, ipip_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); } @@ -125,9 +127,10 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); } #endif - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; + old_mac = skb_mac_header(skb); + skb_set_mac_header(skb, -skb->mac_len); + memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_reset_network_header(skb); err = 0; out: diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 038ca160fe2..44ef208a75c 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -22,14 +22,13 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst; - struct iphdr *iph = skb->nh.iph; if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - if (!(iph->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; dst = skb->dst; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 5d51a2af34c..4ff8ed30024 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -119,7 +119,7 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int if (xfrm[i]->props.mode == XFRM_MODE_TUNNEL) { unsigned short encap_family = xfrm[i]->props.family; - switch(encap_family) { + switch (encap_family) { case AF_INET: fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4; fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4; @@ -209,8 +209,8 @@ error: static void _decode_session4(struct sk_buff *skb, struct flowi *fl) { - struct iphdr *iph = skb->nh.iph; - u8 *xprth = skb->nh.raw + iph->ihl*4; + struct iphdr *iph = ip_hdr(skb); + u8 *xprth = skb_network_header(skb) + iph->ihl * 4; memset(fl, 0, sizeof(struct flowi)); if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { @@ -263,7 +263,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) default: fl->fl_ipsec_spi = 0; break; - }; + } } fl->proto = iph->protocol; fl->fl4_dst = iph->daddr; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 3eef06454da..56851030455 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -12,9 +12,8 @@ static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph; + struct iphdr *iph = ip_hdr(skb); - iph = skb->nh.iph; iph->tot_len = htons(skb->len); ip_send_check(iph); |