diff options
Diffstat (limited to 'net/ipv4')
31 files changed, 528 insertions, 361 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 566ea6c4321..6c30a73f03f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -124,7 +124,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); struct ipv4_config ipv4_config; - EXPORT_SYMBOL(ipv4_config); /* New destruction routine */ @@ -139,12 +138,12 @@ void inet_sock_destruct(struct sock *sk) sk_mem_reclaim(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { - printk("Attempt to release TCP socket in state %d %p\n", + pr_err("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); return; } if (!sock_flag(sk, SOCK_DEAD)) { - printk("Attempt to release alive inet socket %p\n", sk); + pr_err("Attempt to release alive inet socket %p\n", sk); return; } @@ -157,6 +156,7 @@ void inet_sock_destruct(struct sock *sk) dst_release(sk->sk_dst_cache); sk_refcnt_debug_dec(sk); } +EXPORT_SYMBOL(inet_sock_destruct); /* * The routines beyond this point handle the behaviour of an AF_INET @@ -219,6 +219,7 @@ out: release_sock(sk); return err; } +EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); @@ -435,9 +436,11 @@ int inet_release(struct socket *sock) } return 0; } +EXPORT_SYMBOL(inet_release); /* It is off by default, see below. */ int sysctl_ip_nonlocal_bind __read_mostly; +EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { @@ -519,6 +522,7 @@ out_release_sock: out: return err; } +EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, int addr_len, int flags) @@ -532,6 +536,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, return -EAGAIN; return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } +EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo) { @@ -641,6 +646,7 @@ sock_error: sock->state = SS_DISCONNECTING; goto out; } +EXPORT_SYMBOL(inet_stream_connect); /* * Accept a pending connection. The TCP layer now gives BSD semantics. @@ -668,6 +674,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) do_err: return err; } +EXPORT_SYMBOL(inet_accept); /* @@ -699,6 +706,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, *uaddr_len = sizeof(*sin); return 0; } +EXPORT_SYMBOL(inet_getname); int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) @@ -711,9 +719,11 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, return sk->sk_prot->sendmsg(iocb, sk, msg, size); } +EXPORT_SYMBOL(inet_sendmsg); -static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) { struct sock *sk = sock->sk; @@ -780,6 +790,7 @@ int inet_shutdown(struct socket *sock, int how) release_sock(sk); return err; } +EXPORT_SYMBOL(inet_shutdown); /* * ioctl() calls you can issue on an INET socket. Most of these are @@ -798,44 +809,45 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct net *net = sock_net(sk); switch (cmd) { - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *)arg); - break; - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *)arg); - break; - case SIOCADDRT: - case SIOCDELRT: - case SIOCRTMSG: - err = ip_rt_ioctl(net, cmd, (void __user *)arg); - break; - case SIOCDARP: - case SIOCGARP: - case SIOCSARP: - err = arp_ioctl(net, cmd, (void __user *)arg); - break; - case SIOCGIFADDR: - case SIOCSIFADDR: - case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - case SIOCGIFDSTADDR: - case SIOCSIFDSTADDR: - case SIOCSIFPFLAGS: - case SIOCGIFPFLAGS: - case SIOCSIFFLAGS: - err = devinet_ioctl(net, cmd, (void __user *)arg); - break; - default: - if (sk->sk_prot->ioctl) - err = sk->sk_prot->ioctl(sk, cmd, arg); - else - err = -ENOIOCTLCMD; - break; + case SIOCGSTAMP: + err = sock_get_timestamp(sk, (struct timeval __user *)arg); + break; + case SIOCGSTAMPNS: + err = sock_get_timestampns(sk, (struct timespec __user *)arg); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCRTMSG: + err = ip_rt_ioctl(net, cmd, (void __user *)arg); + break; + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + err = arp_ioctl(net, cmd, (void __user *)arg); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: + err = devinet_ioctl(net, cmd, (void __user *)arg); + break; + default: + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; + break; } return err; } +EXPORT_SYMBOL(inet_ioctl); const struct proto_ops inet_stream_ops = { .family = PF_INET, @@ -862,6 +874,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, #endif }; +EXPORT_SYMBOL(inet_stream_ops); const struct proto_ops inet_dgram_ops = { .family = PF_INET, @@ -887,6 +900,7 @@ const struct proto_ops inet_dgram_ops = { .compat_getsockopt = compat_sock_common_getsockopt, #endif }; +EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without @@ -1016,6 +1030,7 @@ out_illegal: p->type); goto out; } +EXPORT_SYMBOL(inet_register_protosw); void inet_unregister_protosw(struct inet_protosw *p) { @@ -1031,6 +1046,7 @@ void inet_unregister_protosw(struct inet_protosw *p) synchronize_net(); } } +EXPORT_SYMBOL(inet_unregister_protosw); /* * Shall we try to damage output packets if routing dev changes? @@ -1141,7 +1157,6 @@ int inet_sk_rebuild_header(struct sock *sk) return err; } - EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) @@ -1187,6 +1202,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) int proto; int ihl; int id; + unsigned int offset = 0; if (!(features & NETIF_F_V4_CSUM)) features &= ~NETIF_F_SG; @@ -1229,7 +1245,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) skb = segs; do { iph = ip_hdr(skb); - iph->id = htons(id++); + if (proto == IPPROTO_UDP) { + iph->id = htons(id); + iph->frag_off = htons(offset >> 3); + if (skb->next != NULL) + iph->frag_off |= htons(IP_MF); + offset += (skb->len - skb->mac_len - iph->ihl * 4); + } else + iph->id = htons(id++); iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); @@ -1361,7 +1384,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } return rc; } - EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void *mib[], int offt) @@ -1425,6 +1447,8 @@ static struct net_protocol tcp_protocol = { static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, + .gso_send_check = udp4_ufo_send_check, + .gso_segment = udp4_ufo_fragment, .no_policy = 1, .netns_ok = 1, }; @@ -1666,19 +1690,3 @@ static int __init ipv4_proc_init(void) MODULE_ALIAS_NETPROTO(PF_INET); -EXPORT_SYMBOL(inet_accept); -EXPORT_SYMBOL(inet_bind); -EXPORT_SYMBOL(inet_dgram_connect); -EXPORT_SYMBOL(inet_dgram_ops); -EXPORT_SYMBOL(inet_getname); -EXPORT_SYMBOL(inet_ioctl); -EXPORT_SYMBOL(inet_listen); -EXPORT_SYMBOL(inet_register_protosw); -EXPORT_SYMBOL(inet_release); -EXPORT_SYMBOL(inet_sendmsg); -EXPORT_SYMBOL(inet_shutdown); -EXPORT_SYMBOL(inet_sock_destruct); -EXPORT_SYMBOL(inet_stream_connect); -EXPORT_SYMBOL(inet_stream_ops); -EXPORT_SYMBOL(inet_unregister_protosw); -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 090e9991ac2..4e80f336c0c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -130,7 +130,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); -static struct neigh_ops arp_generic_ops = { +static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -140,7 +140,7 @@ static struct neigh_ops arp_generic_ops = { .queue_xmit = dev_queue_xmit, }; -static struct neigh_ops arp_hh_ops = { +static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -150,7 +150,7 @@ static struct neigh_ops arp_hh_ops = { .queue_xmit = dev_queue_xmit, }; -static struct neigh_ops arp_direct_ops = { +static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = dev_queue_xmit, .connected_output = dev_queue_xmit, @@ -158,7 +158,7 @@ static struct neigh_ops arp_direct_ops = { .queue_xmit = dev_queue_xmit, }; -struct neigh_ops arp_broken_ops = { +const struct neigh_ops arp_broken_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 63c2fa7b68c..291bdf50a21 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -48,7 +48,7 @@ * Patrick McHardy <kaber@trash.net> */ -#define VERSION "0.408" +#define VERSION "0.409" #include <asm/uaccess.h> #include <asm/system.h> @@ -164,6 +164,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ static struct tnode *tnode_free_head; +static size_t tnode_free_size; + +/* + * synchronize_rcu after call_rcu for that many pages; it should be especially + * useful before resizing the root node with PREEMPT_NONE configs; the value was + * obtained experimentally, aiming to avoid visible slowdown. + */ +static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; @@ -317,8 +325,7 @@ static inline void check_tnode(const struct tnode *tn) static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; -static const int inflate_threshold_root = 25; - +static const int inflate_threshold_root = 30; static void __alias_free_mem(struct rcu_head *head) { @@ -393,6 +400,8 @@ static void tnode_free_safe(struct tnode *tn) BUG_ON(IS_LEAF(tn)); tn->tnode_free = tnode_free_head; tnode_free_head = tn; + tnode_free_size += sizeof(struct tnode) + + (sizeof(struct node *) << tn->bits); } static void tnode_free_flush(void) @@ -404,6 +413,11 @@ static void tnode_free_flush(void) tn->tnode_free = NULL; tnode_free(tn); } + + if (tnode_free_size >= PAGE_SIZE * sync_pages) { + tnode_free_size = 0; + synchronize_rcu(); + } } static struct leaf *leaf_new(void) @@ -499,14 +513,14 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, rcu_assign_pointer(tn->child[i], n); } +#define MAX_WORK 10 static struct node *resize(struct trie *t, struct tnode *tn) { int i; - int err = 0; struct tnode *old_tn; int inflate_threshold_use; int halve_threshold_use; - int max_resize; + int max_work; if (!tn) return NULL; @@ -521,18 +535,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) } /* One child */ if (tn->empty_children == tnode_child_length(tn) - 1) - for (i = 0; i < tnode_child_length(tn); i++) { - struct node *n; - - n = tn->child[i]; - if (!n) - continue; - - /* compress one level */ - node_set_parent(n, NULL); - tnode_free_safe(tn); - return n; - } + goto one_child; /* * Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. @@ -601,14 +604,17 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!tn->parent) + if (!node_parent((struct node*) tn)) { inflate_threshold_use = inflate_threshold_root; - else + halve_threshold_use = halve_threshold_root; + } + else { inflate_threshold_use = inflate_threshold; + halve_threshold_use = halve_threshold; + } - err = 0; - max_resize = 10; - while ((tn->full_children > 0 && max_resize-- && + max_work = MAX_WORK; + while ((tn->full_children > 0 && max_work-- && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold_use * tnode_child_length(tn))) { @@ -625,35 +631,19 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } - if (max_resize < 0) { - if (!tn->parent) - pr_warning("Fix inflate_threshold_root." - " Now=%d size=%d bits\n", - inflate_threshold_root, tn->bits); - else - pr_warning("Fix inflate_threshold." - " Now=%d size=%d bits\n", - inflate_threshold, tn->bits); - } - check_tnode(tn); + /* Return if at least one inflate is run */ + if( max_work != MAX_WORK) + return (struct node *) tn; + /* * Halve as long as the number of empty children in this * node is above threshold. */ - - /* Keep root node larger */ - - if (!tn->parent) - halve_threshold_use = halve_threshold_root; - else - halve_threshold_use = halve_threshold; - - err = 0; - max_resize = 10; - while (tn->bits > 1 && max_resize-- && + max_work = MAX_WORK; + while (tn->bits > 1 && max_work-- && 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold_use * tnode_child_length(tn)) { @@ -668,19 +658,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } - if (max_resize < 0) { - if (!tn->parent) - pr_warning("Fix halve_threshold_root." - " Now=%d size=%d bits\n", - halve_threshold_root, tn->bits); - else - pr_warning("Fix halve_threshold." - " Now=%d size=%d bits\n", - halve_threshold, tn->bits); - } /* Only one child remains */ - if (tn->empty_children == tnode_child_length(tn) - 1) + if (tn->empty_children == tnode_child_length(tn) - 1) { +one_child: for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; @@ -694,7 +675,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) tnode_free_safe(tn); return n; } - + } return (struct node *) tn; } @@ -1435,7 +1416,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), pos, bits); - n = tnode_get_child(pn, cindex); + n = tnode_get_child_rcu(pn, cindex); if (n == NULL) { #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -1570,7 +1551,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct node *) pn); if (!parent) goto failed; @@ -1783,7 +1764,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { struct node *c = (struct node *) l; - struct tnode *p = node_parent(c); + struct tnode *p = node_parent_rcu(c); if (!p) return NULL; /* trie with just one leaf */ @@ -2391,7 +2372,7 @@ static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) } } -static const char *rtn_type_names[__RTN_MAX] = { +static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 61283f92882..13f0781f35c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -218,8 +218,8 @@ void inet_twdr_hangman(unsigned long data) /* We purged the entire slot, anything left? */ if (twdr->tw_count) need_timer = 1; + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); } - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); if (need_timer) mod_timer(&twdr->tw_timer, jiffies + twdr->period); out: diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 82c11dd10a6..533afaadefd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -662,7 +662,7 @@ drop_nolock: return(0); } -static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->dev->stats; @@ -821,7 +821,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -889,7 +889,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPTUNNEL_XMIT(); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); @@ -898,7 +898,7 @@ tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } static int ipgre_tunnel_bind_dev(struct net_device *dev) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7ffcd96fe59..9fe5d7b8158 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1304,7 +1304,7 @@ int ip_push_pending_frames(struct sock *sk) err = ip_local_out(skb); if (err) { if (err > 0) - err = inet->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 93e2b787da2..62548cb0923 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -387,7 +387,7 @@ static int ipip_rcv(struct sk_buff *skb) * and that skb is filled properly by that function. */ -static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->dev->stats; @@ -486,7 +486,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -524,7 +524,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPTUNNEL_XMIT(); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); @@ -532,7 +532,7 @@ tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } static void ipip_tunnel_bind_dev(struct net_device *dev) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9a8da5ed92b..65d421cf5bc 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -201,7 +201,7 @@ failure: #ifdef CONFIG_IP_PIMSM -static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); @@ -212,7 +212,7 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } static const struct net_device_ops reg_vif_netdev_ops = { diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 7505dff4ffd..27774c99d88 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -8,7 +8,7 @@ * Copyright (C) 2002 David S. Miller (davem@redhat.com) * */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> @@ -341,15 +341,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline int unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_arp *arp) { - unsigned int i; + static const struct arpt_arp uncond; - for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) - if (((__u32 *)arp)[i]) - return 0; - - return 1; + return memcmp(arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if @@ -537,12 +533,28 @@ out: return ret; } +static bool check_underflow(struct arpt_entry *e) +{ + const struct arpt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->arp)) + return false; + t = arpt_get_target(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct arpt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + static inline int check_entry_size_and_hooks(struct arpt_entry *e, struct xt_table_info *newinfo, unsigned char *base, unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, + unsigned int valid_hooks, unsigned int *i) { unsigned int h; @@ -562,15 +574,21 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; - if ((unsigned char *)e - base == underflows[h]) + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } newinfo->underflow[h] = underflows[h]; + } } - /* FIXME: underflows must be unconditional, standard verdicts - < 0 (not ARPT_RETURN). --RR */ - /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; @@ -630,7 +648,7 @@ static int translate_table(const char *name, newinfo, entry0, entry0 + size, - hook_entries, underflows, &i); + hook_entries, underflows, valid_hooks, &i); duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; @@ -1760,7 +1778,8 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct xt_table *arpt_register_table(struct net *net, struct xt_table *table, +struct xt_table *arpt_register_table(struct net *net, + const struct xt_table *table, const struct arpt_replace *repl) { int ret; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6ecfdae7c58..97337601827 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -15,7 +15,7 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) -static struct +static const struct { struct arpt_replace repl; struct arpt_standard entries[3]; @@ -45,7 +45,7 @@ static struct .term = ARPT_ERROR_INIT, }; -static struct xt_table packet_filter = { +static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fdefae6b5df..cde755d5eea 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -8,6 +8,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/capability.h> #include <linux/skbuff.h> @@ -190,16 +191,11 @@ get_entry(void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline int -unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_ip *ip) { - unsigned int i; - - for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++) - if (((__u32 *)ip)[i]) - return 0; + static const struct ipt_ip uncond; - return 1; + return memcmp(ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -315,7 +311,6 @@ ipt_do_table(struct sk_buff *skb, static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; - u_int16_t datalen; bool hotdrop = false; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; @@ -328,7 +323,6 @@ ipt_do_table(struct sk_buff *skb, /* Initialization */ ip = ip_hdr(skb); - datalen = skb->len - ip->ihl * 4; indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; /* We handle fragments by dealing with the first fragment as @@ -427,8 +421,6 @@ ipt_do_table(struct sk_buff *skb, #endif /* Target might have changed stuff. */ ip = ip_hdr(skb); - datalen = skb->len - ip->ihl * 4; - if (verdict == IPT_CONTINUE) e = ipt_next_entry(e); else @@ -716,6 +708,21 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, return ret; } +static bool check_underflow(struct ipt_entry *e) +{ + const struct ipt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->ip)) + return false; + t = ipt_get_target(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct ipt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, @@ -723,6 +730,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, + unsigned int valid_hooks, unsigned int *i) { unsigned int h; @@ -742,15 +750,21 @@ check_entry_size_and_hooks(struct ipt_entry *e, /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; - if ((unsigned char *)e - base == underflows[h]) + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } newinfo->underflow[h] = underflows[h]; + } } - /* FIXME: underflows must be unconditional, standard verdicts - < 0 (not IPT_RETURN). --RR */ - /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; @@ -813,7 +827,7 @@ translate_table(const char *name, newinfo, entry0, entry0 + size, - hook_entries, underflows, &i); + hook_entries, underflows, valid_hooks, &i); if (ret != 0) return ret; @@ -2051,7 +2065,8 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ipt_register_table(struct net *net, struct xt_table *table, +struct xt_table *ipt_register_table(struct net *net, + const struct xt_table *table, const struct ipt_replace *repl) { int ret; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index c30a969724f..df566cbd68e 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -53,11 +53,11 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table packet_filter = { +static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, }; /* The work comes in here from netfilter.c. */ @@ -102,21 +102,21 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_local_in_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_FILTER, }, { .hook = ipt_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_FILTER, }, { .hook = ipt_local_out_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_FILTER, }, diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 4087614d951..036047f9b0f 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -28,7 +28,7 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_POST_ROUTING)) /* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ -static struct +static const struct { struct ipt_replace repl; struct ipt_standard entries[5]; @@ -64,11 +64,11 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table packet_mangler = { +static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, }; /* The work comes in here from netfilter.c. */ @@ -162,35 +162,35 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_pre_routing_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_local_in_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_forward_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_local_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_post_routing_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_MANGLE, }, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index e5356da1fb5..993edc23be0 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -9,7 +9,7 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static struct +static const struct { struct ipt_replace repl; struct ipt_standard entries[2]; @@ -36,11 +36,11 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table packet_raw = { +static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, }; /* The work comes in here from netfilter.c. */ @@ -74,14 +74,14 @@ ipt_local_hook(unsigned int hook, static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_hook, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_RAW, .owner = THIS_MODULE, }, { .hook = ipt_local_hook, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_RAW, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 29ab630f240..99eb76c65d2 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -27,7 +27,7 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static struct +static const struct { struct ipt_replace repl; struct ipt_standard entries[3]; @@ -57,11 +57,11 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table security_table = { +static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, }; static unsigned int @@ -105,21 +105,21 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_local_in_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_SECURITY, }, { .hook = ipt_forward_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_SECURITY, }, { .hook = ipt_local_out_hook, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_SECURITY, }, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 7d2ead7228a..aa95bb82ee6 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -26,6 +26,7 @@ #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> +#include <net/netfilter/nf_log.h> int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, struct nf_conn *ct, @@ -113,8 +114,11 @@ static unsigned int ipv4_confirm(unsigned int hooknum, ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), ct, ctinfo); - if (ret != NF_ACCEPT) + if (ret != NF_ACCEPT) { + nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, + "nf_ct_%s: dropping packet", helper->name); return ret; + } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { typeof(nf_nat_seq_adjust_hook) seq_adjust; @@ -158,28 +162,28 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_in, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, @@ -256,11 +260,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) tuple.dst.u3.ip = inet->daddr; tuple.dst.u.tcp.port = inet->dport; tuple.src.l3num = PF_INET; - tuple.dst.protonum = IPPROTO_TCP; + tuple.dst.protonum = sk->sk_protocol; - /* We only do TCP at the moment: is there a better way? */ - if (strcmp(sk->sk_prot->name, "TCP")) { - pr_debug("SO_ORIGINAL_DST: Not a TCP socket\n"); + /* We only do TCP and SCTP at the moment: is there a better way? */ + if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); return -ENOPROTOOPT; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 3229e0a81ba..68afc6ecd34 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -212,7 +212,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, range->flags & IP_NAT_RANGE_PERSISTENT ? - (__force u32)tuple->dst.u3.ip : 0, 0); + 0 : (__force u32)tuple->dst.u3.ip, 0); j = ((u64)j * (maxip - minip + 1)) >> 32; *var_ipp = htonl(minip + j); } @@ -620,7 +620,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { }; static int -nfnetlink_parse_nat(struct nlattr *nat, +nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range *range) { struct nlattr *tb[CTA_NAT_MAX+1]; @@ -656,7 +656,7 @@ nfnetlink_parse_nat(struct nlattr *nat, static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, - struct nlattr *attr) + const struct nlattr *attr) { struct nf_nat_range range; @@ -671,7 +671,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, - struct nlattr *attr) + const struct nlattr *attr) { return -EOPNOTSUPP; } diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 6348a793936..9e81e0dfb4e 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -28,7 +28,7 @@ (1 << NF_INET_POST_ROUTING) | \ (1 << NF_INET_LOCAL_OUT)) -static struct +static const struct { struct ipt_replace repl; struct ipt_standard entries[3]; @@ -58,11 +58,11 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table nat_table = { +static const struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, .me = THIS_MODULE, - .af = AF_INET, + .af = NFPROTO_IPV4, }; /* Source NAT */ diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 5567bd0d075..5f41d017ddd 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -251,7 +251,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_in, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, @@ -259,7 +259,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_out, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, @@ -267,7 +267,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_local_fn, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, @@ -275,7 +275,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { { .hook = nf_nat_fn, .owner = THIS_MODULE, - .pf = PF_INET, + .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index ea50da0649f..a2e5fc0a15e 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -22,26 +22,11 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - -#include <asm/uaccess.h> -#include <asm/system.h> +#include <linux/cache.h> #include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/timer.h> -#include <net/ip.h> +#include <linux/spinlock.h> #include <net/protocol.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/ipip.h> -#include <linux/igmp.h> struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; static DEFINE_SPINLOCK(inet_proto_lock); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2979f14bb18..ebb1e5848bc 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) - err = inet->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; out: @@ -386,6 +386,8 @@ error_fault: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); + if (err == -ENOBUFS && !inet->recverr) + err = 0; return err; } @@ -576,8 +578,11 @@ back_from_confirm: &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); - else if (!(msg->msg_flags & MSG_MORE)) + else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk); + if (err == -ENOBUFS && !inet->recverr) + err = 0; + } release_sock(sk); } done: diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 278f46f5011..91867d3e632 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1514,13 +1514,17 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); - struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + struct in_device *in_dev; + int log_martians; - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->u.dst.dev); + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { + rcu_read_unlock(); return; - - if (!IN_DEV_TX_REDIRECTS(in_dev)) - goto out; + } + log_martians = IN_DEV_LOG_MARTIANS(in_dev); + rcu_read_unlock(); /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. @@ -1533,7 +1537,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { rt->u.dst.rate_last = jiffies; - goto out; + return; } /* Check for load limit; set rate_last to the latest sent @@ -1547,7 +1551,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) rt->u.dst.rate_last = jiffies; ++rt->u.dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && + if (log_martians && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", @@ -1555,8 +1559,6 @@ void ip_rt_send_redirect(struct sk_buff *skb) &rt->rt_dst, &rt->rt_gateway); #endif } -out: - in_dev_put(in_dev); } static int ip_error(struct sk_buff *skb) @@ -3442,7 +3444,7 @@ int __init ip_rt_init(void) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); - xfrm4_init(); + xfrm4_init(ip_rt_max_size); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 91145244ea6..edeea060db4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1839,7 +1839,7 @@ void tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_KERNEL); + tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2336,13 +2336,13 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: - val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; + val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: - val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; + val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: - val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; + val = keepalive_probes(tp); break; case TCP_SYNCNT: val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; @@ -2658,7 +2658,7 @@ void tcp_free_md5sig_pool(void) EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) +static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk) { int cpu; struct tcp_md5sig_pool **pool; @@ -2671,7 +2671,7 @@ static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) struct tcp_md5sig_pool *p; struct crypto_hash *hash; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), sk->sk_allocation); if (!p) goto out_free; *per_cpu_ptr(pool, cpu) = p; @@ -2688,7 +2688,7 @@ out_free: return NULL; } -struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) +struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk) { struct tcp_md5sig_pool **pool; int alloc = 0; @@ -2709,7 +2709,7 @@ retry: if (alloc) { /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); + struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk); spin_lock_bh(&tcp_md5sig_pool_lock); if (!p) { tcp_md5sig_users--; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index e92beb9e55e..6428b342b16 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -116,7 +116,7 @@ int tcp_set_default_congestion_control(const char *name) spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); #ifdef CONFIG_MODULES - if (!ca && capable(CAP_SYS_MODULE)) { + if (!ca && capable(CAP_NET_ADMIN)) { spin_unlock(&tcp_cong_list_lock); request_module("tcp_%s", name); @@ -246,7 +246,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) #ifdef CONFIG_MODULES /* not found attempt to autoload module */ - if (!ca && capable(CAP_SYS_MODULE)) { + if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2bdb0da237e..af6d6fa00db 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -685,7 +685,7 @@ static inline void tcp_set_rto(struct sock *sk) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -696,8 +696,7 @@ static inline void tcp_set_rto(struct sock *sk) /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + tcp_bound_rto(sk); } /* Save metrics learned by this TCP session. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6d88219c5e2..0543561da99 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -328,26 +328,29 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) * */ -void tcp_v4_err(struct sk_buff *skb, u32 info) +void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { - struct iphdr *iph = (struct iphdr *)skb->data; - struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); + struct iphdr *iph = (struct iphdr *)icmp_skb->data; + struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; + const int type = icmp_hdr(icmp_skb)->type; + const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; + struct sk_buff *skb; __u32 seq; + __u32 remaining; int err; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(icmp_skb->dev); - if (skb->len < (iph->ihl << 2) + 8) { + if (icmp_skb->len < (iph->ihl << 2) + 8) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, - iph->saddr, th->source, inet_iif(skb)); + iph->saddr, th->source, inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; @@ -367,6 +370,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) if (sk->sk_state == TCP_CLOSE) goto out; + icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && @@ -393,6 +397,39 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) } err = icmp_err_convert[code].errno; + /* check if icmp_skb allows revert of backoff + * (see draft-zimmermann-tcp-lcd) */ + if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) + break; + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + break; + + icsk->icsk_backoff--; + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << + icsk->icsk_backoff; + tcp_bound_rto(sk); + + skb = tcp_write_queue_head(sk); + BUG_ON(!skb); + + remaining = icsk->icsk_rto - min(icsk->icsk_rto, + tcp_time_stamp - TCP_SKB_CB(skb)->when); + + if (remaining) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else if (sock_owned_by_user(sk)) { + /* RTO revert clocked out retransmission, + * but socket is locked. Will defer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + HZ/20, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now */ + tcp_retransmit_timer(sk); + } + break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; @@ -849,7 +886,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, } sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } - if (tcp_alloc_md5sig_pool() == NULL) { + if (tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } @@ -970,8 +1007,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (!tcp_sk(sk)->md5sig_info) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL); + struct tcp_md5sig_info *p; + p = kzalloc(sizeof(*p), sk->sk_allocation); if (!p) return -EINVAL; @@ -979,7 +1017,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } - newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); if (!newkey) return -ENOMEM; return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, @@ -1158,7 +1196,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, }; @@ -1717,7 +1755,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct inet_connection_sock_af_ops ipv4_specific = { +const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1737,7 +1775,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_sock_af_ops tcp_sock_ipv4_specific = { +static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_add = tcp_v4_md5_add_func, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f8d67ccc64f..e48c37d74d7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -322,7 +322,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (key != NULL) { memcpy(&tcptw->tw_md5_key, key->key, key->keylen); tcptw->tw_md5_keylen = key->keylen; - if (tcp_alloc_md5sig_pool() == NULL) + if (tcp_alloc_md5sig_pool(sk) == NULL) BUG(); } } while (0); @@ -657,29 +657,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; -#ifdef CONFIG_TCP_MD5SIG - else { - /* Copy over the MD5 key from the original socket */ - struct tcp_md5sig_key *key; - struct tcp_sock *tp = tcp_sk(sk); - key = tp->af_specific->md5_lookup(sk, child); - if (key != NULL) { - /* - * We're using one, so create a matching key on the - * newsk structure. If we fail to get memory then we - * end up not copying the key across. Shucks. - */ - char *newkey = kmemdup(key->key, key->keylen, - GFP_ATOMIC); - if (newkey) { - if (!tcp_alloc_md5sig_pool()) - BUG(); - tp->af_specific->md5_add(child, child, newkey, - key->keylen); - } - } - } -#endif inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bd62712848f..5200aab0ca9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,7 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +/* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -142,6 +143,7 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } +/* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { @@ -161,6 +163,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } +/* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); @@ -276,6 +279,7 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } +/* Packet ECN state for a SYN-ACK */ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; @@ -283,6 +287,7 @@ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; } +/* Packet ECN state for a SYN. */ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -301,6 +306,9 @@ TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) th->ece = 1; } +/* Set up ECN state for a packet on a ESTABLISHED socket that is about to + * be sent. + */ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { @@ -362,7 +370,9 @@ struct tcp_out_options { __u32 tsval, tsecr; /* need to include OPTION_TS */ }; -/* Beware: Something in the Internet is very sensitive to the ordering of +/* Write previously computed TCP options to the packet. + * + * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operatibility perspective it seems that we're somewhat stuck with @@ -445,6 +455,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } } +/* Compute TCP options for SYN packets. This is not the final + * network wire format yet. + */ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { @@ -493,6 +506,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, return size; } +/* Set up TCP options for SYN-ACKs. */ static unsigned tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned mss, struct sk_buff *skb, @@ -541,6 +555,9 @@ static unsigned tcp_synack_options(struct sock *sk, return size; } +/* Compute TCP options for ESTABLISHED sockets. This is not the + * final wire format yet. + */ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { @@ -705,7 +722,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return net_xmit_eval(err); } -/* This routine just queue's the buffer +/* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. @@ -722,6 +739,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); } +/* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { @@ -909,6 +927,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) skb->len = skb->data_len; } +/* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -937,7 +956,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Not accounting for SACKs here. */ +/* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { struct tcp_sock *tp = tcp_sk(sk); @@ -981,6 +1000,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) return mtu; } +/* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1143,7 +1163,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, return 0; } -/* This must be invoked the first time we consider transmitting +/* Intialize TSO state of a skb. + * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, @@ -1158,6 +1179,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, return tso_segs; } +/* Minshall's variant of the Nagle send check. */ static inline int tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && @@ -1242,6 +1264,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return cwnd_quota; } +/* Test if sending is allowed right now. */ int tcp_may_send_now(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1378,6 +1401,10 @@ send_now: } /* Create a new MTU probe if we are ready. + * MTU probe is regularly attempting to increase the path MTU by + * deliberately sending larger packets. This discovers routing + * changes resulting in larger path MTUs. + * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise @@ -1790,6 +1817,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) sk_wmem_free_skb(sk, next_skb); } +/* Check if coalescing SKBs is legal. */ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) @@ -1808,6 +1836,9 @@ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) return 1; } +/* Collapse packets in the retransmit queue to make to create + * less packets on the wire. This is only done on retransmission. + */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { @@ -1957,6 +1988,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } +/* Check if we forward retransmits are possible in the current + * window/congestion state. + */ static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2101,7 +2135,8 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk->sk_allocation); if (skb) break; yield(); @@ -2145,7 +2180,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } -/* WARNING: This routine must only be called when we have already sent +/* Send a crossed SYN-ACK during socket establishment. + * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. @@ -2180,9 +2216,7 @@ int tcp_send_synack(struct sock *sk) return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -/* - * Prepare a SYN-ACK. - */ +/* Prepare a SYN-ACK. */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req) { @@ -2269,9 +2303,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, return skb; } -/* - * Do all connect socket setups that can be done AF independent. - */ +/* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); @@ -2330,9 +2362,7 @@ static void tcp_connect_init(struct sock *sk) tcp_clear_retrans(tp); } -/* - * Build a SYN and send it off. - */ +/* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2359,7 +2389,7 @@ int tcp_connect(struct sock *sk) sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. @@ -2493,6 +2523,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } +/* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b144a26359b..cdb2ca7684d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -137,13 +137,14 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; + bool do_reset; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) dst_negative_advice(&sk->sk_dst_cache); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -155,13 +156,15 @@ static int tcp_write_timeout(struct sock *sk) const int alive = (icsk->icsk_rto < TCP_RTO_MAX); retry_until = tcp_orphan_retries(sk, alive); + do_reset = alive || + !retransmits_timed_out(sk, retry_until); - if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) + if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (icsk->icsk_retransmits >= retry_until) { + if (retransmits_timed_out(sk, retry_until)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -279,7 +282,7 @@ static void tcp_probe_timer(struct sock *sk) * The TCP retransmit timer. */ -static void tcp_retransmit_timer(struct sock *sk) +void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -385,7 +388,7 @@ static void tcp_retransmit_timer(struct sock *sk) out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_tcp_retries1) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) __sk_dst_reset(sk); out:; @@ -499,8 +502,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = tcp_time_stamp - tp->rcv_tstamp; if (elapsed >= keepalive_time_when(tp)) { - if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || - (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { + if (icsk->icsk_probes_out >= keepalive_probes(tp)) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 80e3812837a..ebaaa7f973d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -110,11 +110,12 @@ struct udp_table udp_table; EXPORT_SYMBOL(udp_table); int sysctl_udp_mem[3] __read_mostly; -int sysctl_udp_rmem_min __read_mostly; -int sysctl_udp_wmem_min __read_mostly; - EXPORT_SYMBOL(sysctl_udp_mem); + +int sysctl_udp_rmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_rmem_min); + +int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; @@ -158,7 +159,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2 ) ) + const struct sock *sk2)) { struct udp_hslot *hslot; struct udp_table *udptable = sk->sk_prot->h.udp_table; @@ -221,14 +222,15 @@ fail_unlock: fail: return error; } +EXPORT_SYMBOL(udp_lib_get_port); static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - return ( !ipv6_only_sock(sk2) && - (!inet1->rcv_saddr || !inet2->rcv_saddr || - inet1->rcv_saddr == inet2->rcv_saddr )); + return (!ipv6_only_sock(sk2) && + (!inet1->rcv_saddr || !inet2->rcv_saddr || + inet1->rcv_saddr == inet2->rcv_saddr)); } int udp_v4_get_port(struct sock *sk, unsigned short snum) @@ -383,8 +385,8 @@ found: void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; - struct iphdr *iph = (struct iphdr*)skb->data; - struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); + struct iphdr *iph = (struct iphdr *)skb->data; + struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -439,7 +441,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { - ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); } sk->sk_err = err; sk->sk_error_report(sk); @@ -474,7 +476,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); * (checksum field must be zeroed out) */ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, int len ) + __be32 src, __be32 dst, int len) { unsigned int offset; struct udphdr *uh = udp_hdr(skb); @@ -545,7 +547,7 @@ static int udp_push_pending_frames(struct sock *sk) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src,fl->fl4_dst, up->len); + udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); goto send; } else /* `normal' UDP */ @@ -553,18 +555,24 @@ static int udp_push_pending_frames(struct sock *sk) /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, - sk->sk_protocol, csum ); + sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip_push_pending_frames(sk); + if (err) { + if (err == -ENOBUFS && !inet->recverr) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + err = 0; + } + } else + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); out: up->len = 0; up->pending = 0; - if (!err) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -592,7 +600,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Check the flags. */ - if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; ipc.opt = NULL; @@ -619,7 +627,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; + struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -684,7 +692,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (connected) - rt = (struct rtable*)sk_dst_check(sk, 0); + rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { struct flowi fl = { .oif = ipc.oif, @@ -782,6 +790,7 @@ do_confirm: err = 0; goto out; } +EXPORT_SYMBOL(udp_sendmsg); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) @@ -871,6 +880,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return 0; } +EXPORT_SYMBOL(udp_ioctl); /* * This should be easy, if there is something there we @@ -892,7 +902,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Check any passed addresses */ if (addr_len) - *addr_len=sizeof(*sin); + *addr_len = sizeof(*sin); if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); @@ -923,9 +933,11 @@ try_again: if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied ); + msg->msg_iov, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + err = skb_copy_and_csum_datagram_iovec(skb, + sizeof(struct udphdr), + msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; @@ -941,8 +953,7 @@ try_again: sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ - if (sin) - { + if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; @@ -995,6 +1006,7 @@ int udp_disconnect(struct sock *sk, int flags) sk_dst_reset(sk); return 0; } +EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) { @@ -1044,7 +1056,7 @@ drop: * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; @@ -1214,7 +1226,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, proto, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -1355,7 +1367,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); - if (optlen<sizeof(int)) + if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -1426,6 +1438,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) @@ -1453,7 +1466,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, len; - if (get_user(len,optlen)) + if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); @@ -1486,10 +1499,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } +EXPORT_SYMBOL(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -1528,9 +1542,9 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) int is_lite = IS_UDPLITE(sk); /* Check for false positives due to checksum errors */ - if ( (mask & POLLRDNORM) && - !(file->f_flags & O_NONBLOCK) && - !(sk->sk_shutdown & RCV_SHUTDOWN)){ + if ((mask & POLLRDNORM) && + !(file->f_flags & O_NONBLOCK) && + !(sk->sk_shutdown & RCV_SHUTDOWN)) { struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; @@ -1552,6 +1566,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) return mask; } +EXPORT_SYMBOL(udp_poll); struct proto udp_prot = { .name = "UDP", @@ -1582,6 +1597,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif }; +EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS @@ -1703,11 +1719,13 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) rc = -ENOMEM; return rc; } +EXPORT_SYMBOL(udp_proc_register); void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) { proc_net_remove(net, afinfo->name); } +EXPORT_SYMBOL(udp_proc_unregister); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, @@ -1741,7 +1759,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) int len; udp4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len ,""); + seq_printf(seq, "%*s\n", 127 - len, ""); } return 0; } @@ -1816,16 +1834,64 @@ void __init udp_init(void) sysctl_udp_wmem_min = SK_MEM_QUANTUM; } -EXPORT_SYMBOL(udp_disconnect); -EXPORT_SYMBOL(udp_ioctl); -EXPORT_SYMBOL(udp_prot); -EXPORT_SYMBOL(udp_sendmsg); -EXPORT_SYMBOL(udp_lib_getsockopt); -EXPORT_SYMBOL(udp_lib_setsockopt); -EXPORT_SYMBOL(udp_poll); -EXPORT_SYMBOL(udp_lib_get_port); +int udp4_ufo_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + return 0; +} + +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + int offset; + __wsum csum; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb->csum_start - skb_headroom(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: + return segs; +} -#ifdef CONFIG_PROC_FS -EXPORT_SYMBOL(udp_proc_register); -EXPORT_SYMBOL(udp_proc_unregister); -#endif diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0071ee6f441..74fb2eb833e 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -264,6 +264,22 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .fill_dst = xfrm4_fill_dst, }; +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm4_policy_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "xfrm4_gc_thresh", + .data = &xfrm4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *sysctl_hdr; +#endif + static void __init xfrm4_policy_init(void) { xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); @@ -271,12 +287,31 @@ static void __init xfrm4_policy_init(void) static void __exit xfrm4_policy_fini(void) { +#ifdef CONFIG_SYSCTL + if (sysctl_hdr) + unregister_net_sysctl_table(sysctl_hdr); +#endif xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); } -void __init xfrm4_init(void) +void __init xfrm4_init(int rt_max_size) { xfrm4_state_init(); xfrm4_policy_init(); + /* + * Select a default value for the gc_thresh based on the main route + * table hash size. It seems to me the worst case scenario is when + * we have ipsec operating in transport mode, in which we create a + * dst_entry per socket. The xfrm gc algorithm starts trying to remove + * entries at gc_thresh, and prevents new allocations as 2*gc_thresh + * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. + * That will let us store an ipsec connection per route table entry, + * and start cleaning when were 1/2 full + */ + xfrm4_dst_ops.gc_thresh = rt_max_size/2; +#ifdef CONFIG_SYSCTL + sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, + xfrm4_policy_table); +#endif } |