diff options
Diffstat (limited to 'net/ipv4')
52 files changed, 1027 insertions, 837 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 691268f3a35..b2cf91e4cca 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER at boot time after the /proc file system has been mounted. - If you turn on IP forwarding, you will also get the rp_filter, which + If you turn on IP forwarding, you should consider the rp_filter, which automatically rejects incoming packets if the routing table entry for their source address doesn't match the network interface they're arriving on. This has security advantages because it prevents the @@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter - or + and echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter + Note that some distributions enable it in startup scripts. + For details about rp_filter strict and loose mode read + <file:Documentation/networking/ip-sysctl.txt>. + If unsure, say N here. -choice +choice prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" depends on IP_ADVANCED_ROUTER default ASK_IP_FIB_HASH @@ -59,27 +63,29 @@ choice config ASK_IP_FIB_HASH bool "FIB_HASH" ---help--- - Current FIB is very proven and good enough for most users. + Current FIB is very proven and good enough for most users. config IP_FIB_TRIE bool "FIB_TRIE" ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ - + Use new experimental LC-trie as FIB lookup algorithm. + This improves lookup performance if you have a large + number of routes. + + LC-trie is a longest matching prefix lookup algorithm which + performs better than FIB_HASH for large routing tables. + But, it consumes more memory and is more complex. + + LC-trie is described in: + + IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, + June 1999 + + An experimental study of compression methods for dynamic tries + Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + endchoice config IP_FIB_HASH @@ -191,7 +197,7 @@ config IP_PNP_RARP <file:Documentation/filesystems/nfsroot.txt> for details. # not yet ready.. -# bool ' IP: ARP support' CONFIG_IP_PNP_ARP +# bool ' IP: ARP support' CONFIG_IP_PNP_ARP config NET_IPIP tristate "IP: tunneling" select INET_TUNNEL @@ -361,7 +367,7 @@ config INET_IPCOMP ---help--- Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. - + If unsure, say Y. config INET_XFRM_TUNNEL @@ -415,7 +421,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. - + If unsure, say Y. config INET_TCP_DIAG diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3a3dad80135..7f03373b8c0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -369,7 +369,6 @@ lookup_protocol: sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; - sk->sk_family = PF_INET; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; @@ -1253,10 +1252,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, int proto; int id; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); rcu_read_lock(); @@ -1264,13 +1263,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!ops || !ops->gro_receive) goto out_unlock; - if (iph->version != 4 || iph->ihl != 5) + if (*(u8 *)iph != 0x45) goto out_unlock; if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - flush = ntohs(iph->tot_len) != skb->len || + flush = ntohs(iph->tot_len) != skb_gro_len(skb) || iph->frag_off != htons(IP_DF); id = ntohs(iph->id); @@ -1282,24 +1281,25 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, iph2 = ip_hdr(p); - if (iph->protocol != iph2->protocol || - iph->tos != iph2->tos || - memcmp(&iph->saddr, &iph2->saddr, 8)) { + if ((iph->protocol ^ iph2->protocol) | + (iph->tos ^ iph2->tos) | + (iph->saddr ^ iph2->saddr) | + (iph->daddr ^ iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } /* All fields must match except length and checksum. */ NAPI_GRO_CB(p)->flush |= - memcmp(&iph->frag_off, &iph2->frag_off, 4) || - (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id; + (iph->ttl ^ iph2->ttl) | + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } NAPI_GRO_CB(skb)->flush |= flush; - __skb_pull(skb, sizeof(*iph)); - skb_reset_transport_header(skb); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); pp = ops->gro_receive(head, skb); @@ -1500,8 +1500,8 @@ static int ipv4_proc_init(void); * IP protocol layer initialiser */ -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), +static struct packet_type ip_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 29a74c01d8d..f11931c1838 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb) * cache. */ - /* Special case: IPv4 duplicate address detection packet (RFC2131) */ - if (sip == 0) { + /* + * Special case: IPv4 duplicate address detection packet (RFC2131) + * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4) + */ + if (sip == 0 || tip == sip) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type(net, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) @@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb) out: if (in_dev) in_dev_put(in_dev); - kfree_skb(skb); + consume_skb(skb); return 0; } @@ -1225,8 +1228,8 @@ void arp_ifdown(struct net_device *dev) * Called once on startup. */ -static struct packet_type arp_packet_type = { - .type = __constant_htons(ETH_P_ARP), +static struct packet_type arp_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 309997edc8a..126bb911880 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1075,6 +1075,14 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } } ip_mc_up(in_dev); + /* fall through */ + case NETDEV_CHANGEADDR: + if (IN_DEV_ARP_NOTIFY(in_dev)) + arp_send(ARPOP_REQUEST, ETH_P_ARP, + in_dev->ifa_list->ifa_address, + dev, + in_dev->ifa_list->ifa_address, + NULL, dev->dev_addr, NULL); break; case NETDEV_DOWN: ip_mc_down(in_dev); @@ -1208,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); @@ -1439,6 +1448,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), + DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 741e4fa3e47..cafcc49d099 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fib_res_put(&res); if (no_addr) goto last_resort; - if (rpf) + if (rpf == 1) goto e_inval; fl.oif = dev->ifindex; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4817dea3bc7..f831df50090 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, - info->nlh, GFP_KERNEL); + rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + info->nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 705b33b184a..3f50807237e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -375,6 +375,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; + ipc.shtx.flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -532,6 +533,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; + ipc.shtx.flags = 0; { struct flowi fl = { @@ -1205,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = { int __init icmp_init(void) { - return register_pernet_device(&icmp_sk_ops); + return register_pernet_subsys(&icmp_sk_ops); } EXPORT_SYMBOL(icmp_err_convert); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f26ab38680d..22cd19ee44e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -93,24 +93,40 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct inet_bind_hashbucket *head; struct hlist_node *node; struct inet_bind_bucket *tb; - int ret; + int ret, attempts = 5; struct net *net = sock_net(sk); + int smallest_size = -1, smallest_rover; local_bh_disable(); if (!snum) { int remaining, rover, low, high; +again: inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; - rover = net_random() % remaining + low; + smallest_rover = rover = net_random() % remaining + low; + smallest_size = -1; do { head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == rover) + if (ib_net(tb) == net && tb->port == rover) { + if (tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_rover = rover; + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { + spin_unlock(&head->lock); + snum = smallest_rover; + goto have_snum; + } + } goto next; + } break; next: spin_unlock(&head->lock); @@ -125,14 +141,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) * the top level, not from the 'break;' statement. */ ret = 1; - if (remaining <= 0) + if (remaining <= 0) { + if (smallest_size != -1) { + snum = smallest_rover; + goto have_snum; + } goto fail; - + } /* OK, here is the one we will use. HEAD is * non-NULL and we hold it's mutex. */ snum = rover; } else { +have_snum: head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)]; spin_lock(&head->lock); @@ -145,12 +166,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) tb_found: if (!hlist_empty(&tb->owners)) { if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size == -1) { goto success; } else { ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size != -1 && --attempts >= 0) { + spin_unlock(&head->lock); + goto again; + } goto fail_unlock; + } } } tb_not_found: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6c52e08f786..eaf3e2c8646 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) + __releases(&f->lock) { struct inet_frag_queue *q; struct hlist_node *n; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 6a1045da48d..625cc5f64c9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -38,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, write_pnet(&tb->ib_net, hold_net(net)); tb->port = snum; tb->fastreuse = 0; + tb->num_owners = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); } @@ -59,8 +60,13 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + + atomic_inc(&hashinfo->bsockets); + inet_sk(sk)->num = snum; sk_add_bind_node(sk, &tb->owners); + tb->num_owners++; inet_csk(sk)->icsk_bind_hash = tb; } @@ -75,9 +81,12 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; + atomic_dec(&hashinfo->bsockets); + spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); + tb->num_owners--; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); @@ -444,9 +453,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, */ inet_bind_bucket_for_each(tb, node, &head->chain) { if (ib_net(tb) == net && tb->port == port) { - WARN_ON(hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; + WARN_ON(hlist_empty(&tb->owners)); if (!check_established(death_row, sk, port, &tw)) goto ok; @@ -523,6 +532,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; + atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 6659ac000ee..7985346653b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -463,6 +463,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev) { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; struct sk_buff *fp, *head = qp->q.fragments; int len; @@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; return 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0101521f366..e62510d5ea5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -164,67 +164,124 @@ static DEFINE_RWLOCK(ipgre_lock); /* Given src, dst and key, find appropriate for input tunnel. */ -static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, +static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, __be32 remote, __be32 local, __be32 key, __be16 gre_proto) { + struct net *net = dev_net(dev); + int link = dev->ifindex; unsigned h0 = HASH(remote); unsigned h1 = HASH(key); - struct ip_tunnel *t; - struct ip_tunnel *t2 = NULL; + struct ip_tunnel *t, *cand = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB)) ? ARPHRD_ETHER : ARPHRD_IPGRE; + int score, cand_score = 4; for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; - } + if (local != t->parms.iph.saddr || + remote != t->parms.iph.daddr || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; } } for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { - if (remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; - } + if (remote != t->parms.iph.daddr || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; } } for (t = ign->tunnels_l[h1]; t; t = t->next) { - if (local == t->parms.iph.saddr || - (local == t->parms.iph.daddr && - ipv4_is_multicast(local))) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; - } + if ((local != t->parms.iph.saddr && + (local != t->parms.iph.daddr || + !ipv4_is_multicast(local))) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if |