aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig52
-rw-r--r--net/ipv4/af_inet.c28
-rw-r--r--net/ipv4/arp.c13
-rw-r--r--net/ipv4/devinet.c12
-rw-r--r--net/ipv4/fib_frontend.c2
-rw-r--r--net/ipv4/fib_semantics.c5
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/inet_connection_sock.c42
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/inet_hashtables.c12
-rw-r--r--net/ipv4/ip_fragment.c3
-rw-r--r--net/ipv4/ip_gre.c136
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/ipconfig.c8
-rw-r--r--net/ipv4/ipip.c7
-rw-r--r--net/ipv4/ipmr.c464
-rw-r--r--net/ipv4/netfilter/Kconfig30
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/arp_tables.c163
-rw-r--r--net/ipv4/netfilter/arptable_filter.c2
-rw-r--r--net/ipv4/netfilter/ip_queue.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c153
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c2
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c97
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c2
-rw-r--r--net/ipv4/netfilter/ipt_ttl.c63
-rw-r--r--net/ipv4/netfilter/iptable_filter.c1
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c1
-rw-r--r--net/ipv4/netfilter/iptable_raw.c1
-rw-r--r--net/ipv4/netfilter/iptable_security.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c1
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c4
-rw-r--r--net/ipv4/proc.c4
-rw-r--r--net/ipv4/raw.c1
-rw-r--r--net/ipv4/route.c15
-rw-r--r--net/ipv4/tcp.c89
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_cong.c21
-rw-r--r--net/ipv4/tcp_cubic.c11
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_input.c198
-rw-r--r--net/ipv4/tcp_ipv4.c13
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c94
-rw-r--r--net/ipv4/tcp_probe.c5
-rw-r--r--net/ipv4/tcp_scalable.c10
-rw-r--r--net/ipv4/tcp_timer.c23
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_yeah.c9
-rw-r--r--net/ipv4/udp.c15
-rw-r--r--net/ipv4/xfrm4_policy.c2
52 files changed, 1027 insertions, 837 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 691268f3a35..b2cf91e4cca 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER
at boot time after the /proc file system has been mounted.
- If you turn on IP forwarding, you will also get the rp_filter, which
+ If you turn on IP forwarding, you should consider the rp_filter, which
automatically rejects incoming packets if the routing table entry
for their source address doesn't match the network interface they're
arriving on. This has security advantages because it prevents the
@@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- or
+ and
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
If unsure, say N here.
-choice
+choice
prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
depends on IP_ADVANCED_ROUTER
default ASK_IP_FIB_HASH
@@ -59,27 +63,29 @@ choice
config ASK_IP_FIB_HASH
bool "FIB_HASH"
---help---
- Current FIB is very proven and good enough for most users.
+ Current FIB is very proven and good enough for most users.
config IP_FIB_TRIE
bool "FIB_TRIE"
---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
+ Use new experimental LC-trie as FIB lookup algorithm.
+ This improves lookup performance if you have a large
+ number of routes.
+
+ LC-trie is a longest matching prefix lookup algorithm which
+ performs better than FIB_HASH for large routing tables.
+ But, it consumes more memory and is more complex.
+
+ LC-trie is described in:
+
+ IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
+ June 1999
+
+ An experimental study of compression methods for dynamic tries
+ Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+
endchoice
config IP_FIB_HASH
@@ -191,7 +197,7 @@ config IP_PNP_RARP
<file:Documentation/filesystems/nfsroot.txt> for details.
# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
@@ -361,7 +367,7 @@ config INET_IPCOMP
---help---
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
-
+
If unsure, say Y.
config INET_XFRM_TUNNEL
@@ -415,7 +421,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
-
+
If unsure, say Y.
config INET_TCP_DIAG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3a3dad80135..7f03373b8c0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -369,7 +369,6 @@ lookup_protocol:
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
- sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
@@ -1253,10 +1252,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
int proto;
int id;
- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ iph = skb_gro_header(skb, sizeof(*iph));
+ if (unlikely(!iph))
goto out;
- iph = ip_hdr(skb);
proto = iph->protocol & (MAX_INET_PROTOS - 1);
rcu_read_lock();
@@ -1264,13 +1263,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (!ops || !ops->gro_receive)
goto out_unlock;
- if (iph->version != 4 || iph->ihl != 5)
+ if (*(u8 *)iph != 0x45)
goto out_unlock;
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto out_unlock;
- flush = ntohs(iph->tot_len) != skb->len ||
+ flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
iph->frag_off != htons(IP_DF);
id = ntohs(iph->id);
@@ -1282,24 +1281,25 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
iph2 = ip_hdr(p);
- if (iph->protocol != iph2->protocol ||
- iph->tos != iph2->tos ||
- memcmp(&iph->saddr, &iph2->saddr, 8)) {
+ if ((iph->protocol ^ iph2->protocol) |
+ (iph->tos ^ iph2->tos) |
+ (iph->saddr ^ iph2->saddr) |
+ (iph->daddr ^ iph2->daddr)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
- memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
- (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
+ (iph->ttl ^ iph2->ttl) |
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
}
NAPI_GRO_CB(skb)->flush |= flush;
- __skb_pull(skb, sizeof(*iph));
- skb_reset_transport_header(skb);
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
pp = ops->gro_receive(head, skb);
@@ -1500,8 +1500,8 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 29a74c01d8d..f11931c1838 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb)
* cache.
*/
- /* Special case: IPv4 duplicate address detection packet (RFC2131) */
- if (sip == 0) {
+ /*
+ * Special case: IPv4 duplicate address detection packet (RFC2131)
+ * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
+ */
+ if (sip == 0 || tip == sip) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type(net, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
@@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb)
out:
if (in_dev)
in_dev_put(in_dev);
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -1225,8 +1228,8 @@ void arp_ifdown(struct net_device *dev)
* Called once on startup.
*/
-static struct packet_type arp_packet_type = {
- .type = __constant_htons(ETH_P_ARP),
+static struct packet_type arp_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 309997edc8a..126bb911880 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1075,6 +1075,14 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
}
ip_mc_up(in_dev);
+ /* fall through */
+ case NETDEV_CHANGEADDR:
+ if (IN_DEV_ARP_NOTIFY(in_dev))
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ in_dev->ifa_list->ifa_address,
+ dev,
+ in_dev->ifa_list->ifa_address,
+ NULL, dev->dev_addr, NULL);
break;
case NETDEV_DOWN:
ip_mc_down(in_dev);
@@ -1208,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
@@ -1439,6 +1448,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 741e4fa3e47..cafcc49d099 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
fib_res_put(&res);
if (no_addr)
goto last_resort;
- if (rpf)
+ if (rpf == 1)
goto e_inval;
fl.oif = dev->ifindex;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4817dea3bc7..f831df50090 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
- info->nlh, GFP_KERNEL);
+ rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 705b33b184a..3f50807237e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -375,6 +375,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
inet->tos = ip_hdr(skb)->tos;
daddr = ipc.addr = rt->rt_src;
ipc.opt = NULL;
+ ipc.shtx.flags = 0;
if (icmp_param->replyopts.optlen) {
ipc.opt = &icmp_param->replyopts;
if (ipc.opt->srr)
@@ -532,6 +533,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
inet_sk(sk)->tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts;
+ ipc.shtx.flags = 0;
{
struct flowi fl = {
@@ -1205,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
int __init icmp_init(void)
{
- return register_pernet_device(&icmp_sk_ops);
+ return register_pernet_subsys(&icmp_sk_ops);
}
EXPORT_SYMBOL(icmp_err_convert);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f26ab38680d..22cd19ee44e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -93,24 +93,40 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct inet_bind_hashbucket *head;
struct hlist_node *node;
struct inet_bind_bucket *tb;
- int ret;
+ int ret, attempts = 5;
struct net *net = sock_net(sk);
+ int smallest_size = -1, smallest_rover;
local_bh_disable();
if (!snum) {
int remaining, rover, low, high;
+again:
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
- rover = net_random() % remaining + low;
+ smallest_rover = rover = net_random() % remaining + low;
+ smallest_size = -1;
do {
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
- if (ib_net(tb) == net && tb->port == rover)
+ if (ib_net(tb) == net && tb->port == rover) {
+ if (tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN &&
+ (tb->num_owners < smallest_size || smallest_size == -1)) {
+ smallest_size = tb->num_owners;
+ smallest_rover = rover;
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
+ spin_unlock(&head->lock);
+ snum = smallest_rover;
+ goto have_snum;
+ }
+ }
goto next;
+ }
break;
next:
spin_unlock(&head->lock);
@@ -125,14 +141,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
* the top level, not from the 'break;' statement.
*/
ret = 1;
- if (remaining <= 0)
+ if (remaining <= 0) {
+ if (smallest_size != -1) {
+ snum = smallest_rover;
+ goto have_snum;
+ }
goto fail;
-
+ }
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
} else {
+have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
@@ -145,12 +166,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
tb_found:
if (!hlist_empty(&tb->owners)) {
if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ smallest_size == -1) {
goto success;
} else {
ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ smallest_size != -1 && --attempts >= 0) {
+ spin_unlock(&head->lock);
+ goto again;
+ }
goto fail_unlock;
+ }
}
}
tb_not_found:
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6c52e08f786..eaf3e2c8646 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
+ __releases(&f->lock)
{
struct inet_frag_queue *q;
struct hlist_node *n;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6a1045da48d..625cc5f64c9 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -38,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
@@ -59,8 +60,13 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+ atomic_inc(&hashinfo->bsockets);
+
inet_sk(sk)->num = snum;
sk_add_bind_node(sk, &tb->owners);
+ tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
}
@@ -75,9 +81,12 @@ static void __inet_put_port(struct sock *sk)
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind_bucket *tb;
+ atomic_dec(&hashinfo->bsockets);
+
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
+ tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -444,9 +453,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
*/
inet_bind_bucket_for_each(tb, node, &head->chain) {
if (ib_net(tb) == net && tb->port == port) {
- WARN_ON(hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
goto next_port;
+ WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
port, &tw))
goto ok;
@@ -523,6 +532,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
{
int i;
+ atomic_set(&h->bsockets, 0);
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6659ac000ee..7985346653b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -463,6 +463,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
@@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
iph = ip_hdr(head);
iph->frag_off = 0;
iph->tot_len = htons(len);
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0101521f366..e62510d5ea5 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -164,67 +164,124 @@ static DEFINE_RWLOCK(ipgre_lock);
/* Given src, dst and key, find appropriate for input tunnel. */
-static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
+static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
__be32 remote, __be32 local,
__be32 key, __be16 gre_proto)
{
+ struct net *net = dev_net(dev);
+ int link = dev->ifindex;
unsigned h0 = HASH(remote);
unsigned h1 = HASH(key);
- struct ip_tunnel *t;
- struct ip_tunnel *t2 = NULL;
+ struct ip_tunnel *t, *cand = NULL;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
ARPHRD_ETHER : ARPHRD_IPGRE;
+ int score, cand_score = 4;
for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
- if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
- if (t->dev->type == dev_type)
- return t;
- if (t->dev->type == ARPHRD_IPGRE && !t2)
- t2 = t;
- }
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IPGRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0)
+ return t;
+
+ if (score < cand_score) {
+ cand = t;
+ cand_score = score;
}
}
for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
- if (remote == t->parms.iph.daddr) {
- if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
- if (t->dev->type == dev_type)
- return t;
- if (t->dev->type == ARPHRD_IPGRE && !t2)
- t2 = t;
- }
+ if (remote != t->parms.iph.daddr ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IPGRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0)
+ return t;
+
+ if (score < cand_score) {
+ cand = t;
+ cand_score = score;
}
}
for (t = ign->tunnels_l[h1]; t; t = t->next) {
- if (local == t->parms.iph.saddr ||
- (local == t->parms.iph.daddr &&
- ipv4_is_multicast(local))) {
- if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
- if (t->dev->type == dev_type)
- return t;
- if (t->dev->type == ARPHRD_IPGRE && !t2)
- t2 = t;
- }
+ if ((local != t->parms.iph.saddr &&
+ (local != t->parms.iph.daddr ||
+ !ipv4_is_multicast(local))) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IPGRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if