aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/arp.c19
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/fib_frontend.c192
-rw-r--r--net/ipv4/fib_hash.c291
-rw-r--r--net/ipv4/fib_lookup.h11
-rw-r--r--net/ipv4/fib_rules.c13
-rw-r--r--net/ipv4/fib_semantics.c297
-rw-r--r--net/ipv4/fib_trie.c29
-rw-r--r--net/ipv4/gre.c2
-rw-r--r--net/ipv4/igmp.c34
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/ip_fragment.c4
-rw-r--r--net/ipv4/ip_gre.c231
-rw-r--r--net/ipv4/ip_options.c3
-rw-r--r--net/ipv4/ip_output.c28
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/ipip.c210
-rw-r--r--net/ipv4/ipmr.c428
-rw-r--r--net/ipv4/netfilter/arp_tables.c2
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c6
-rw-r--r--net/ipv4/route.c183
-rw-r--r--net/ipv4/tcp.c9
-rw-r--r--net/ipv4/tcp_input.c43
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c8
-rw-r--r--net/ipv4/tcp_timer.c38
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_state.c33
33 files changed, 1207 insertions, 944 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 5462e2d147a..e848e6c062c 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -223,7 +223,7 @@ config NET_IPGRE_DEMUX
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
- depends on NET_IPGRE_DEMUX
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -419,7 +419,7 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
config INET_LRO
- bool "Large Receive Offload (ipv4/tcp)"
+ tristate "Large Receive Offload (ipv4/tcp)"
default y
---help---
Support for Large Receive Offload (ipv4/tcp).
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index dcfe7e961c1..d8e540c5b07 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(clip_tbl_hook);
/*
* Interface to generic neighbour cache.
*/
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
.queue_xmit = dev_queue_xmit,
};
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
@@ -170,7 +170,6 @@ const struct neigh_ops arp_broken_ops = {
.hh_output = dev_queue_xmit,
.queue_xmit = dev_queue_xmit,
};
-EXPORT_SYMBOL(arp_broken_ops);
struct neigh_table arp_tbl = {
.family = AF_INET,
@@ -226,9 +225,11 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
}
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 hash_rnd)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+ return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
}
static int arp_constructor(struct neighbour *neigh)
@@ -501,10 +502,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
if (n) {
n->used = jiffies;
- if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- read_lock_bh(&n->lock);
- memcpy(haddr, n->ha, dev->addr_len);
- read_unlock_bh(&n->lock);
+ if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+ neigh_ha_snapshot(haddr, n, dev);
neigh_release(n);
return 0;
}
@@ -567,7 +566,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
if (out_dev)
omi = IN_DEV_MEDIUM_ID(out_dev);
- return (omi != imi && omi != -1);
+ return omi != imi && omi != -1;
}
/*
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 721a8a37b45..174be6caa5c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,6 +73,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
- return(0);
+ return 0;
}
EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f..dc94b0316b7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
inet_free_ifa(ifa);
}
- dev->ip_ptr = NULL;
+ rcu_assign_pointer(dev->ip_ptr, NULL);
devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
return inet_insert_ifa(ifa);
}
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
- in_dev = in_dev_get(dev);
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
rcu_read_unlock();
return in_dev;
}
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
goto errout;
}
- __in_dev_put(in_dev);
-
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
@@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
switch (event) {
case NETDEV_REGISTER:
printk(KERN_DEBUG "inetdev_event: bug\n");
- dev->ip_ptr = NULL;
+ rcu_assign_pointer(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
if (!inetdev_valid_mtu(dev->mtu))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7d02a9f999f..36e27c2107d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,35 +147,43 @@ static void fib_flush(struct net *net)
rt_cache_flush(net, -1);
}
-/*
- * Find the first device with a given source address.
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
*/
-
-struct net_device * ip_dev_find(struct net *net, __be32 addr)
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
- struct fib_result res;
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = addr
+ }
+ },
+ .flags = FLOWI_FLAG_MATCH_ANY_IIF
+ };
+ struct fib_result res = { 0 };
struct net_device *dev = NULL;
- struct fib_table *local_table;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table || fib_table_lookup(local_table, &fl, &res))
+ rcu_read_lock();
+ if (fib_lookup(net, &fl, &res)) {
+ rcu_read_unlock();
return NULL;
+ }
if (res.type != RTN_LOCAL)
goto out;
dev = FIB_RES_DEV(res);
- if (dev)
+ if (dev && devref)
dev_hold(dev);
out:
- fib_res_put(&res);
+ rcu_read_unlock();
return dev;
}
-EXPORT_SYMBOL(ip_dev_find);
+EXPORT_SYMBOL(__ip_dev_find);
/*
* Find address type as if only "dev" was present in the system. If
@@ -202,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
- if (!fib_table_lookup(local_table, &fl, &res)) {
+ rcu_read_lock();
+ if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
- fib_res_put(&res);
}
+ rcu_read_unlock();
}
return ret;
}
@@ -220,30 +229,34 @@ EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- return __inet_dev_addr_type(net, dev, addr);
+ return __inet_dev_addr_type(net, dev, addr);
}
EXPORT_SYMBOL(inet_dev_addr_type);
/* Given (packet source, input interface) and optional (dst, oif, tos):
- - (main) check, that source is valid i.e. not broadcast or our local
- address.
- - figure out what "logical" interface this packet arrived
- and calculate "specific destination" address.
- - check, that packet arrived from expected physical interface.
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
*/
-
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
struct net_device *dev, __be32 *spec_dst,
u32 *itag, u32 mark)
{
struct in_device *in_dev;
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = src,
- .saddr = dst,
- .tos = tos } },
- .mark = mark,
- .iif = oif };
-
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = src,
+ .saddr = dst,
+ .tos = tos
+ }
+ },
+ .mark = mark,
+ .iif = oif
+ };
struct fib_result res;
int no_addr, rpf, accept_local;
bool dev_match;
@@ -251,7 +264,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
struct net *net;
no_addr = rpf = accept_local = 0;
- rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
@@ -260,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
if (mark && !IN_DEV_SRC_VMARK(in_dev))
fl.mark = 0;
}
- rcu_read_unlock();
if (in_dev == NULL)
goto e_inval;
@@ -270,7 +281,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
goto last_resort;
if (res.type != RTN_UNICAST) {
if (res.type != RTN_LOCAL || !accept_local)
- goto e_inval_res;
+ goto e_inval;
}
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
@@ -291,10 +302,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
#endif
if (dev_match) {
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- fib_res_put(&res);
return ret;
}
- fib_res_put(&res);
if (no_addr)
goto last_resort;
if (rpf == 1)
@@ -307,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
*spec_dst = FIB_RES_PREFSRC(res);
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
}
- fib_res_put(&res);
}
return ret;
@@ -318,8 +326,6 @@ last_resort:
*itag = 0;
return 0;
-e_inval_res:
- fib_res_put(&res);
e_inval:
return -EINVAL;
e_rpf:
@@ -472,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
}
/*
- * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
*/
-
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct fib_config cfg;
@@ -518,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
return -EINVAL;
}
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -532,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_config *cfg)
+ struct nlmsghdr *nlh, struct fib_config *cfg)
{
struct nlattr *attr;
int err, remaining;
@@ -687,12 +693,11 @@ out:
}
/* Prepare and feed intra-kernel routing request.
- Really, it should be netlink message, but :-( netlink
- can be not configured, so that we feed it directly
- to fib engine. It is legal, because all events occur
- only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
*/
-
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -738,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
struct in_ifaddr *prim = ifa;
__be32 mask = ifa->ifa_mask;
__be32 addr = ifa->ifa_local;
- __be32 prefix = ifa->ifa_address&mask;
+ __be32 prefix = ifa->ifa_address & mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -750,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim);
}
}
}
@@ -776,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
struct in_ifaddr *prim = ifa;
- __be32 brd = ifa->ifa_address|~ifa->ifa_mask;
- __be32 any = ifa->ifa_address&ifa->ifa_mask;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK 1
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
unsigned ok = 0;
- if (!(ifa->ifa_flags&IFA_F_SECONDARY))
- fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
else {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
@@ -796,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
}
/* Deletion is more complicated than add.
- We should take care of not to delete too much :-)
-
- Scan address list to be sure that addresses are really gone.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
*/
for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -812,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
ok |= BRD0_OK;
}
- if (!(ok&BRD_OK))
+ if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!(ok&BRD1_OK))
+ if (!(ok & BRD1_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
- if (!(ok&BRD0_OK))
+ if (!(ok & BRD0_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
- if (!(ok&LOCAL_OK)) {
+ if (!(ok & LOCAL_OK)) {
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
/* And the last, but not the least thing.
- We must flush stray FIB entries.
-
- First of all, we scan fib_info list searching
- for stray nexthop entries, then ignite fib_flush.
- */
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
fib_flush(dev_net(dev));
}
@@ -839,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{
struct fib_result res;
- struct flowi fl = { .mark = frn->fl_mark,
- .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
- .tos = frn->fl_tos,
- .scope = frn->fl_scope } } };
+ struct flowi fl = {
+ .mark = frn->fl_mark,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = frn->fl_addr,
+ .tos = frn->fl_tos,
+ .scope = frn->fl_scope
+ }
+ }
+ };
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
@@ -857,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
local_bh_disable();
frn->tb_id = tb->tb_id;
- frn->err = fib_table_lookup(tb, &fl, &res);
+ rcu_read_lock();
+ frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
frn->nh_sel = res.nh_sel;
frn->type = res.type;
frn->scope = res.scope;
- fib_res_put(&res);
}
+ rcu_read_unlock();
local_bh_enable();
}
}
@@ -894,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb)
nl_fib_lookup(frn, tb);
- pid = NETLINK_CB(skb).pid; /* pid of sending process */
- NETLINK_CB(skb).pid = 0; /* from kernel */
+ pid = NETLINK_CB(skb).pid; /* pid of sending process */
+ NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
}
@@ -942,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
fib_del_ifaddr(ifa);
if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
- Disable IP.
+ * Disable IP.
*/
fib_disable_ip(dev, 1, 0);
} else {
@@ -1001,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
static int __net_init ip_fib_net_init(struct net *net)
{
int err;
- unsigned int i;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
- net->ipv4.fib_table_hash = kzalloc(
- sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
if (net->ipv4.fib_table_hash == NULL)
return -ENOMEM;
- for (i = 0; i < FIB_TABLE_HASHSZ; i++)
- INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
-
err = fib4_rules_init(net);
if (err < 0)
goto fail;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1b..43e1c594ce8 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,36 +54,37 @@ struct fib_node {
struct fib_alias fn_embedded_alias;
};
-struct fn_zone {
- struct fn_zone *fz_next; /* Next not empty zone */
- struct hlist_head *fz_hash; /* Hash table pointer */
- int fz_nent; /* Number of entries */
+#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
- int fz_divisor; /* Hash divisor */
+struct fn_zone {
+ struct fn_zone __rcu *fz_next; /* Next not empty zone */
+ struct hlist_head __rcu *fz_hash; /* Hash table pointer */
+ seqlock_t fz_lock;
u32 fz_hashmask; /* (fz_divisor - 1) */
-#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
- int fz_order; /* Zone order */
- __be32 fz_mask;
+ u8 fz_order; /* Zone order (0..32) */
+ u8 fz_revorder; /* 32 - fz_order */
+ __be32 fz_mask; /* inet_make_mask(order) */
#define FZ_MASK(fz) ((fz)->fz_mask)
-};
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
+ struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
+
+ int fz_nent; /* Number of entries */
+ int fz_divisor; /* Hash size (mask+1) */
+};
struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone *fn_zone_list;
+ struct fn_zone *fn_zones[33];
+ struct fn_zone __rcu *fn_zone_list;
};
static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
{
- u32 h = ntohl(key)>>(32 - fz->fz_order);
+ u32 h = ntohl(key) >> fz->fz_revorder;
h ^= (h>>20);
h ^= (h>>10);
h ^= (h>>5);
- h &= FZ_HASHMASK(fz);
+ h &= fz->fz_hashmask;
return h;
}
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
return dst & FZ_MASK(fz);
}
-static DEFINE_RWLOCK(fib_hash_lock);
static unsigned int fib_hash_genid;
#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
{
unsigned long size = divisor * sizeof(struct hlist_head);
- if (size <= PAGE_SIZE) {
+ if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- } else {
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
- }
+
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
}
/* The fib hash lock must be held when this is called. */
@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
struct fib_node *f;
hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
+ struct hlist_head __rcu *new_head;
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, new_head);
+ hlist_add_head_rcu(&f->fn_hash, new_head);
}
}
}
@@ -147,14 +146,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
int old_divisor, new_divisor;
u32 new_hashmask;
- old_divisor = fz->fz_divisor;
+ new_divisor = old_divisor = fz->fz_divisor;
switch (old_divisor) {
- case 16:
- new_divisor = 256;
+ case EMBEDDED_HASH_SIZE:
+ new_divisor *= EMBEDDED_HASH_SIZE;
break;
- case 256:
- new_divisor = 1024;
+ case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
+ new_divisor *= (EMBEDDED_HASH_SIZE/2);
break;
default:
if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -175,31 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
ht = fz_hash_alloc(new_divisor);
if (ht) {
- write_lock_bh(&fib_hash_lock);
+ struct fn_zone nfz;
+
+ memcpy(&nfz, fz, sizeof(nfz));
+
+ write_seqlock_bh(&fz->fz_lock);
old_ht = fz->fz_hash;
- fz->fz_hash = ht;
+ nfz.fz_hash = ht;
+ nfz.fz_hashmask = new_hashmask;
+ nfz.fz_divisor = new_divisor;
+ fn_rebuild_zone(&nfz, old_ht, old_divisor);
+ fib_hash_genid++;
+ rcu_assign_pointer(fz->fz_hash, ht);
fz->fz_hashmask = new_hashmask;
fz->fz_divisor = new_divisor;
- fn_rebuild_zone(fz, old_ht, old_divisor);
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
+ write_sequnlock_bh(&fz->fz_lock);
- fz_hash_free(old_ht, old_divisor);
+ if (old_ht != fz->fz_embedded_hash) {
+ synchronize_rcu();
+ fz_hash_free(old_ht, old_divisor);
+ }
}
}
-static inline void fn_free_node(struct fib_node * f)
+static void fn_free_node_rcu(struct rcu_head *head)
{
+ struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
+
kmem_cache_free(fn_hash_kmem, f);
}
+static inline void fn_free_node(struct fib_node *f)
+{
+ call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
+}
+
+static void fn_free_alias_rcu(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
{
fib_release_info(fa->fa_info);
if (fa == &f->fn_embedded_alias)
fa->fa_info = NULL;
else
- kmem_cache_free(fn_alias_kmem, fa);
+ call_rcu(&fa->rcu, fn_free_alias_rcu);
}
static struct fn_zone *
@@ -210,68 +233,71 @@ fn_new_zone(struct fn_hash *table, int z)
if (!fz)
return NULL;
- if (z) {
- fz->fz_divisor = 16;
- } else {
- fz->fz_divisor = 1;
- }
- fz->fz_hashmask = (fz->fz_divisor - 1);
- fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
- if (!fz->fz_hash) {
- kfree(fz);
- return NULL;
- }
+ seqlock_init(&fz->fz_lock);
+ fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
+ fz->fz_hashmask = fz->fz_divisor - 1;
+ fz->fz_hash = fz->fz_embedded_hash;
fz->fz_order = z;
+ fz->fz_revorder = 32 - z;
fz->fz_mask = inet_make_mask(z);
/* Find the first not empty zone with more specific mask */
- for (i=z+1; i<=32; i++)
+ for (i = z + 1; i <= 32; i++)
if (table->fn_zones[i])
break;
- write_lock_bh(&fib_hash_lock);
- if (i>32) {
+ if (i > 32) {
/* No more specific masks, we are the first. */
- fz->fz_next = table->fn_zone_list;
- table->fn_zone_list = fz;
+ rcu_assign_pointer(fz->fz_next,
+ rtnl_dereference(table->fn_zone_list));
+ rcu_assign_pointer(table->fn_zone_list, fz);
} else {
- fz->fz_next = table->fn_zones[i]->fz_next;
- table->fn_zones[i]->fz_next = fz;
+ rcu_assign_pointer(fz->fz_next,
+ rtnl_dereference(table->fn_zones[i]->fz_next));
+ rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
}
table->fn_zones[z] = fz;
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
return fz;
}
int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
+ const struct flowi *flp, struct fib_result *res,
+ int fib_flags)
{
int err;
struct fn_zone *fz;
struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- read_lock(&fib_hash_lock);
- for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
- struct hlist_head *head;
+ rcu_read_lock();
+ for (fz = rcu_dereference(t->fn_zone_list);
+ fz != NULL;
+ fz = rcu_dereference(fz->fz_next)) {
+ struct hlist_head __rcu *head;
struct hlist_node *node;
struct fib_node *f;
- __be32 k = fz_key(flp->fl4_dst, fz);
+ __be32 k;
+ unsigned int seq;
- head = &fz->fz_hash[fn_hash(k, fz)];
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
+ do {
+ seq = read_seqbegin(&fz->fz_lock);
+ k = fz_key(flp->fl4_dst, fz);
+
+ head = &fz->fz_hash[fn_hash(k, fz)];
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
+ if (f->fn_key != k)
+ continue;
- err = fib_semantic_match(&f->fn_alias,
+ err = fib_semantic_match(&f->fn_alias,
flp, res,
- fz->fz_order);
- if (err <= 0)
- goto out;
- }
+ fz->fz_order, fib_flags);
+ if (err <= 0)
+ goto out;
+ }
+ } while (read_seqretry(&fz->fz_lock, seq));
}
err = 1;
out:
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
return err;
}
@@ -293,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb,
last_resort = NULL;
order = -1;
- read_lock(&fib_hash_lock);
- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
struct fib_alias *fa;
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
struct fib_info *next_fi = fa->fa_info;
if (fa->fa_scope != res->scope ||
@@ -309,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb,
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+
+ fib_alias_accessed(fa);
if (fi == NULL) {
if (next_fi != res->fi)
@@ -341,7 +368,7 @@ void fib_table_select_default(struct fib_table *tb,
fib_result_assign(res, last_resort);
tb->tb_default = last_idx;
out:
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
}
/* Insert node F to FZ. */
@@ -349,7 +376,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
{
struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, head);
+ hlist_add_head_rcu(&f->fn_hash, head);
}
/* Return the node in FZ matching KEY. */
@@ -359,7 +386,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
struct hlist_node *node;
struct fib_node *f;
- hlist_for_each_entry(f, node, head, fn_hash) {
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
if (f->fn_key == key)
return f;
}
@@ -367,6 +394,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
return NULL;
}
+
+static struct fib_alias *fib_fast_alloc(struct fib_node *f)
+{
+ struct fib_alias *fa = &f->fn_embedded_alias;
+
+ if (fa->fa_info != NULL)
+ fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ return fa;
+}
+
+/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -451,7 +489,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
}
if (cfg->fc_nlflags & NLM_F_REPLACE) {
- struct fib_info *fi_drop;
u8 state;
fa = fa_first;
@@ -460,21 +497,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
err = 0;
goto out;
}
- write_lock_bh(&fib_hash_lock);
- fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = cfg->fc_type;
- fa->fa_scope = cfg->fc_scope;
+ err = -ENOBUFS;
+ new_fa = fib_fast_alloc(f);
+ if (new_fa == NULL)
+ goto out;
+
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = cfg->fc_type;
+ new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
+ list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
- fib_release_info(fi_drop);
+ fn_free_alias(fa, f);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, NLM_F_REPLACE);
+ rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
+ tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
return 0;
}
@@ -506,12 +547,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
f = new_f;
}
- new_fa = &f->fn_embedded_alias;
- if (new_fa->fa_info != NULL) {
- new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
- goto out;
- }
+ new_fa = fib_fast_alloc(f);
+ if (new_fa == NULL)
+ goto out;
+
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
@@ -522,13 +561,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
* Insert new entry to the list.
*/
- write_lock_bh(&fib_hash_lock);
if (new_f)
fib_insert_node(fz, new_f);
- list_add_tail(&new_fa->fa_list,
+ list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : &f->fn_alias));
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
if (new_f)
fz->fz_nent++;
@@ -603,14 +640,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
tb->tb_id, &cfg->fc_nlinfo, 0);
kill_fn = 0;
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
kill_fn = 1;
}
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -641,14 +676,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
struct fib_info *fi = fa->fa_info;
if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
kill_f = 1;
}
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
fn_free_alias(fa, f);
found++;
@@ -662,13 +695,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
return found;
}
+/* caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
struct fn_zone *fz;
int found = 0;
- for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+ for (fz = rtnl_dereference(table->fn_zone_list);
+ fz != NULL;
+ fz = rtnl_dereference(fz->fz_next)) {
int i;
for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -690,10 +726,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
s_i = cb->args[4];
i = 0;
- hlist_for_each_entry(f, node, head, fn_hash) {
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
struct fib_alias *fa;
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
if (i < s_i)
goto next;
@@ -711,7 +747,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
cb->args[4] = i;
return -1;
}
- next:
+next:
i++;
}
}
@@ -746,23 +782,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- int m, s_m;
+ int m = 0, s_m;
struct fn_zone *fz;
struct fn_hash *table = (struct fn_hash *)tb->tb_data;
s_m = cb->args[2];
- read_lock(&fib_hash_lock);
- for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
- if (m < s_m) continue;
+ rcu_read_lock();
+ for (fz = rcu_dereference(table->fn_zone_list);
+ fz != NULL;
+ fz = rcu_dereference(fz->fz_next), m++) {
+ if (m < s_m)
+ continue;
if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
cb->args[2] = m;
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
return -1;
}
memset(&cb->args[3], 0,
sizeof(cb->args) - 3*sizeof(cb->args[0]));
}
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
cb->args[2] = m;
return skb->len;
}
@@ -825,8 +864,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
iter->genid = fib_hash_genid;
iter->valid = 1;
- for (iter->zone = table->fn_zone_list; iter->zone;
- iter->zone = iter->zone->fz_next) {
+ for (iter->zone = rcu_dereference(table->fn_zone_list);
+ iter->zone != NULL;
+ iter->zone = rcu_dereference(iter->zone->fz_next)) {
int maxslot;
if (!iter->zone->fz_nent)
@@ -911,7 +951,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
}
}
- iter->zone = iter->zone->fz_next;
+ iter->zone = rcu_dereference(iter->zone->fz_next);
if (!iter->zone)
goto out;
@@ -950,11 +990,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
}
static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(fib_hash_lock)
+ __acquires(RCU)
{
void *v = NULL;
- read_lock(&fib_hash_lock);
+ rcu_read_lock();
if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
return v;
@@ -967,15 +1007,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(fib_hash_lock)
+ __releases(RCU)
{
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
}
static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
+ [7] = RTF_REJECT,
+ [8] = RTF_REJECT,
};
unsigned flags = type2flags[type];
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973b..a29edf2219c 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
u8 fa_type;
u8 fa_scope;
u8 fa_state;
-#ifdef CONFIG_IP_FIB_TRIE
struct rcu_head rcu;
-#endif
};
#define FA_S_ACCESSED 0x01
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
/* Exported by fib_semantics.c */
extern int fib_semantic_match(struct list_head *head,
const struct flowi *flp,
- struct fib_result *res, int prefixlen);
+ struct fib_result *res, int prefixlen, int fib_flags);
extern void fib_release_info(struct fib_info *);
extern struct fib_info *fib_create_info(struct fib_config *cfg);
extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff56..7981a24f5c7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
* IPv4 Forwarding Information Base: policy rules.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * Thomas Graf <tgraf@suug.ch>
+ * Thomas Graf <tgraf@suug.ch>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
* 2 of the License, or (at your option) any later version.
*
* Fixes:
- * Rani Assaf : local_rule cannot be deleted
+ * Rani Assaf : local_rule cannot be deleted
* Marc Boucher : routing by fwmark
*/
@@ -32,8 +32,7 @@
#include <net/ip_fib.h>
#include <net/fib_rules.h>
-struct fib4_rule
-{
+struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
+ .flags = FIB_LOOKUP_NOREF,
};
int err;
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
goto errout;
}
- if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
+ tbl = fib_get_table(rule->fr_net, rule->table);
+ if (!tbl)
goto errout;
- err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
+ err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e..3e0da3ef611 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
static DEFINE_SPINLOCK(fib_multipath_lock);
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
-for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < (fi)->fib_nhs; \
+ nh++, nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < (fi)->fib_nhs; \
+ nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
/* Hope, that gcc will optimize it to get rid of dummy loop */
-#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -86,63 +95,70 @@ static const struct
int error;
u8 scope;
} fib_props[RTN_MAX + 1] = {
- {
+ [RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_UNSPEC */
- {
+ },
+ [RTN_UNICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNICAST */
- {
+ },
+ [RTN_LOCAL] = {
.error = 0,
.scope = RT_SCOPE_HOST,
- }, /* RTN_LOCAL */
- {
+ },
+ [RTN_BROADCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_BROADCAST */
- {
+ },
+ [RTN_ANYCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_ANYCAST */
- {
+ },
+ [RTN_MULTICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_MULTICAST */
- {
+ },
+ [RTN_BLACKHOLE] = {
.error = -EINVAL,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_BLACKHOLE */
- {
+ },
+ [RTN_UNREACHABLE] = {
.error = -EHOSTUNREACH,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNREACHABLE */
- {
+ },
+ [RTN_PROHIBIT] = {
.error = -EACCES,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_PROHIBIT */
- {
+ },
+ [RTN_THROW] = {
.error = -EAGAIN,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_THROW */
- {
+ },
+ [RTN_NAT] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_NAT */
- {
+ },
+ [RTN_XRESOLVE] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_XRESOLVE */
+ },
};
/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ kfree(fi);
+}
+
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
+ pr_warning("Freeing alive fib_info %p\n", fi);
return;
}
change_nexthops(fi) {
@@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
} endfor_nexthops(fi);
fib_info_cnt--;
release_net(fi->fib_net);
- kfree(fi);
+ call_rcu(&fi->rcu, free_fib_info_rcu);
}
void fib_release_info(struct fib_info *fi)
@@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
spin_unlock_bh(&fib_info_lock);
}
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
const struct fib_nh *onh = ofi->fib_nh;
@@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
#ifdef CONFIG_NET_CLS_ROUTE
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
return -1;
onh++;
} endfor_nexthops(fi);
@@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
nfi->fib_priority == fi->fib_priority &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
}
/* Check, that the gateway is already configured.
- Used only by redirect accept routine.
+ * Used only by redirect accept routine.
*/
-
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
@@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hlist_for_each_entry(nh, node, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
- !(nh->nh_flags&RTNH_F_DEAD)) {
+ !(nh->nh_flags & RTNH_F_DEAD)) {
spin_unlock(&fib_info_lock);
return 0;
}
@@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
}
if (state == NUD_REACHABLE)
return 0;
- if ((state&NUD_VALID) && order != dflt)
+ if ((state & NUD_VALID) && order != dflt)
return 0;
- if ((state&NUD_VALID) ||
- (*last_idx<0 && order > dflt)) {
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt)) {
*last_resort = fi;
*last_idx = order;
}
@@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
/*
- Picture
- -------
-
- Semantics of nexthop is very messy by historical reasons.
- We have to take into account, that:
- a) gateway can be actually local interface address,
- so that gatewayed route is direct.
- b) gateway must be on-link address, possibly
- described not by an ifaddr, but also by a direct route.
- c) If both gateway and interface are specified, they should not
- contradict.
- d) If we use tunnel routes, gateway could be not on-link.
-
- Attempt to reconcile all of these (alas, self-contradictory) conditions
- results in pretty ugly and hairy code with obscure logic.
-
- I chose to generalized it instead, so that the size
- of code does not increase practically, but it becomes
- much more general.
- Every prefix is assigned a "scope" value: "host" is local address,
- "link" is direct route,
- [ ... "site" ... "interior" ... ]
- and "universe" is true gateway route with global meaning.
-
- Every prefix refers to a set of "nexthop"s (gw, oif),
- where gw must have narrower scope. This recursion stops
- when gw has LOCAL scope or if "nexthop" is declared ONLINK,
- which means that gw is forced to be on link.
-
- Code is still hairy, but now it is apparently logically
- consistent and very flexible. F.e. as by-product it allows
- to co-exists in peace independent exterior and interior
- routing processes.
-
- Normally it looks as following.
-
- {universe prefix} -> (gw, oif) [scope link]
- |
- |-> {link prefix} -> (gw, oif) [scope local]
- |
- |-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
*/
-
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
int err;
struct net *net;
+ struct net_device *dev;
net = cfg->fc_nlinfo.nl_net;
if (nh->nh_gw) {
struct fib_result res;
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
+ if (nh->nh_flags & RTNH_F_ONLINK) {
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
+ dev = __dev_get_by_index(net, nh->nh_oif);
+ if (!dev)
return -ENODEV;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return -ENETDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
return 0;
}
+ rcu_read_lock();
{
struct flowi fl = {
.nl_u = {
@@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl.fl4_scope < RT_SCOPE_LINK)
fl.fl4_scope = RT_SCOPE_LINK;
- if ((err = fib_lookup(net, &fl, &res)) != 0)
+ err = fib_lookup(net, &fl, &res);
+ if (err) {
+ rcu_read_unlock();
return err;
+ }
}
err = -EINVAL;
if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
goto out;
nh->nh_scope = res.scope;
nh->nh_oif = FIB_RES_OIF(res);
- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+ nh->nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev)
goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
- goto out;
- err = 0;
-out:
- fib_res_put(&res);
- return err;
+ dev_hold(dev);
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
return -EINVAL;
+ rcu_read_lock();
+ err = -ENODEV;
in_dev = inetdev_by_index(net, nh->nh_oif);
if (in_dev == NULL)
- return -ENODEV;
- if (!(in_dev->dev->flags&IFF_UP)) {
- in_dev_put(in_dev);
- return -ENETDOWN;
- }
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP))
+ goto out;
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
- in_dev_put(in_dev);
+ err = 0;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return err;
}
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
unsigned int mask = (fib_hash_size - 1);
- return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+ return ((__force u32)val ^
+ ((__force u32)val >> 7) ^
+ ((__force u32)val >> 14)) & mask;
}
static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
return kzalloc(bytes, GFP_KERNEL);
else
return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(bytes));
}
static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} else {
change_nexthops(fi) {
- if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
+ err = fib_check_nh(cfg, fi, nexthop_nh);
+ if (err != 0)
goto failure;
} endfor_nexthops(fi)
}
@@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
}
link_it:
- if ((ofi = fib_find_info(fi)) != NULL) {
+ ofi = fib_find_info(fi);
+ if (ofi) {
fi->fib_dead = 1;
free_fib_info(fi);
ofi->fib_treeref++;
@@ -864,7 +886,7 @@ failure:
/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen)
+ struct fib_result *res, int prefixlen, int fib_flags)
{
struct fib_alias *fa;
int nh_sel = 0;
@@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
if (fa->fa_scope < flp->fl4_scope)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+ fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (err == 0) {
@@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
case RTN_ANYCAST:
case RTN_MULTICAST:
for_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
+ if (nh->nh_flags & RTNH_F_DEAD)
continue;
if (!flp->oif || flp->oif == nh->nh_oif)
break;
@@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
goto out_fill_res;
}
#else
- if (nhsel < 1) {
+ if (nhsel < 1)
goto out_fill_res;
- }
#endif
endfor_nexthops(fi);
continue;
default:
- printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
- fa->fa_type);
+ pr_warning("fib_semantic_match bad type %#x\n",
+ fa->fa_type);
return -EINVAL;
}
}
@@ -929,7 +950,8 @@ out_fill_res:
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
- atomic_inc(&res->fi->fib_clntref);
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&res->fi->fib_clntref);
return 0;
}
@@ -1028,10 +1050,10 @@ nla_put_failure:
}
/*
- Update FIB if:
- - local address disappeared -> we must delete all the entries
- referring to it.
- - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
*/
int fib_sync_down_addr(struct net *net, __be32 local)
{
@@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
prev_fi = fi;
dead = 0;
change_nexthops(fi) {
- if (nexthop_nh->nh_flags&RTNH_F_DEAD)
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
dead++;
else if (nexthop_nh->nh_dev == dev &&
nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
- Dead device goes up. We wake up dead nexthops.
- It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
*/
-
int fib_sync_up(struct net_device *dev)
{
struct fib_info *prev_fi;
@@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev)
struct fib_nh *nh;
int ret;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return 0;
prev_fi = NULL;
@@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
alive++;
continue;
}
if (nexthop_nh->nh_dev == NULL ||
- !(nexthop_nh->nh_dev->flags&IFF_UP))
+ !(nexthop_nh->nh_dev->flags & IFF_UP))
continue;
if (nexthop_nh->nh_dev != dev ||
!__in_dev_get_rtnl(dev))
@@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev)
}
/*
- The algorithm is suboptimal, but it provides really
- fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
*/
-
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
struct fib_info *fi = res->fi;
@@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
power += nexthop_nh->nh_weight;
nexthop_nh->nh_power = nexthop_nh->nh_weight;
}
@@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
/* w should be random number [0..fi->fib_power-1],
- it is pretty bad approximation.
+ * it is pretty bad approximation.
*/
w = jiffies % fi->fib_power;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
nexthop_nh->nh_power) {
- if ((w -= nexthop_nh->nh_power) <= 0) {
+ w -= nexthop_nh->nh_power;
+ if (w <= 0) {
nexthop_nh->nh_power--;
fi->fib_power--;
res->nh_sel = nhsel;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a96e5ec211a..cd5e13aee7d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1342,7 +1342,7 @@ err:
/* should be called with rcu_read_lock */
static int check_leaf(struct trie *t, struct leaf *l,
t_key key, const struct flowi *flp,
- struct fib_result *res)
+ struct fib_result *res, int fib_flags)
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen);
+ err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (err <= 0)
@@ -1372,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
}
int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
- struct fib_result *res)
+ struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
@@ -1384,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
t_key cindex = 0;
int current_prefix_length = KEYLENGTH;
struct tnode *cn;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
+ t_key pref_mismatch;
rcu_read_lock();
@@ -1399,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+ ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1424,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+ ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1500,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* matching prefix.
*/
- node_prefix = mask_pfx(cn->key, cn->pos);
- key_prefix = mask_pfx(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
+ pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
/*
* In short: If skipped bits in this node do not match
@@ -1511,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* state.directly.
*/
if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch << 1;
- }
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+ int mp = KEYLENGTH - fls(pref_mismatch);
- if (key_prefix != 0)
+ if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
goto backtrace;
if (current_prefix_length >= cn->pos)
@@ -1846,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb,
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+
+ fib_alias_accessed(fa);
if (fi == NULL) {
if (next_fi != res->fi)
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index b546736da2e..caea6885fdb 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -22,7 +22,7 @@
#include <net/gre.h>
-const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
+static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a1ad0e7180d..c8877c6c721 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
return; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_Query_Response_Interval;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
return;
@@ -1257,14 +1269,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
- igmp_mod_timer(im, IGMP_Initial_Report_Delay);
- return;
- }
- /* else, v3 */
- im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
- igmp_ifc_event(in_dev);
+ /* a failover is happening and switches
+ * must be notified immediately */
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
#endif
}
EXPORT_SYMBOL(ip_mc_rejoin_group);
@@ -1406,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
write_unlock_bh(&in_dev->mc_list_lock);
}
+/* RTNL is locked */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
struct flowi fl = { .nl_u = { .ip4_u =
@@ -1416,15 +1429,12 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
if (imr->imr_ifindex) {
idev = inetdev_by_index(net, imr->imr_ifindex);
- if (idev)
- __in_dev_put(idev);
return idev;
}
if (imr->imr_address.s_addr) {
- dev = ip_dev_find(net, imr->imr_address.s_addr);
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
- dev_put(dev);
}
if (!dev && !ip_route_output_key(net, &rt, &fl)) {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce32..ba804266584 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
bc += op->no;
}
}
- return (len == 0);
+ return len == 0;
}
static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index f4dc879e258..168440834ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
- return (qp->id == arg->iph->id &&
+ return qp->id == arg->iph->id &&
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
- qp->user == arg->user);
+ qp->user == arg->user;
}
/* Memory Tracking Functions. */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 85176895495..d0ffcbe369b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -46,7 +46,7 @@
#include <net/rtnetlink.h>
#include <net/gre.h>
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -64,13 +64,13 @@
We cannot track such dead loops during route installation,
it is infeasible task. The most general solutions would be
to keep skb->encapsulation counter (sort of local ttl),
- and silently drop packet when it expires. It is the best
+ and silently drop packet when it expires. It is a good
solution, but it supposes maintaing new variable in ALL
skb, even if no tunneling is used.
- Current solution: HARD_TX_LOCK lock breaks dead loops.
-
-
+ Current solution: xmit_recursion breaks dead loops. This is a percpu
+ counter, since when we enter the first ndo_xmit(), cpu migration is
+ forbidden. We force an exit if this counter reaches RECURSION_LIMIT
2. Networking dead loops would not kill routers, but would really
kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -129,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
static int ipgre_net_id __read_mostly;
struct ipgre_net {
- struct ip_tunnel *tunnels[4][HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
struct net_device *fb_tunnel_dev;
};
@@ -159,13 +159,40 @@ struct ipgre_net {
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipgre_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
/* Given src, dst and key, find appropriate for input tunnel. */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -174,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
{
struct net *net = dev_net(dev);
int link = dev->ifindex;
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(key);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(key);
struct ip_tunnel *t, *cand = NULL;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -290,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
return NULL;
}
-static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
+static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
- unsigned h = HASH(key);
+ unsigned int h = HASH(key);
int prio = 0;
if (local)
@@ -309,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
return &ign->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
+static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
struct ip_tunnel *t)
{
return __ipgre_bucket(ign, &t->parms);
@@ -317,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipgre_bucket(ign, t);
+ struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
- spin_lock_bh(&ipgre_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipgre_lock);
}
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipgre_lock);
- *tp = t->next;
- spin_unlock_bh(&ipgre_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipgre_bucket(ign, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -347,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
int link = parms->link;
- struct ip_tunnel *t, **tp;
+ struct ip_tunnel *t;
+ struct ip_tunnel __rcu **tp;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+ for (tp = __ipgre_bucket(ign, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next)
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
key == t->parms.i_key &&
@@ -361,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
return t;
}
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
struct ip_tunnel *t, *nt;
@@ -583,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
if ((tunnel = ipgre_tunnel_lookup(skb->dev,
iph->saddr, iph->daddr, key,
gre_proto))) {
- struct net_device_stats *stats = &tunnel->dev->stats;
+ struct pcpu_tstats *tstats;
secpath_reset(skb);
@@ -607,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
/* Looped back packet, drop it! */
if (skb_rtable(skb)->fl.iif == 0)
goto drop;
- stats->multicast++;
+ tunnel->dev->stats.multicast++;
skb->pkt_type = PACKET_BROADCAST;
}
#endif
if (((flags&GRE_CSUM) && csum) ||
(!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- stats->rx_crc_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
if (tunnel->parms.i_flags&GRE_SEQ) {
if (!(flags&GRE_SEQ) ||
(tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- stats->rx_fifo_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
tunnel->i_seqno = seqno + 1;
@@ -631,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
- stats->rx_length_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
@@ -641,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb)
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
skb_reset_network_header(skb);
ipgre_ecn_decapsulate(iph, skb);
netif_rx(skb);
+
rcu_read_unlock();
- return(0);
+ return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -656,20 +690,19 @@ drop:
rcu_read_unlock();
drop_nolock:
kfree_skb(skb);
- return(0);
+ return 0;
}
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *tiph;
u8 tos;
__be16 df;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
@@ -691,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
/* NBMA tunnel */
if (skb_dst(skb) == NULL) {
- stats->tx_fifo_errors++;
+ dev->stats.tx_fifo_errors++;
goto tx_error;
}
@@ -700,7 +733,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if ((dst = rt->rt_gateway) == 0)
goto tx_error_icmp;
}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct in6_addr *addr6;
int addr_type;
@@ -737,14 +770,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
{
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ }
+;
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error;
}
}
@@ -752,7 +791,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
@@ -775,7 +814,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
goto tx_error;
}
}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
@@ -815,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
dev->needed_headroom = max_headroom;
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -851,7 +890,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if ((iph->ttl = tiph->ttl) == 0) {
if (skb->protocol == htons(ETH_P_IP))
iph->ttl = old_iph->ttl;
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6))
iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
@@ -882,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -910,13 +949,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
/* Guess output device to choose reasonable mtu and needed_headroom */
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ };
struct rtable *rt;
+
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
@@ -1013,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
} else {
- unsigned nflags = 0;
+ unsigned int nflags = 0;
t = netdev_priv(dev);
@@ -1126,7 +1171,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
- const void *daddr, const void *saddr, unsigned len)
+ const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1168,13 +1213,19 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi fl = { .oif = t->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = t->parms.iph.daddr,
- .saddr = t->parms.iph.saddr,
- .tos = RT_TOS(t->parms.iph.tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = t->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = t->parms.iph.daddr,
+ .saddr = t->parms.iph.saddr,
+ .tos = RT_TOS(t->parms.iph.tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ };
struct rtable *rt;
+
if (ip_route_output_key(dev_net(dev), &rt, &fl))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
@@ -1194,10 +1245,8 @@ static int ipgre_close(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
- if (in_dev) {
+ if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
- in_dev_put(in_dev);
- }
}
return 0;
}
@@ -1214,12 +1263,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_get_stats = ipgre_get_stats,
};
+static void ipgre_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipgre_dev_free;
dev->type = ARPHRD_IPGRE;
dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1257,6 +1313,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
} else
dev->header_ops = &ipgre_header_ops;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
return 0;
}
@@ -1275,7 +1335,7 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
tunnel->hlen = sizeof(struct iphdr) + 4;
dev_hold(dev);
- ign->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
}
@@ -1291,11 +1351,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
for (prio = 0; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = ign->tunnels[prio][h];
+ struct ip_tunnel *t;
+
+ t = rtnl_dereference(ign->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -1441,6 +1503,10 @@ static int ipgre_tap_init(struct net_device *dev)
ipgre_tunnel_bind_dev(dev);
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
return 0;
}
@@ -1451,6 +1517,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_get_stats = ipgre_get_stats,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1526,7 @@ static void ipgre_tap_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &ipgre_tap_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipgre_dev_free;
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1554,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
if (!tb[IFLA_MTU])
dev->mtu = mtu;
+ /* Can use a lockless transmit, unless we generate output sequences */
+ if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= NETIF_F_LLTX;
+
err = register_netdevice(dev);
if (err)
goto out;
@@ -1522,7 +1593,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
t = nt;
if (dev->type != ARPHRD_ETHER) {
- unsigned nflags = 0;
+ unsigned int nflags = 0;
if (ipv4_is_multicast(p.iph.daddr))
nflags = IFF_BROADCAST;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488e..1906fa35860 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
}
return -EINVAL;
}
-
+EXPORT_SYMBOL(ip_options_compile);
/*
* Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
return 0;
}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e42762023c2..439d2a34ee4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -488,9 +488,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
* we can switch to copy when see the first bad fragment.
*/
if (skb_has_frag_list(skb)) {
- struct sk_buff *frag;
+ struct sk_buff *frag, *frag2;
int first_len = skb_pagelen(skb);
- int truesizes = 0;
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
- goto slow_path;
+ goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
- goto slow_path;
+ goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
}
- truesizes += frag->truesize;
+ skb->truesize -= frag->truesize;
}
/* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
frag = skb_shinfo(skb)->frag_list;
skb_frag_list_init(skb);
skb->data_len = first_len - skb_headlen(skb);
- skb->truesize -= truesizes;
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off = htons(IP_MF);
@@ -576,6 +574,15 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
}
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
}
slow_path:
@@ -926,16 +933,19 @@ alloc_new_skb:
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
- alloclen = datalen + fragheaderlen;
+ alloclen = fraglen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
- if (datalen == length + fraggap)
+ if (datalen == length + fraggap) {
alloclen += rt->dst.trailer_len;
-
+ /* make sure mtu is not reached */
+ if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
+ datalen -= ALIGN(rt->dst.trailer_len, 8);
+ }
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6c40a8c46e7..64b70ad162e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1129,6 +1129,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_HDRINCL:
val = inet->hdrincl;
break;
+ case IP_NODEFRAG:
+ val = inet->nodefrag;
+ break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3c6f8f3968a..e9b816e6cd7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
static int ipip_net_id __read_mostly;
struct ipip_net {
- struct ip_tunnel *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel *tunnels_r[HASH_SIZE];
- struct ip_tunnel *tunnels_l[HASH_SIZE];
- struct ip_tunnel *tunnels_wc[1];
- struct ip_tunnel **tunnels[4];
+ struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel __rcu **tunnels[4];
struct net_device *fb_tunnel_dev;
};
-static void ipip_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
+static void ipip_dev_free(struct net_device *dev);
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipip_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
struct ip_tunnel *t;
struct ipip_net *ipn = net_generic(net, ipip_net_id);
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
return NULL;
}
-static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
+static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- unsigned h = 0;
+ unsigned int h = 0;
int prio = 0;
if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
return &ipn->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
+static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
struct ip_tunnel *t)
{
return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipip_lock);
- *tp = t->next;
- spin_unlock_bh(&ipip_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipip_bucket(ipn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipip_bucket(ipn, t);
+ struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
- spin_lock_bh(&ipip_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipip_lock);
}
static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- struct ip_tunnel *t, **tp, *nt;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
struct net_device *dev;
char name[IFNAMSIZ];
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
+ for (tp = __ipip_bucket(ipn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
return t;
}
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
if (parms->name[0])
strlcpy(name, parms->name, IFNAMSIZ);
else
- sprintf(name, "tunl%%d");
+ strcpy(name, "tunl%d");
dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
nt = netdev_priv(dev);
nt->parms = *parms;
- ipip_tunnel_init(dev);
+ if (ipip_tunnel_init(dev) < 0)
+ goto failed_free;
if (register_netdevice(dev) < 0)
goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
return nt;
failed_free:
- free_netdev(dev);
+ ipip_dev_free(dev);
return NULL;
}
+/* called with RTNL */
static void ipip_tunnel_uninit(struct net_device *dev)
{
struct net *net = dev_net(dev);
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- if (dev == ipn->fb_tunnel_dev) {
- spin_lock_bh(&ipip_lock);
- ipn->tunnels_wc[0] = NULL;
- spin_unlock_bh(&ipip_lock);
- } else
+ if (dev == ipn->fb_tunnel_dev)
+ rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
+ else
ipip_tunnel_unlink(ipn, netdev_priv(dev));
dev_put(dev);
}
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
rcu_read_lock();
- if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
- iph->saddr, iph->daddr)) != NULL) {
+ tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+ if (tunnel != NULL) {
+ struct pcpu_tstats *tstats;
+
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
kfree_skb(skb);
@@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
skb->pkt_type = PACKET_HOST;
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
ipip_ecn_decapsulate(iph, skb);
+
netif_rx(skb);
+
rcu_read_unlock();
return 0;
}
@@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb)
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *tiph = &tunnel->parms.iph;
u8 tos = tunnel->parms.iph.tos;
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
@@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
- if (tos&1)
+ if (tos & 1)
tos = old_iph->tos;
if (!dst) {
/* NBMA tunnel */
if ((rt = skb_rtable(skb)) == NULL) {
- stats->tx_fifo_errors++;
+ dev->stats.tx_fifo_errors++;
goto tx_error;
}
if ((dst = rt->rt_gateway) == 0)
@@ -424,14 +461,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
{
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
}
@@ -439,7 +482,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
@@ -449,7 +492,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
- stats->collisions++;
+ dev->stats.collisions++;
ip_rt_put(rt);
goto tx_error;
}
@@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
iph->ttl = old_iph->ttl;
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -544,13 +587,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
iph = &tunnel->parms.iph;
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
struct rtable *rt;
+
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
@@ -696,13 +745,19 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ipip_tunnel_change_mtu,
-
+ .ndo_get_stats = ipip_get_stats,
};
+static void ipip_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipip_dev_free;
dev->type = ARPHRD_TUNNEL;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +766,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
}
-static void ipip_tunnel_init(struct net_device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -725,9 +781,15 @@ static void ipip_tunnel_init(struct net_device *dev)
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
ipip_tunnel_bind_dev(dev);
+
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
}
-static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
@@ -740,8 +802,13 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
dev_hold(dev);
- ipn->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+ return 0;
}
static struct xfrm_tunnel ipip_handler __read_mostly = {
@@ -760,11 +827,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
for (prio = 1; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = ipn->tunnels[prio][h];
+ struct ip_tunnel *t;
+ t = rtnl_dereference(ipn->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -789,7 +857,9 @@ static int __net_init ipip_init_net(struct net *net)
}
dev_net_set(ipn->fb_tunnel_dev, net);
- ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+ err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
if ((err = register_netdev(ipn->fb_tunnel_dev)))
goto err_reg_dev;
@@ -797,7 +867,7 @@ static int __net_init ipip_init_net(struct net *net)
return 0;
err_reg_dev:
- free_netdev(ipn->fb_tunnel_dev);
+ ipip_dev_free(ipn->fb_tunnel_dev);
err_alloc_dev:
/* nothing */
return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866f..86dd5691af4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
struct net *net;
#endif
u32 id;
- struct sock *mroute_sk;
+ struct sock __rcu *mroute_sk;
struct timer_list ipmr_expire_timer;
struct list_head mfc_unres_queue;
struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
};
/* Big lock, protecting vif table, mrt cache and mroute socket state.
- Note that the changes are semaphored via rtnl_lock.
+ * Note that the changes are semaphored via rtnl_lock.
*/
static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
static DEFINE_SPINLOCK(mfc_unres_lock);
/* We return to original Alan's scheme. Hash table of resolved
- entries is changed only in process context and protected
- with weak lock mrt_lock. Queue of unresolved entries is protected
- with strong spinlock mfc_unres_lock.
-
- In this case data path is free of exclusive locks at all.
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
*/
static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
set_fs(KERNEL_DS);
err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
set_fs(oldfs);
- } else
+ } else {
err = -EOPNOTSUPP;
-
+ }
dev = NULL;
if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
dev->iflink = 0;
rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
rcu_read_unlock();
goto failure;
}
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
mrt->mroute_reg_vif_num = -1;
#endif
- if (vifi+1 == mrt->maxvif) {
+ if (vifi + 1 == mrt->maxvif) {
int tmp;
- for (tmp=vifi-1; tmp>=0; tmp--) {
+
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
if (VIF_EXISTS(mrt, tmp))
break;
}
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
dev_set_allmulti(dev, -1);
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
ip_rt_multicast_event(in_dev);
}
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
dev_put(dev);
return 0;
}
-static inline void ipmr_cache_free(struct mfc_cache *c)
+static void ipmr_cache_free_rcu(struct rcu_head *head)
{
+ struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
kmem_cache_free(mrt_cachep, c);
}
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+ call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
/* Destroy an unresolved cache entry, killing queued skbs
- and reporting error to netlink readers.
+ * and reporting error to netlink readers.
*/
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
memset(&e->msg, 0, sizeof(e->msg));
rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
- } else
+ } else {
kfree_skb(skb);
+ }
}
ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
case 0:
if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
- if (dev && dev->ip_ptr == NULL) {
+ if (dev && __in_dev_get_rtnl(dev) == NULL) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
- } else
+ } else {
dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
-
+ }
if (!dev)
return -EADDRNOTAVAIL;
err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EINVAL;
}
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
ip_rt_multicast_event(in_dev);
- /*
- * Fill in the VIF structures
- */
+ /* Fill in the VIF structures */
+
v->rate_limit = vifc->vifc_rate_limit;
v->local = vifc->vifc_lcl_addr.s_addr;
v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->pkt_in = 0;
v->pkt_out = 0;
v->link = dev->ifindex;
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
v->link = dev->iflink;
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
v->dev = dev;
#ifdef CONFIG_IP_PIMSM
- if (v->flags&VIFF_REGISTER)
+ if (v->flags & VIFF_REGISTER)
mrt->mroute_reg_vif_num = vifi;
#endif
if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return 0;
}
+/* called with rcu_read_lock() */
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
int line = MFC_HASH(mcastgrp, origin);
struct mfc_cache *c;
- list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
return c;
}
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
- if (c == NULL)
- return NULL;
- c->mfc_un.res.minvif = MAXVIFS;
+
+ if (c)
+ c->mfc_un.res.minvif = MAXVIFS;
return c;
}
static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
- if (c == NULL)
- return NULL;
- skb_queue_head_init(&c->mfc_un.unres.unresolved);
- c->mfc_un.unres.expires = jiffies + 10*HZ;
+
+ if (c) {
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ }
return c;
}
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct sk_buff *skb;
struct nlmsgerr *e;
- /*
- * Play the pending entries through our router
- */
+ /* Play the pending entries through our router */
while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
- nlh->nlmsg_len = (skb_tail_pointer(skb) -
- (u8 *)nlh);
+ nlh->nlmsg_len = skb_tail_pointer(skb) -
+ (u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
- } else
+ } else {
ip_mr_forward(net, mrt, skb, c, 0);
+ }
}
}
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
const int ihl = ip_hdrlen(pkt);
struct igmphdr *igmp;
struct igmpmsg *msg;
+ struct sock *mroute_sk;
int ret;
#ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT) {
/* Ugly, but we have no choice with this interface.
- Duplicate old header, fix ihl, length etc.
- And all this only to mangle msg->im_msgtype and
- to set msg->im_mbz to "mbz" :-)
+ * Duplicate old header, fix ihl, length etc.
+ * And all this only to mangle msg->im_msgtype and
+ * to set msg->im_mbz to "mbz" :-)
*/
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
#endif
{
- /*
- * Copy the IP header
- */
+ /* Copy the IP header */
skb->network_header = skb->tail;
skb_put(skb, ihl);
skb_copy_to_linear_data(skb, pkt->data, ihl);
- ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
+ ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
skb_dst_set(skb, dst_clone(skb_dst(pkt)));
- /*
- * Add our header
- */
+ /* Add our header */
- igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
igmp->type =
msg->im_msgtype = assert;
- igmp->code = 0;
- ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
skb->transport_header = skb->network_header;
}
- if (mrt->mroute_sk == NULL) {
+ rcu_read_lock();
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk == NULL) {
+ rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
- /*
- * Deliver to mrouted
- */
- ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
+ /* Deliver to mrouted */
+
+ ret = sock_queue_rcv_skb(mroute_sk, skb);
+ rcu_read_unlock();
if (ret < 0) {
if (net_ratelimit())
printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
if (!found) {
- /*
- * Create a new entry if allowable
- */
+ /* Create a new entry if allowable */
if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
(c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
return -ENOBUFS;
}
- /*
- * Fill in the new cache entry
- */
+ /* Fill in the new cache entry */
+
c->mfc_parent = -1;
c->mfc_origin = iph->saddr;
c->mfc_mcastgrp = iph->daddr;
- /*
- * Reflect first query at mrouted.
- */
+ /* Reflect first query at mrouted. */
+
err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
if (err < 0) {
/* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
}
- /*
- * See if we can append the packet
- */
- if (c->mfc_un.unres.unresolved.qlen>3) {
+ /* See if we can append the packet */
+
+ if (c->mfc_un.unres.unresolved.qlen > 3) {
kfree_skb(skb);
err = -ENOBUFS;
} else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
- write_lock_bh(&mrt_lock);
- list_del(&c->list);
- write_unlock_bh(&mrt_lock);
+ list_del_rcu(&c->list);
ipmr_cache_free(c);
return 0;
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
- write_lock_bh(&mrt_lock);
- list_add(&c->list, &mrt->mfc_cache_array[line]);
- write_unlock_bh(&mrt_lock);
+ list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
/*
* Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
LIST_HEAD(list);
struct mfc_cache *c, *next;
- /*
- * Shut down all active vif entries
- */
+ /* Shut down all active vif entries */
+
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+ if (!(mrt->vif_table[i].flags & VIFF_STATIC))
vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
- /*
- * Wipe the cache
- */
+ /* Wipe the cache */
+
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags&MFC_STATIC)
+ if (c->mfc_flags & MFC_STATIC)
continue;
- write_lock_bh(&mrt_lock);
- list_del(&c->list);
- write_unlock_bh(&mrt_lock);
-
+ list_del_rcu(&c->list);
ipmr_cache_free(c);
}
}
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
}
}
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
static void mrtsock_destruct(struct sock *sk)
{
struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
rtnl_lock();
ipmr_for_each_table(mrt, net) {
- if (sk == mrt->mroute_sk) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
-
- write_lock_bh(&mrt_lock);
- mrt->mroute_sk = NULL;
- write_unlock_bh(&mrt_lock);
-
+ rcu_assign_pointer(mrt->mroute_sk, NULL);
mroute_clean_tables(mrt);
}
}
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOENT;
if (optname != MRT_INIT) {
- if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
+ if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
+ !capable(CAP_NET_ADMIN))
return -EACCES;
}
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOPROTOOPT;
rtnl_lock();
- if (mrt->mroute_sk) {
+ if (rtnl_dereference(mrt->mroute_sk)) {
rtnl_unlock();
return -EADDRINUSE;
}
ret = ip_ra_control(sk, 1, mrtsock_destruct);
if (ret == 0) {
- write_lock_bh(&mrt_lock);
- mrt->mroute_sk = sk;
- write_unlock_bh(&mrt_lock);
-
+ rcu_assign_pointer(mrt->mroute_sk, sk);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
}
rtnl_unlock();
return ret;
case MRT_DONE:
- if (sk != mrt->mroute_sk)
+ if (sk != rcu_dereference_raw(mrt->mroute_sk))
return -EACCES;
return ip_ra_control(sk, 0, NULL);
case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENFILE;
rtnl_lock();
if (optname == MRT_ADD_VIF) {
- ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
+ ret = vif_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
} else {
ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
}
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (optname == MRT_DEL_MFC)
ret = ipmr_mfc_delete(mrt, &mfc);
else
- ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
+ ret = ipmr_mfc_add(net, mrt, &mfc,
+ sk == rtnl_dereference(mrt->mroute_sk));
rtnl_unlock();
return ret;
/*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
case MRT_ASSERT:
{
int v;
- if (get_user(v,(int __user *)optval))
+ if (get_user(v, (int __user *)optval))
return -EFAULT;
mrt->mroute_do_assert = (v) ? 1 : 0;
return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
{
int v;
- if (get_user(v,(int __user *)optval))
+ if (get_user(v, (int __user *)optval))
return -EFAULT;
v = (v) ? 1 : 0;
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -EINVAL;
if (get_user(v, (u32 __user *)optval))
return -EFAULT;
- if (sk == mrt->mroute_sk)
- return -EBUSY;
rtnl_lock();
ret = 0;
- if (!ipmr_new_table(net, v))
- ret = -ENOMEM;
- raw_sk(sk)->ipmr_table = v;
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EBUSY;
+ } else {
+ if (!ipmr_new_table(net, v))
+ ret = -ENOMEM;
+ raw_sk(sk)->ipmr_table = v;
+ }
rtnl_unlock();
return ret;
}
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
- optname!=MRT_PIM &&
+ optname != MRT_PIM &&
#endif
- optname!=MRT_ASSERT)
+ optname != MRT_ASSERT)
return -ENOPROTOOPT;
if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (copy_from_user(&sr, arg, sizeof(sr)))
return -EFAULT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
sr.pktcnt = c->mfc_un.res.pkt;
sr.bytecnt = c->mfc_un.res.bytes;
sr.wrong_if = c->mfc_un.res.wrong_if;
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
default:
return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
};
/*
- * Encapsulate a packet by attaching a valid IPIP header to it.
+ * Encapsulate a packet by attaching a valid IPIP header to it.
* This avoids tunnel drivers and other mess and gives us the speed so
* important for multicast video.
*/
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
skb_reset_network_header(skb);
iph = ip_hdr(skb);
- iph->version = 4;
+ iph->version = 4;
iph->tos = old_iph->tos;
iph->ttl = old_iph->ttl;
iph->frag_off = 0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
@@ -1535,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
}
#endif
- if (vif->flags&VIFF_TUNNEL) {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = vif->remote,
- .saddr = vif->local,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ if (vif->flags & VIFF_TUNNEL) {
+ struct flowi fl = {
+ .oif = vif->link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = vif->remote,
+ .saddr = vif->local,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(net, &rt, &fl))
goto out_free;
encap = sizeof(struct iphdr);
} else {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = vif->link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(net, &rt, &fl))
goto out_free;
}
@@ -1559,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
- allow to send ICMP, so that packets will disappear
- to blackhole.
+ * allow to send ICMP, so that packets will disappear
+ * to blackhole.
*/
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
ip_decrease_ttl(ip_hdr(skb));
/* FIXME: forward and output firewalls used to be called here.
- * What do we do with netfilter? -- RR */
+ * What do we do with netfilter? -- RR
+ */
if (vif->flags & VIFF_TUNNEL) {
ip_encap(skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (skb_rtable(skb)->fl.iif == 0) {
/* It is our own packet, looped back.
- Very complicated situation...
-
- The best workaround until routing daemons will be
- fixed is not to redistribute packet, if it was
- send through wrong interface. It means, that
- multicast applications WILL NOT work for
- (S,G), which have default multicast route pointing
- to wrong oif. In any case, it is not a good
- idea to use multicasting applications on router.
+ * Very complicated situation...
+ *
+ * The best workaround until routing daemons will be
+ * fixed is not to redistribute packet, if it was
+ * send through wrong interface. It means, that
+ * multicast applications WILL NOT work for
+ * (S,G), which have default multicast route pointing
+ * to wrong oif. In any case, it is not a good
+ * idea to use multicasting applications on router.
*/
goto dont_forward;
}
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
- so that we cannot check that packet arrived on an oif.
- It is bad, but otherwise we would need to move pretty
- large chunk of pimd to kernel. Ough... --ANK
+ * so that we cannot check that packet arrived on an oif.
+ * It is bad, but otherwise we would need to move pretty
+ * large chunk of pimd to kernel. Ough... --ANK
*/
(mrt->mroute_do_pim ||
cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
/*
* Forward the frame
*/
- for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+ for (ct = cache->mfc_un.res.maxvif - 1;
+ ct >= cache->mfc_un.res.minvif; ct--) {
if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
ipmr_queue_xmit(net, mrt, skb2, cache,
psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
ipmr_queue_xmit(net, mrt, skb2, cache, psend);
} else {
@@ -1713,6 +1728,7 @@ dont_forward:
/*
* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
*/
int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
int err;
/* Packet is looped back after forward, it should not be
- forwarded second time, but still can be delivered locally.
+ * forwarded second time, but still can be delivered locally.
*/
- if (IPCB(skb)->flags&IPSKB_FORWARDED)
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto dont_forward;
err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
}
if (!local) {
- if (IPCB(skb)->opt.router_alert) {
- if (ip_call_ra_chain(skb))
- return 0;
- } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
- /* IGMPv1 (and broken IGMPv2 implementations sort of
- Cisco IOS <= 11.2(8)) do not put router alert
- option to IGMP packets destined to routable
- groups. It is very bad, because it means
- that we can forward NO IGMP messages.
- */
- read_lock(&mrt_lock);
- if (mrt->mroute_sk) {
- nf_reset(skb);
- raw_rcv(mrt->mroute_sk, skb);
- read_unlock(&mrt_lock);
- return 0;
- }
- read_unlock(&mrt_lock);
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+ /* IGMPv1 (and broken IGMPv2 implementations sort of
+ * Cisco IOS <= 11.2(8)) do not put router alert
+ * option to IGMP packets destined to routable
+ * groups. It is very bad, because it means
+ * that we can forward NO IGMP messages.
+ */
+ struct sock *mroute_sk;
+
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk) {
+ nf_reset(skb);
+ raw_rcv(mroute_sk, skb);
+ return 0;
+ }
}
}
- read_lock(&mrt_lock);
+ /* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
/*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
ip_local_deliver(skb);
- if (skb2 == NULL) {
- read_unlock(&mrt_lock);
+ if (skb2 == NULL)
return -ENOBUFS;
- }
skb = skb2;
}
+ read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, skb->dev);
if (vif >= 0) {
int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
return -ENODEV;
}
+ read_lock(&mrt_lock);
ip_mr_forward(net, mrt, skb, cache, local);
-
read_unlock(&mrt_lock);
if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
}
#ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
unsigned int pimlen)
{
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/*
- Check that:
- a. packet is really destinted to a multicast group
- b. packet is not a NULL-REGISTER
- c. packet is not truncated
+ * Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
*/
if (!ipv4_is_multicast(encap->daddr) ||
encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
read_lock(&mrt_lock);
if (mrt->mroute_reg_vif_num >= 0)
reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
read_unlock(&mrt_lock);
if (reg_dev == NULL)
return 1;
skb->mac_header = skb->network_header;
- skb_pull(skb, (u8*)encap - skb->data);
+ skb_pull(skb, (u8 *)encap - skb->data);
skb_reset_network_header(skb);
skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = 0;
+ skb->ip_summed = CHECKSUM_NONE;
skb->pkt_type = PACKET_HOST;
skb_tunnel_rx(skb, reg_dev);
netif_rx(skb);
- dev_put(reg_dev);
- return 0;
+ return NET_RX_SUCCESS;
}
#endif
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
* Handle IGMP messages of PIMv1
*/
-int pim_rcv_v1(struct sk_buff * skb)
+int pim_rcv_v1(struct sk_buff *skb)
{
struct igmphdr *pim;
struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
#endif
#ifdef CONFIG_IP_PIMSM_V2
-static int pim_rcv(struct sk_buff * skb)
+static int pim_rcv(struct sk_buff *skb)
{
struct pimreghdr *pim;
struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
- (pim->flags&PIM_NULL_REGISTER) ||
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
if (mrt == NULL)
return -ENOENT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
if (cache == NULL) {
struct sk_buff *skb2;
struct iphdr *iph;
struct net_device *dev;
- int vif;
+ int vif = -1;
if (nowait) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EAGAIN;
}
dev = skb->dev;
- if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
+ read_lock(&mrt_lock);
+ if (dev)
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif < 0) {
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENODEV;
}
skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2) {
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2);
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
- if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ read_lock(&mrt_lock);
+ if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[1];
s_e = cb->args[2];
- read_lock(&mrt_lock);
+ rcu_read_lock();
ipmr_for_each_table(mrt, net) {
if (t < s_t)
goto next_table;
if (t > s_t)
s_h = 0;
for (h = s_h; h < MFC_LINES; h++) {
- list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
if (e < s_e)
goto next_entry;
if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
t++;
}
done:
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
cb->args[2] = e;
cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
#ifdef CONFIG_PROC_FS
/*
- * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
+ * The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
*/
struct ipmr_vif_iter {
struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct mr_table *mrt = it->mrt;
struct mfc_cache *mfc;
- read_lock(&mrt_lock);
+ rcu_read_lock();
for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
it->cache = &mrt->mfc_cache_array[it->ct];
- list_for_each_entry(mfc, it->cache, list)
+ list_for_each_entry_rcu(mfc, it->cache, list)
if (pos-- == 0)
return mfc;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
spin_lock_bh(&mfc_unres_lock);
it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* exhausted cache_array, show unresolved */
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
it->cache = &mrt->mfc_unres_queue;
it->ct = 0;
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (!list_empty(it->cache))
return list_first_entry(it->cache, struct mfc_cache, list);
- end_of_list:
+end_of_list:
spin_unlock_bh(&mfc_unres_lock);
it->cache = NULL;
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
else if (it->cache == &mrt->mfc_cache_array[it->ct])
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
mfc->mfc_un.res.bytes,
mfc->mfc_un.res.wrong_if);
for (n = mfc->mfc_un.res.minvif;
- n < mfc->mfc_un.res.maxvif; n++ ) {
+ n < mfc->mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
mfc->mfc_un.res.ttls[n] < 255)
seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
mrt_cachep = kmem_cache_create("ip_mrt_cache",
sizeof(struct mfc_cache),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
if (!mrt_cachep)
return -ENOMEM;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index d756edae59e..3cad2591ace 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
for (i = 0; i < len; i++)
ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
- return (ret != 0);
+ return ret != 0;
}
/*
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b254dafaf42..43eec80c0e7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -112,6 +112,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->protocol = htons(ETH_P_IP);
if (ip_route_me_harder(nskb, addr_type))
goto free_nskb;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index eab8de32f20..f3a9b42b16c 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,9 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && inet->nodefrag)
+ if (sk && (sk->sk_family == PF_INET) &&
+ inet->nodefrag)
return NF_ACCEPT;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963..ee5f419d0a5 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
unsigned char s[4];
if (offset & 1) {
- s[0] = s[2] = 0;
+ s[0] = ~0;
s[1] = ~*optr;
+ s[2] = 0;
s[3] = *nptr;
} else {
- s[1] = s[3] = 0;
s[0] = ~*optr;
+ s[1] = ~0;
s[2] = *nptr;
+ s[3] = 0;
}
*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e24d48dd99d..d6cb2bfcd8e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
.local_out = __ip_local_out,
- .entries = ATOMIC_INIT(0),
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
- atomic_read(&ipv4_dst_ops.entries),
+ dst_entries_get_slow(&ipv4_dst_ops),
st->in_hit,
st->in_slow_tot,
st->in_slow_mc,
@@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
struct rtable *rth, **rthp;
unsigned long now = jiffies;
int goal;
+ int entries = dst_entries_get_fast(&ipv4_dst_ops);
/*
* Garbage collection is pretty expensive,
@@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
RT_CACHE_STAT_INC(gc_total);
if (now - last_gc < ip_rt_gc_min_interval &&
- atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+ entries < ip_rt_max_size) {
RT_CACHE_STAT_INC(gc_ignored);
goto out;
}
+ entries = dst_entries_get_slow(&ipv4_dst_ops);
/* Calculate number of entries, which we want to expire now. */
- goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << rt_hash_log);
+ goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ goal = entries - equilibrium;
if (goal > 0) {
equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ goal = entries - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ equilibrium = entries - goal;
}
if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
expire >>= 1;
#if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, i);
+ dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
} while (!in_softirq() && time_before_eq(jiffies, now));
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+ goto out;
+ if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
if (net_ratelimit())
printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
work_done:
expire += ip_rt_gc_min_interval;
if (expire > ip_rt_gc_timeout ||
- atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+ dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, rover);
+ dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out: return 0;
}
@@ -1102,23 +1105,23 @@ restart:
* Note that we do rt_free on this new route entry, so that
* once its refcount hits zero, we are still able to reap it
* (Thanks Alexey)
- * Note also the rt_free uses call_rcu. We don't actually
- * need rcu protection here, this is just our path to get
- * on the route gc list.
+ * Note: To avoid expensive rcu stuff for this uncached dst,
+ * we set DST_NOCACHE so that dst_release() can free dst without
+ * waiting a grace period.
*/
+ rt->dst.flags |= DST_NOCACHE;
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
int err = arp_bind_neighbour(&rt->dst);
if (err) {
if (net_ratelimit())
printk(KERN_WARNING
"Neighbour table failure & not caching routes.\n");
- rt_drop(rt);
+ ip_rt_put(rt);
return err;
}
}
- rt_free(rt);
goto skip_hashing;
}
@@ -1231,7 +1234,7 @@ restart:
}
if (net_ratelimit())
- printk(KERN_WARNING "Neighbour table overflow.\n");
+ printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
rt_drop(rt);
return -ENOBUFS;
}
@@ -1772,12 +1775,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->fl.iif == 0)
src = rt->rt_src;
- else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
- src = FIB_RES_PREFSRC(res);
- fib_res_put(&res);
- } else
- src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
+ else {
+ rcu_read_lock();
+ if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+ src = FIB_RES_PREFSRC(res);
+ else
+ src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
RT_SCOPE_UNIVERSE);
+ rcu_read_unlock();
+ }
memcpy(addr, &src, 4);
}
@@ -2080,6 +2086,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
* 2. IP spoofing attempts are filtered with 100% of guarantee.
+ * called with rcu_read_lock()
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2101,7 +2108,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
- int free_res = 0;
struct net * net = dev_net(dev);
/* IP on this device is disabled. */
@@ -2117,7 +2123,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
ipv4_is_loopback(saddr))
goto martian_source;
- if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+ if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
/* Accept zero addresses only to limited broadcast;
@@ -2126,19 +2132,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_zeronet(saddr))
goto martian_source;
- if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
- ipv4_is_loopback(daddr))
+ if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
goto martian_destination;
/*
* Now we are ready to route packet.
*/
- if ((err = fib_lookup(net, &fl, &res)) != 0) {
+ err = fib_lookup(net, &fl, &res);
+ if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
goto no_route;
}
- free_res = 1;
RT_CACHE_STAT_INC(in_slow_tot);
@@ -2147,8 +2152,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type == RTN_LOCAL) {
err = fib_validate_source(saddr, daddr, tos,
- net->loopback_dev->ifindex,
- dev, &spec_dst, &itag, skb->mark);
+ net->loopback_dev->ifindex,
+ dev, &spec_dst, &itag, skb->mark);
if (err < 0)
goto martian_source_keep_err;
if (err)
@@ -2163,9 +2168,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_destination;
err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
-done:
- if (free_res)
- fib_res_put(&res);
out: return err;
brd_input:
@@ -2225,7 +2227,7 @@ local_input:
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
- goto done;
+ goto out;
no_route:
RT_CACHE_STAT_INC(in_no_route);
@@ -2248,21 +2250,21 @@ martian_destination:
e_hostunreach:
err = -EHOSTUNREACH;
- goto done;
+ goto out;
e_inval:
err = -EINVAL;
- goto done;
+ goto out;
e_nobufs:
err = -ENOBUFS;
- goto done;
+ goto out;
martian_source:
err = -EINVAL;
martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
- goto done;
+ goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2348,6 +2350,7 @@ skip_cache:
}
EXPORT_SYMBOL(ip_route_input_common);
+/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
struct fib_result *res,
const struct flowi *fl,
@@ -2358,53 +2361,47 @@ static int __mkroute_output(struct rtable **result,
struct rtable *rth;
struct in_device *in_dev;
u32 tos = RT_FL_TOS(oldflp);
- int err = 0;
- if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
return -EINVAL;
- if (fl->fl4_dst == htonl(0xFFFFFFFF))
+ if (ipv4_is_lbcast(fl->fl4_dst))
res->type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl->fl4_dst))
res->type = RTN_MULTICAST;
- else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
+ else if (ipv4_is_zeronet(fl->fl4_dst))
return -EINVAL;
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- /* get work reference to inet device */
- in_dev = in_dev_get(dev_out);
+ in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
return -EINVAL;
if (res->type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- if (res->fi) {
- fib_info_put(res->fi);
- res->fi = NULL;
- }
+ res->fi = NULL;
} else if (res->type == RTN_MULTICAST) {
- flags |= RTCF_MULTICAST|RTCF_LOCAL;
+ flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
oldflp->proto))
flags &= ~RTCF_LOCAL;
/* If multicast route do not exist use
- default one, but do not gateway in this case.
- Yes, it is hack.
+ * default one, but do not gateway in this case.
+ * Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4) {
- fib_info_put(res->fi);
+ if (res->fi && res->prefixlen < 4)
res->fi = NULL;
- }
}
rth = dst_alloc(&ipv4_dst_ops);
- if (!rth) {
- err = -ENOBUFS;
- goto cleanup;
- }
+ if (!rth)
+ return -ENOBUFS;
+
+ in_dev_hold(in_dev);
+ rth->idev = in_dev;
atomic_set(&rth->dst.__refcnt, 1);
rth->dst.flags= DST_HOST;
@@ -2425,7 +2422,6 @@ static int __mkroute_output(struct rtable **result,
cache entry */
rth->dst.dev = dev_out;
dev_hold(dev_out);
- rth->idev = in_dev_get(dev_out);
rth->rt_gateway = fl->fl4_dst;
rth->rt_spec_dst= fl->fl4_src;
@@ -2460,15 +2456,11 @@ static int __mkroute_output(struct rtable **result,
rt_set_nexthop(rth, res, 0);
rth->rt_flags = flags;
-
*result = rth;
- cleanup:
- /* release work reference to inet device */
- in_dev_put(in_dev);
-
- return err;
+ return 0;
}
+/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
struct fib_result *res,
const struct flowi *fl,
@@ -2490,6 +2482,7 @@ static int ip_mkroute_output(struct rtable **rp,
/*
* Major route resolver routine.
+ * called with rcu_read_lock();
*/
static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2508,9 +2501,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
.iif = net->loopback_dev->ifindex,
.oif = oldflp->oif };
struct fib_result res;
- unsigned flags = 0;
+ unsigned int flags = 0;
struct net_device *dev_out = NULL;
- int free_res = 0;
int err;
@@ -2536,9 +2528,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (oldflp->oif == 0 &&
(ipv4_is_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ ipv4_is_lbcast(oldflp->fl4_dst))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
+ dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
if (dev_out == NULL)
goto out;
@@ -2563,29 +2555,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
- if (dev_out == NULL)
+ if (!__ip_dev_find(net, oldflp->fl4_src, false))
goto out;
- dev_put(dev_out);
- dev_out = NULL;
}
}
if (oldflp->oif) {
- dev_out = dev_get_by_index(net, oldflp->oif);
+ dev_out = dev_get_by_index_rcu(net, oldflp->oif);
err = -ENODEV;
if (dev_out == NULL)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
- if (__in_dev_get_rtnl(dev_out) == NULL) {
- dev_put(dev_out);
+ if (rcu_dereference(dev_out->ip_ptr) == NULL)
goto out; /* Wrong error code */
- }
if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+ ipv4_is_lbcast(oldflp->fl4_dst)) {
if (!fl.fl4_src)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
@@ -2605,10 +2592,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
fl.fl4_dst = fl.fl4_src;
if (!fl.fl4_dst)
fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
- if (dev_out)
- dev_put(dev_out);
dev_out = net->loopback_dev;
- dev_hold(dev_out);
fl.oif = net->loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
@@ -2642,23 +2626,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
res.type = RTN_UNICAST;
goto make_route;
}
- if (dev_out)
- dev_put(dev_out);
err = -ENETUNREACH;
goto out;
}
- free_res = 1;
if (res.type == RTN_LOCAL) {
if (!fl.fl4_src)
fl.fl4_src = fl.fl4_dst;
- if (dev_out)
- dev_put(dev_out);
dev_out = net->loopback_dev;
- dev_hold(dev_out);
fl.oif = dev_out->ifindex;
- if (res.fi)
- fib_info_put(res.fi);
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2675,28 +2651,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);
- if (dev_out)
- dev_put(dev_out);
dev_out = FIB_RES_DEV(res);
- dev_hold(dev_out);
fl.oif = dev_out->ifindex;
make_route:
err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
-
- if (free_res)
- fib_res_put(&res);
- if (dev_out)
- dev_put(dev_out);
out: return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
const struct flowi *flp)
{
- unsigned hash;
+ unsigned int hash;
+ int res;
struct rtable *rth;
if (!rt_caching(net))
@@ -2727,7 +2696,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_unlock_bh();
slow_output:
- return ip_route_output_slow(net, rp, flp);
+ rcu_read_lock();
+ res = ip_route_output_slow(net, rp, flp);
+ rcu_read_unlock();
+ return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
@@ -2746,7 +2718,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
@@ -2791,7 +2762,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
dst_release(&(*rp)->dst);
*rp = rt;
- return (rt ? 0 : -ENOMEM);
+ return rt ? 0 : -ENOMEM;
}
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -3316,6 +3287,12 @@ int __init ip_rt_init(void)
ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
+ if (dst_entries_init(&ipv4_dst_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+ if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
rt_hash_table = (struct rt_hash_bucket *)
alloc_large_system_hash("IP route cache",
sizeof(struct rt_hash_bucket),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3e8a4dbc721..1664a0590bb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -386,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
*/
mask = 0;
- if (sk->sk_err)
- mask = POLLERR;
/*
* POLLHUP is certainly not done right. But poll() doesn't
@@ -457,6 +455,11 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
}
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ smp_rmb();
+ if (sk->sk_err)
+ mask |= POLLERR;
+
return mask;
}
EXPORT_SYMBOL(tcp_poll);
@@ -940,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sg = sk->sk_route_caps & NETIF_F_SG;
while (--iovlen >= 0) {
- int seglen = iov->iov_len;
+ size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1bc87a05c73..ee0df481749 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk);
@@ -2301,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+ return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
}
static inline int tcp_head_timedout(struct sock *sk)
@@ -2495,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -2503,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
int err;
unsigned int mss;
- if (packets == 0)
- return;
-
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
+ /* Head already handled? */
+ if (mark_head && skb != tcp_write_queue_head(sk))
+ return;
} else {
skb = tcp_write_queue_head(sk);
cnt = 0;
@@ -2532,7 +2532,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
- if (tcp_is_sack(tp) || (oldcnt >= packets))
+ if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
@@ -2543,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
}
tcp_skb_mark_lost(tp, skb);
+
+ if (mark_head)
+ break;
}
tcp_verify_left_out(tp);
}
@@ -2554,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1);
+ tcp_mark_head_lost(sk, 1, 1);
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
- tcp_mark_head_lost(sk, lost);
+ tcp_mark_head_lost(sk, lost, 0);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto < fast_rexmit)
- sacked_upto = fast_rexmit;
- tcp_mark_head_lost(sk, sacked_upto);
+ if (sacked_upto >= 0)
+ tcp_mark_head_lost(sk, sacked_upto, 0);
+ else if (fast_rexmit)
+ tcp_mark_head_lost(sk, 1, 1);
}
tcp_timeout_skbs(sk);
@@ -2873,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
@@ -2970,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+ tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
}
@@ -3398,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk)
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
- return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3416,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
- return (after(ack, tp->snd_una) ||
+ return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
@@ -4035,6 +4040,8 @@ static void tcp_reset(struct sock *sk)
default:
sk->sk_err = ECONNRESET;
}
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85c..43cf901d765 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return 1;
if (after(end_seq, s_win) && before(seq, e_win))
return 1;
- return (seq == e_win && seq == end_seq);
+ return seq == e_win && seq == end_seq;
}
/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ea09d2fd50c..05b1ecf3676 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1370,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned mss_now, int nonagle)
{
- return (skb->len < mss_now &&
+ return skb->len < mss_now &&
((nonagle & TCP_NAGLE_CORK) ||
- (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
/* Return non-zero if the Nagle test allows this packet to be
@@ -1443,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
- return (skb &&
+ return skb &&
tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
- tp->nonagle : TCP_NAGLE_PUSH)));
+ tp->nonagle : TCP_NAGLE_PUSH));
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index baea4a12902..74a6aa00365 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -135,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
/* This function calculates a "timeout" which is equivalent to the timeout of a
* TCP connection after "boundary" unsuccessful, exponentially backed-off
- * retransmissions with an initial RTO of TCP_RTO_MIN.
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
*/
static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
- unsigned int timeout)
+ unsigned int timeout,
+ bool syn_set)
{
unsigned int linear_backoff_thresh, start_ts;
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
if (!inet_csk(sk)->icsk_retransmits)
return false;
@@ -152,12 +155,12 @@ static bool retransmits_timed_out(struct sock *sk,
start_ts = tcp_sk(sk)->retrans_stamp;
if (likely(timeout == 0)) {
- linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
if (boundary <= linear_backoff_thresh)
- timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
+ timeout = ((2 << boundary) - 1) * rto_base;
else
- timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
}
return (tcp_time_stamp - start_ts) >= timeout;
@@ -168,14 +171,15 @@ static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
- bool do_reset;
+ bool do_reset, syn_set = 0;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ syn_set = 1;
} else {
- if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -188,7 +192,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
- !retransmits_timed_out(sk, retry_until, 0);
+ !retransmits_timed_out(sk, retry_until, 0, 0);
if (tcp_out_of_resources(sk, do_reset))
return 1;
@@ -196,8 +200,7 @@ static int tcp_write_timeout(struct sock *sk)
}
if (retransmits_timed_out(sk, retry_until,
- (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 :
- icsk->icsk_user_timeout)) {
+ syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -364,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
int mib_idx;
- if (icsk->icsk_ca_state == TCP_CA_Disorder) {
- if (tcp_is_sack(tp))
- mib_idx = LINUX_MIB_TCPSACKFAILURES;
- else
- mib_idx = LINUX_MIB_TCPRENOFAILURES;
- } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
else
mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+ } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+ tp->sacked_out) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
} else {
mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
@@ -439,7 +443,7 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
out:;
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a624..a534dda5456 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
*/
static inline u32 westwood_do_filter(u32 a, u32 b)
{
- return (((7 * a) + b) >> 3);
+ return ((7 * a) + b) >> 3;
}
static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 869078d4eeb..4464f3bff6a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
static int xfrm4_get_tos(struct flowi *fl)
{
- return fl->fl4_tos;
+ return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
}
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
xfrm4_policy_afinfo.garbage_collect(net);
- return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+ return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
.gc_thresh = 1024,
- .entries = ATOMIC_INIT(0),
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
* and start cleaning when were 1/2 full
*/
xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+ dst_entries_init(&xfrm4_dst_ops);
xfrm4_state_init();
xfrm4_policy_init();
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a0..47947624ecc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
}
static void
-__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+{
+ sel->daddr.a4 = fl->fl4_dst;
+ sel->saddr.a4 = fl->fl4_src;
+ sel->dport = xfrm_flowi_dport(fl);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET;
+ sel->prefixlen_d = 32;
+ sel->prefixlen_s = 32;
+ sel->proto = fl->proto;
+ sel->ifindex = fl->oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+ xfrm_address_t *daddr, xfrm_address_t *saddr)
{
- x->sel.daddr.a4 = fl->fl4_dst;
- x->sel.saddr.a4 = fl->fl4_src;
- x->sel.dport = xfrm_flowi_dport(fl);
- x->sel.dport_mask = htons(0xffff);
- x->sel.sport = xfrm_flowi_sport(fl);
- x->sel.sport_mask = htons(0xffff);
- x->sel.family = AF_INET;
- x->sel.prefixlen_d = 32;
- x->sel.prefixlen_s = 32;
- x->sel.proto = fl->proto;
- x->sel.ifindex = fl->oif;
x->id = tmpl->id;
if (x->id.daddr.a4 == 0)
x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.owner = THIS_MODULE,
.init_flags = xfrm4_init_flags,
.init_tempsel = __xfrm4_init_tempsel,
+ .init_temprop = xfrm4_init_temprop,
.output = xfrm4_output,
.extract_input = xfrm4_extract_input,
.extract_output = xfrm4_extract_output,