aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2012-07-10 22:53:57 -0700
committerDavid S. Miller <davem@davemloft.net>2012-07-10 22:53:57 -0700
commitfdd28d7328f4c3ddf77ba163e257d7dc48392767 (patch)
treef8d4221951e6da429f7d252bc167e8d64a83d1f1
parentad7eee98bef92481581060801bdfd1b25a6106c0 (diff)
parentf185071ddf799e194ba015d040d3d49cdbfa7e48 (diff)
Merge branch 'metrics_restructure'
This patch series works towards the goal of minimizing the amount of things that can change in an ipv4 route. In a regime where the routing cache is removed, route changes will lead to cloning in the FIB tables or similar. The largest trigger of route metrics writes, TCP, now has it's own cache of dynamic metric state. The timewait timestamps are stored there now as well. As a result of that, pre-cowing metrics is no longer necessary, and therefore FLOWI_FLAG_PRECOW_METRICS is removed. Redirect and PMTU handling is moved back into the ipv4 routes. I'm sorry for all the headaches trying to do this in the inetpeer has caused, it was the wrong approach for sure. Since metrics become read-only for ipv4 we no longer need the inetpeer hung off of the ipv4 routes either. So those disappear too. Also, timewait sockets no longer need to hold onto an inetpeer either. After this series, we still have some details to resolve wrt. PMTU and redirects for a route-cache-less system: 1) With just the plain route cache removal, PMTU will continue to work mostly fine. This is because of how the local route users call down into the PMTU update code with the route they already hold. However, if we wish to cache pre-computed routes in fib_info nexthops (which we want for performance), then we need to add route cloning for PMTU events. 2) Redirects require more work. First, redirects must be changed to be handled like PMTU. Wherein we call down into the sockets and other entities, and then they call back into the routing code with the route they were using. So we'll be adding an ->update_nexthop() method alongside ->update_pmtu(). And then, like for PMTU, we'll need cloning support once we start caching routes in the fib_info nexthops. But that's it, we can completely pull the trigger and remove the routing cache with minimal disruptions. As it is, this patch series alone helps a lot of things. For one, routing cache entry creation should be a lot faster, because we no longer do inetpeer lookups (even to check if an entry exists). This patch series also opens the door for non-DST_HOST ipv4 routes, because nothing fundamentally cares about rt->rt_dst any more. It can be removed with the base routing cache removal patch. In fact, that was the primary goal of this patch series. Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/rtnetlink.h3
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/dst.h6
-rw-r--r--include/net/flow.h5
-rw-r--r--include/net/inet_connection_sock.h1
-rw-r--r--include/net/inet_sock.h2
-rw-r--r--include/net/inetpeer.h8
-rw-r--r--include/net/netns/ipv4.h3
-rw-r--r--include/net/route.h61
-rw-r--r--include/net/tcp.h9
-rw-r--r--net/core/rtnetlink.c4
-rw-r--r--net/decnet/dn_route.c13
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/icmp.c3
-rw-r--r--net/ipv4/inet_connection_sock.c2
-rw-r--r--net/ipv4/inetpeer.c4
-rw-r--r--net/ipv4/route.c349
-rw-r--r--net/ipv4/tcp.c2
-rw-r--r--net/ipv4/tcp_input.c188
-rw-r--r--net/ipv4/tcp_ipv4.c46
-rw-r--r--net/ipv4/tcp_metrics.c697
-rw-r--r--net/ipv4/tcp_minisocks.c62
-rw-r--r--net/ipv4/xfrm4_policy.c8
-rw-r--r--net/ipv6/icmp.c4
-rw-r--r--net/ipv6/ip6_output.c10
-rw-r--r--net/ipv6/ndisc.c8
-rw-r--r--net/ipv6/route.c16
-rw-r--r--net/ipv6/tcp_ipv6.c49
29 files changed, 837 insertions, 731 deletions
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ea60b085410..db71c4ad862 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -619,8 +619,7 @@ extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
- u32 id, u32 ts, u32 tsage, long expires,
- u32 error);
+ u32 id, long expires, u32 error);
extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7d3bcedc062..2de9cf46f9f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -506,7 +506,6 @@ struct tcp_timewait_sock {
u32 tw_rcv_wnd;
u32 tw_ts_recent;
long tw_ts_recent_stamp;
- struct inet_peer *tw_peer;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
#endif
diff --git a/include/net/dst.h b/include/net/dst.h
index b2634e44661..51610468c63 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -209,12 +209,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
return msecs_to_jiffies(dst_metric(dst, metric));
}
-static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
- unsigned long rtt)
-{
- dst_metric_set(dst, metric, jiffies_to_msecs(rtt));
-}
-
static inline u32
dst_allfrag(const struct dst_entry *dst)
{
diff --git a/include/net/flow.h b/include/net/flow.h
index bd524f59856..ce9cb7656b4 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -20,9 +20,8 @@ struct flowi_common {
__u8 flowic_proto;
__u8 flowic_flags;
#define FLOWI_FLAG_ANYSRC 0x01
-#define FLOWI_FLAG_PRECOW_METRICS 0x02
-#define FLOWI_FLAG_CAN_SLEEP 0x04
-#define FLOWI_FLAG_RT_NOCACHE 0x08
+#define FLOWI_FLAG_CAN_SLEEP 0x02
+#define FLOWI_FLAG_RT_NOCACHE 0x04
__u32 flowic_secid;
};
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index af3c743a40e..291e7cee14e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops {
struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst);
- struct inet_peer *(*get_peer)(struct sock *sk);
u16 net_header_len;
u16 net_frag_header_len;
u16 sockaddr_len;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ae17e1352d7..924d7b98ab6 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -245,8 +245,6 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
flags |= FLOWI_FLAG_ANYSRC;
- if (sk->sk_protocol == IPPROTO_TCP)
- flags |= FLOWI_FLAG_PRECOW_METRICS;
return flags;
}
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index c27c8f10ebd..53f464d7cdd 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -36,25 +36,19 @@ struct inet_peer {
u32 metrics[RTAX_MAX];
u32 rate_tokens; /* rate limiting for ICMP */
unsigned long rate_last;
- unsigned long pmtu_expires;
- u32 pmtu_orig;
- u32 pmtu_learned;
- struct inetpeer_addr_base redirect_learned;
union {
struct list_head gc_list;
struct rcu_head gc_rcu;
};
/*
* Once inet_peer is queued for deletion (refcnt == -1), following fields
- * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp
+ * are not available: rid, ip_id_count
* We can share memory with rcu_head to help keep inet_peer small.
*/
union {
struct {
atomic_t rid; /* Frag reception counter */
atomic_t ip_id_count; /* IP ID for the next packet */
- __u32 tcp_ts;
- __u32 tcp_ts_stamp;
};
struct rcu_head rcu;
struct inet_peer *gc_next;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 599e48fa97c..2e089a99d60 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
#include <net/inet_frag.h>
+struct tcpm_hash_bucket;
struct ctl_table_header;
struct ipv4_devconf;
struct fib_rules_ops;
@@ -39,6 +40,8 @@ struct netns_ipv4 {
struct sock **icmp_sk;
struct sock *tcp_sock;
struct inet_peer_base *peers;
+ struct tcpm_hash_bucket *tcp_metrics_hash;
+ unsigned int tcp_metrics_hash_mask;
struct netns_frags frags;
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
diff --git a/include/net/route.h b/include/net/route.h
index 211e2665139..52362368af0 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -40,7 +40,6 @@
#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
struct fib_nh;
-struct inet_peer;
struct fib_info;
struct rtable {
struct dst_entry dst;
@@ -65,45 +64,10 @@ struct rtable {
__be32 rt_gateway;
/* Miscellaneous cached information */
- u32 rt_peer_genid;
- unsigned long _peer; /* long-living peer info */
+ u32 rt_pmtu;
struct fib_info *fi; /* for client ref to shared metrics */
};
-static inline struct inet_peer *rt_peer_ptr(struct rtable *rt)
-{
- return inetpeer_ptr(rt->_peer);
-}
-
-static inline bool rt_has_peer(struct rtable *rt)
-{
- return inetpeer_ptr_is_peer(rt->_peer);
-}
-
-static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
- __inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
- return inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base)
-{
- inetpeer_init_ptr(&rt->_peer, base);
-}
-
-static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort)
-{
- rt->_peer = ort->_peer;
- if (rt_has_peer(ort)) {
- struct inet_peer *peer = rt_peer_ptr(ort);
- atomic_inc(&peer->refcnt);
- }
-}
-
static inline bool rt_is_input_route(const struct rtable *rt)
{
return rt->rt_route_iif != 0;
@@ -278,8 +242,6 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
if (inet_sk(sk)->transparent)
flow_flags |= FLOWI_FLAG_ANYSRC;
- if (protocol == IPPROTO_TCP)
- flow_flags |= FLOWI_FLAG_PRECOW_METRICS;
if (can_sleep)
flow_flags |= FLOWI_FLAG_CAN_SLEEP;
@@ -328,27 +290,6 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
return rt;
}
-extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);
-
-static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create)
-{
- if (rt_has_peer(rt))
- return rt_peer_ptr(rt);
-
- rt_bind_peer(rt, daddr, create);
- return (rt_has_peer(rt) ? rt_peer_ptr(rt) : NULL);
-}
-
-static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
-{
- return __rt_get_peer(rt, daddr, 0);
-}
-
-static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr)
-{
- return __rt_get_peer(rt, daddr, 1);
-}
-
static inline int inet_iif(const struct sk_buff *skb)
{
return skb_rtable(skb)->rt_iif;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 53fb7d81417..3618fefae04 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,6 +388,13 @@ extern void tcp_enter_frto(struct sock *sk);
extern void tcp_enter_loss(struct sock *sk, int how);
extern void tcp_clear_retrans(struct tcp_sock *tp);
extern void tcp_update_metrics(struct sock *sk);
+extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_metrics_init(void);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
+extern bool tcp_remember_stamp(struct sock *sk);
+extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
+extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
extern void tcp_init_sock(struct sock *sk);
extern unsigned int tcp_poll(struct file * file, struct socket *sock,
@@ -556,6 +563,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
return (tp->srtt >> 3) + tp->rttvar;
}
+extern void tcp_set_rto(struct sock *sk);
+
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2b325c340b4..64127eee786 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -615,7 +615,7 @@ nla_put_failure:
EXPORT_SYMBOL(rtnetlink_put_metrics);
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
- u32 ts, u32 tsage, long expires, u32 error)
+ long expires, u32 error)
{
struct rta_cacheinfo ci = {
.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
@@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
.rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
- .rta_ts = ts,
- .rta_tsage = tsage,
};
if (expires)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6e74b3f110b..b5594cc73ee 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1590,7 +1590,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
goto errout;
expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
- if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires,
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires,
rt->dst.error) < 0)
goto errout;
@@ -1812,12 +1812,11 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
- rt->dst.dev ? rt->dst.dev->name : "*",
- dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
- dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
- atomic_read(&rt->dst.__refcnt),
- rt->dst.__use,
- (int) dst_metric(&rt->dst, RTAX_RTT));
+ rt->dst.dev ? rt->dst.dev->name : "*",
+ dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
+ dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
+ atomic_read(&rt->dst.__refcnt),
+ rt->dst.__use, 0);
return 0;
}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6..5a23e8b3710 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ae301c897a1..d71bfbdc0bf 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -794,6 +794,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
val = nla_get_u32(nla);
if (type == RTAX_ADVMSS && val > 65535 - 40)
val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
fi->fib_metrics[type - 1] = val;
}
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4bce5a2830a..4a049449305 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -254,9 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
/* Limit if icmp type is enabled in ratemask. */
if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
- struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr);
+ struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
+ inet_putpeer(peer);
}
out:
return rc;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 034ddbe42ad..76825be3b64 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -375,7 +375,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options_rcu *opt = inet_rsk(req)->opt;
struct net *net = sock_net(sk);
- int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS;
+ int flags = inet_sk_flowi_flags(sk);
if (nocache)
flags |= FLOWI_FLAG_RT_NOCACHE;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da90a8cab61..e1e0a4e8fd3 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -508,13 +508,9 @@ relookup:
(daddr->family == AF_INET) ?
secure_ip_id(daddr->addr.a4) :
secure_ipv6_id(daddr->addr.a6));
- p->tcp_ts_stamp = 0;
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
p->rate_last = 0;
- p->pmtu_expires = 0;
- p->pmtu_orig = 0;
- memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
INIT_LIST_HEAD(&p->gc_list);
/* Link the node. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 72e88c20802..95bfa1ba5b2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -158,34 +158,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
- struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer;
- u32 *p = NULL;
-
- peer = rt_get_peer_create(rt, rt->rt_dst);
- if (peer) {
- u32 *old_p = __DST_METRICS_PTR(old);
- unsigned long prev, new;
-
- p = peer->metrics;
- if (inet_metrics_new(peer))
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
- new = (unsigned long) p;
- prev = cmpxchg(&dst->_metrics, old, new);
-
- if (prev != old) {
- p = __DST_METRICS_PTR(prev);
- if (prev & DST_METRICS_READ_ONLY)
- p = NULL;
- } else {
- if (rt->fi) {
- fib_info_put(rt->fi);
- rt->fi = NULL;
- }
- }
- }
- return p;
+ WARN_ON(1);
+ return NULL;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -423,18 +397,16 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
int len;
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
- "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->dst.dev ? r->dst.dev->name : "*",
- (__force u32)r->rt_dst,
- (__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->dst.__refcnt),
- r->dst.__use, 0, (__force u32)r->rt_src,
- dst_metric_advmss(&r->dst) + 40,
- dst_metric(&r->dst, RTAX_WINDOW),
- (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
- dst_metric(&r->dst, RTAX_RTTVAR)),
- r->rt_key_tos,
- -1, 0, 0, &len);
+ "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+ r->dst.dev ? r->dst.dev->name : "*",
+ (__force u32)r->rt_dst,
+ (__force u32)r->rt_gateway,
+ r->rt_flags, atomic_read(&r->dst.__refcnt),
+ r->dst.__use, 0, (__force u32)r->rt_src,
+ dst_metric_advmss(&r->dst) + 40,
+ dst_metric(&r->dst, RTAX_WINDOW), 0,
+ r->rt_key_tos,
+ -1, 0, 0, &len);
seq_printf(seq, "%*s\n", 127 - len, "");
}
@@ -671,7 +643,7 @@ static inline int rt_fast_clean(struct rtable *rth)
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
+ rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -917,7 +889,6 @@ static void rt_cache_invalidate(struct net *net)
get_random_bytes(&shuffle, sizeof(shuffle));
atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
- inetpeer_invalidate_family(AF_INET);
}
/*
@@ -1244,31 +1215,6 @@ skip_hashing:
return rt;
}
-static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt_peer_genid(void)
-{
- return atomic_read(&__rt_peer_genid);
-}
-
-void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
-{
- struct inet_peer_base *base;
- struct inet_peer *peer;
-
- base = inetpeer_base_ptr(rt->_peer);
- if (!base)
- return;
-
- peer = inet_getpeer_v4(base, daddr, create);
- if (peer) {
- if (!rt_set_peer(rt, peer))
- inet_putpeer(peer);
- else
- rt->rt_peer_genid = rt_peer_genid();
- }
-}
-
/*
* Peer allocation may fail only in serious out-of-memory conditions. However
* we still can generate some output.
@@ -1291,20 +1237,15 @@ static void ip_select_fb_ident(struct iphdr *iph)
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt && !(rt->dst.flags & DST_NOPEER)) {
- struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
+ struct net *net = dev_net(dst->dev);
+ struct inet_peer *peer;
- /* If peer is attached to destination, it is never detached,
- so that we need not to grab a lock to dereference it.
- */
- if (peer) {
- iph->id = htons(inet_getid(peer, more));
- return;
- }
- } else if (!rt)
- pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
+ peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
+ if (peer) {
+ iph->id = htons(inet_getid(peer, more));
+ inet_putpeer(peer);
+ return;
+ }
ip_select_fb_ident(iph);
}
@@ -1330,30 +1271,6 @@ static void rt_del(unsigned int hash, struct rtable *rt)
spin_unlock_bh(rt_hash_lock_addr(hash));
}
-static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
-{
- struct rtable *rt = (struct rtable *) dst;
- __be32 orig_gw = rt->rt_gateway;
- struct neighbour *n;
-
- dst_confirm(&rt->dst);
-
- rt->rt_gateway = peer->redirect_learned.a4;
-
- n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
- if (!n) {
- rt->rt_gateway = orig_gw;
- return;
- }
- if (!(n->nud_state & NUD_VALID)) {
- neigh_event_send(n, NULL);
- } else {
- rt->rt_flags |= RTCF_REDIRECTED;
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
- }
- neigh_release(n);
-}
-
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
@@ -1362,7 +1279,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
struct in_device *in_dev = __in_dev_get_rcu(dev);
__be32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
- struct inet_peer *peer;
struct net *net;
if (!in_dev)
@@ -1395,6 +1311,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rthp = &rt_hash_table[hash].chain;
while ((rt = rcu_dereference(*rthp)) != NULL) {
+ struct neighbour *n;
+
rthp = &rt->dst.rt_next;
if (rt->rt_key_dst != daddr ||
@@ -1408,13 +1326,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rt->rt_gateway != old_gw)
continue;
- peer = rt_get_peer_create(rt, rt->rt_dst);
- if (peer) {
- if (peer->redirect_learned.a4 != new_gw) {
- peer->redirect_learned.a4 = new_gw;
- atomic_inc(&__rt_peer_genid);
+ n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ if (n) {
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ rt->rt_gateway = new_gw;
+ rt->rt_flags |= RTCF_REDIRECTED;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
- check_peer_redir(&rt->dst, peer);
+ neigh_release(n);
}
}
}
@@ -1432,23 +1353,6 @@ reject_redirect:
;
}
-static bool peer_pmtu_expired(struct inet_peer *peer)
-{
- unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
- return orig &&
- time_after_eq(jiffies, orig) &&
- cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
-static bool peer_pmtu_cleaned(struct inet_peer *peer)
-{
- unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
- return orig &&
- cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
struct rtable *rt = (struct rtable *)dst;
@@ -1458,16 +1362,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
- } else if (rt->rt_flags & RTCF_REDIRECTED) {
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->dst.expires) {
unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
rt->rt_oif,
rt_genid(dev_net(dst->dev)));
rt_del(hash, rt);
ret = NULL;
- } else if (rt_has_peer(rt)) {
- struct inet_peer *peer = rt_peer_ptr(rt);
- if (peer_pmtu_expired(peer))
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
}
return ret;
@@ -1494,6 +1395,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
struct inet_peer *peer;
+ struct net *net;
int log_martians;
rcu_read_lock();
@@ -1505,7 +1407,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
- peer = rt_get_peer_create(rt, rt->rt_dst);
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
if (!peer) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
return;
@@ -1522,7 +1425,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
*/
if (peer->rate_tokens >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
- return;
+ goto out_put_peer;
}
/* Check for load limit; set rate_last to the latest sent
@@ -1543,6 +1446,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
&rt->rt_dst, &rt->rt_gateway);
#endif
}
+out_put_peer:
+ inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
@@ -1585,7 +1490,7 @@ static int ip_error(struct sk_buff *skb)
break;
}
- peer = rt_get_peer_create(rt, rt->rt_dst);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
send = true;
if (peer) {
@@ -1598,6 +1503,7 @@ static int ip_error(struct sk_buff *skb)
peer->rate_tokens -= ip_rt_error_cost;
else
send = false;
+ inet_putpeer(peer);
}
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1606,50 +1512,17 @@ out: kfree_skb(skb);
return 0;
}
-static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
-{
- unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
-
- if (!expires)
- return;
- if (time_before(jiffies, expires)) {
- u32 orig_dst_mtu = dst_mtu(dst);
- if (peer->pmtu_learned < orig_dst_mtu) {
- if (!peer->pmtu_orig)
- peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
- }
- } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
-}
-
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer;
dst_confirm(dst);
- peer = rt_get_peer_create(rt, rt->rt_dst);
- if (peer) {
- unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
-
- if (mtu < ip_rt_min_pmtu)
- mtu = ip_rt_min_pmtu;
- if (!pmtu_expires || mtu < peer->pmtu_learned) {
-
- pmtu_expires = jiffies + ip_rt_mtu_expires;
- if (!pmtu_expires)
- pmtu_expires = 1UL;
-
- peer->pmtu_learned = mtu;
- peer->pmtu_expires = pmtu_expires;
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
- atomic_inc(&__rt_peer_genid);
- rt->rt_peer_genid = rt_peer_genid();
- }
- check_peer_pmtu(dst, peer);
- }
+ rt->rt_pmtu = mtu;
+ dst_set_expires(&rt->dst, ip_rt_mtu_expires);
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1660,7 +1533,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
struct rtable *rt;
flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
- protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
+ protocol, flow_flags,
iph->daddr, iph->saddr, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1681,30 +1554,12 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
-static void ipv4_validate_peer(struct rtable *rt)
-{
- if (rt->rt_peer_genid != rt_peer_genid()) {
- struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
-
- if (peer) {
- check_peer_pmtu(&rt->dst, peer);
-
- if (peer->redirect_learned.a4 &&
- peer->redirect_learned.a4 != rt->rt_gateway)
- check_peer_redir(&rt->dst, peer);