aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/core/dst.c4
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/decnet/dn_route.c4
-rw-r--r--net/ipv4/arp.c5
-rw-r--r--net/ipv4/fib_frontend.c5
-rw-r--r--net/ipv4/fib_semantics.c4
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/ip_fragment.c4
-rw-r--r--net/ipv4/ip_gre.c2
-rw-r--r--net/ipv4/ip_input.c4
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/ipmr.c9
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c5
-rw-r--r--net/ipv4/route.c1329
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv4/xfrm4_input.c4
-rw-r--r--net/ipv4/xfrm4_policy.c9
-rw-r--r--net/ipv6/route.c4
-rw-r--r--net/sctp/transport.c2
-rw-r--r--net/xfrm/xfrm_policy.c23
21 files changed, 262 insertions, 1174 deletions
diff --git a/net/core/dst.c b/net/core/dst.c
index 07bacff84aa..069d51d2941 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -94,7 +94,7 @@ loop:
* But we do not have state "obsoleted, but
* referenced by parent", so it is right.
*/
- if (dst->obsolete > 1)
+ if (dst->obsolete > 0)
continue;
___dst_free(dst);
@@ -202,7 +202,7 @@ static void ___dst_free(struct dst_entry *dst)
*/
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP))
dst->input = dst->output = dst_discard;
- dst->obsolete = 2;
+ dst->obsolete = DST_OBSOLETE_DEAD;
}
void __dst_free(struct dst_entry *dst)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ab4f44c9bb2..25428d0c50c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -508,7 +508,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
struct dst_entry *dst;
struct flowi4 fl4;
- dst = inet_csk_route_req(sk, &fl4, req, false);
+ dst = inet_csk_route_req(sk, &fl4, req);
if (dst == NULL)
goto out;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 47de90d8fe9..23cc11dd4e4 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1176,7 +1176,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops, dev_out, 1, 0, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
@@ -1444,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
- rt = dst_alloc(&dn_dst_ops, out_dev, 0, 0, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0c757..a0124eb7dbe 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
return 1;
}
- paddr = skb_rtable(skb)->rt_gateway;
-
+ paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
paddr, dev))
return 0;
@@ -828,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+ ip_route_input(skb, tip, sip, 0, dev) == 0) {
rt = skb_rtable(skb);
addr_type = rt->rt_type;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b83203658ee..f277cf0e632 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
rt_cache_flush(dev_net(dev), 0);
break;
case NETDEV_UNREGISTER_BATCH:
- /* The batch unregister is only called on the first
- * device in the list of devices being unregistered.
- * Therefore we should not pass dev_net(dev) in here.
- */
- rt_cache_flush_batch(NULL);
break;
}
return NOTIFY_DONE;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2b57d768240..e55171f184f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -171,6 +171,10 @@ static void free_fib_info_rcu(struct rcu_head *head)
dev_put(nexthop_nh->nh_dev);
if (nexthop_nh->nh_exceptions)
free_nh_exceptions(nexthop_nh);
+ if (nexthop_nh->nh_rth_output)
+ dst_release(&nexthop_nh->nh_rth_output->dst);
+ if (nexthop_nh->nh_rth_input)
+ dst_release(&nexthop_nh->nh_rth_input->dst);
} endfor_nexthops(fi);
release_net(fi->fib_net);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c7a4de05ca0..db0cf17c00f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,8 +368,7 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(struct sock *sk,
struct flowi4 *fl4,
- const struct request_sock *req,
- bool nocache)
+ const struct request_sock *req)
{
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
@@ -377,8 +376,6 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
struct net *net = sock_net(sk);
int flags = inet_sk_flowi_flags(sk);
- if (nocache)
- flags |= FLOWI_FLAG_RT_NOCACHE;
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol,
@@ -389,7 +386,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+ if (opt && opt->opt.is_strictroute && rt->rt_gateway)
goto route_err;
return &rt->dst;
@@ -422,7 +419,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+ if (opt && opt->opt.is_strictroute && rt->rt_gateway)
goto route_err;
return &rt->dst;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409..7ad88e5e711 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
/* skb dst is stale, drop it, and perform route lookup again */
skb_dst_drop(head);
iph = ip_hdr(head);
- err = ip_route_input_noref(head, iph->daddr, iph->saddr,
- iph->tos, head->dev);
+ err = ip_route_input(head, iph->daddr, iph->saddr,
+ iph->tos, head->dev);
if (err)
goto out_rcu_unlock;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 42c44b1403c..b062a98574f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -766,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (skb->protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
- dst = rt->rt_gateway;
+ dst = rt_nexthop(rt, old_iph->daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b27d4440f52..4ebc6feee25 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
* how the packet travels inside Linux networking.
*/
if (!skb_dst(skb)) {
- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
+ int err = ip_route_input(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 665abbb7122..ba39a52d18c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2c2c35bace7..99af1f0cc65 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -487,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
dev->stats.tx_fifo_errors++;
goto tx_error;
}
- dst = rt->rt_gateway;
+ dst = rt_nexthop(rt, old_iph->daddr);
}
rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5716c6b808d..8eec8f4a053 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1795,9 +1795,12 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowi4_tos = RT_TOS(iph->tos),
- .flowi4_oif = rt->rt_oif,
- .flowi4_iif = rt->rt_iif,
- .flowi4_mark = rt->rt_mark,
+ .flowi4_oif = (rt_is_output_route(rt) ?
+ skb->dev->ifindex : 0),
+ .flowi4_iif = (rt_is_output_route(rt) ?
+ net->loopback_dev->ifindex :
+ skb->dev->ifindex),
+ .flowi4_mark = skb->mark,
};
struct mr_table *mrt;
int err;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 2f210c79dc8..cbb6a1a6f6f 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
struct nf_nat_ipv4_range newrange;
const struct nf_nat_ipv4_multi_range_compat *mr;
const struct rtable *rt;
- __be32 newsrc;
+ __be32 newsrc, nh;
NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
mr = par->targinfo;
rt = skb_rtable(skb);
- newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+ newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
if (!newsrc) {
pr_info("%s ate my IP address\n", par->out->name);
return NF_DROP;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d547f6fae20..9add08869c7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
-static int rt_chain_length_max __read_mostly = 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
/*
* Interface to generic destination cache.
@@ -145,14 +141,12 @@ static unsigned long expires_ljiffies;
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
-static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int how)
@@ -172,12 +166,10 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
- .gc = rt_garbage_collect,
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
- .destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
@@ -209,184 +201,30 @@ const __u8 ip_tos2prio[16] = {
};
EXPORT_SYMBOL(ip_tos2prio);
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- * as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- * they do so with atomic increments and with the
- * lock held.
- */
-
-struct rt_hash_bucket {
- struct rtable __rcu *chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
- defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ 256
-#else
-# if NR_CPUS >= 32
-# define RT_HASH_LOCK_SZ 4096
-# elif NR_CPUS >= 16
-# define RT_HASH_LOCK_SZ 2048
-# elif NR_CPUS >= 8
-# define RT_HASH_LOCK_SZ 1024
-# elif NR_CPUS >= 4
-# define RT_HASH_LOCK_SZ 512
-# else
-# define RT_HASH_LOCK_SZ 256
-# endif
-#endif
-
-static spinlock_t *rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
- int i;
-
- rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
- GFP_KERNEL);
- if (!rt_hash_locks)
- panic("IP: failed to allocate rt_hash_locks\n");
-
- for (i = 0; i < RT_HASH_LOCK_SZ; i++)
- spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket *rt_hash_table __read_mostly;
-static unsigned int rt_hash_mask __read_mostly;
-static unsigned int rt_hash_log __read_mostly;
-
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
- int genid)
-{
- return jhash_3words((__force u32)daddr, (__force u32)saddr,
- idx, genid)
- & rt_hash_mask;
-}
-
static inline int rt_genid(struct net *net)
{
return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
- struct seq_net_private p;
- int bucket;
- int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
- struct rt_cache_iter_state *st = seq->private;
- struct rtable *r = NULL;
-
- for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
- if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
- continue;
- rcu_read_lock_bh();
- r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
- while (r) {
- if (dev_net(r->dst.dev) == seq_file_net(seq) &&
- r->rt_genid == st->genid)
- return r;
- r = rcu_dereference_bh(r->dst.rt_next);
- }
- rcu_read_unlock_bh();
- }
- return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
- struct rtable *r)
-{
- struct rt_cache_iter_state *st = seq->private;
-
- r = rcu_dereference_bh(r->dst.rt_next);
- while (!r) {
- rcu_read_unlock_bh();
- do {
- if (--st->bucket < 0)
- return NULL;
- } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
- rcu_read_lock_bh();
- r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
- struct rtable *r)
-{
- struct rt_cache_iter_state *st = seq->private;
- while ((r = __rt_cache_get_next(seq, r)) != NULL) {
- if (dev_net(r->dst.dev) != seq_file_net(seq))
- continue;
- if (r->rt_genid == st->genid)
- break;
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct rtable *r = rt_cache_get_first(seq);
-
- if (r)
- while (pos && (r = rt_cache_get_next(seq, r)))
- --pos;
- return pos ? NULL : r;
-}
-
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct rt_cache_iter_state *st = seq->private;
if (*pos)
- return rt_cache_get_idx(seq, *pos - 1);
- st->genid = rt_genid(seq_file_net(seq));
+ return NULL;
return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct rtable *r;
-
- if (v == SEQ_START_TOKEN)
- r = rt_cache_get_first(seq);
- else
- r = rt_cache_get_next(seq, v);
++*pos;
- return r;
+ return NULL;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
- if (v && v != SEQ_START_TOKEN)
- rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
"HHUptod\tSpecDst");
- else {
- struct rtable *r = v;
- int len;
-
- seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
- "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->dst.dev ? r->dst.dev->name : "*",
- (__force u32)r->rt_dst,
- (__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->dst.__refcnt),
- r->dst.__use, 0, (__force u32)r->rt_src,
- dst_metric_advmss(&r->dst) + 40,
- dst_metric(&r->dst, RTAX_WINDOW), 0,
- r->rt_key_tos,
- -1, 0, 0, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
- }
return 0;
}
@@ -426,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = {
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &rt_cache_seq_ops,
- sizeof(struct rt_cache_iter_state));
+ return seq_open(file, &rt_cache_seq_ops);
}
static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = {
.open = rt_cache_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_net,
+ .release = seq_release,
};
@@ -625,263 +444,12 @@ static inline int ip_rt_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
- ip_rt_put(rt);
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
- /* Kill broadcast/multicast entries very aggresively, if they
- collide in hash table with more useful entries */
- return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
- return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
- unsigned long age;
- int ret = 0;
-
- if (atomic_read(&rth->dst.__refcnt))
- goto out;
-
- age = jiffies - rth->dst.lastuse;
- if ((age <= tmo1 && !rt_fast_clean(rth)) ||
- (age <= tmo2 && rt_valuable(rth)))
- goto out;
- ret = 1;
-out: return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
- u32 score = jiffies - rt->dst.lastuse;
-
- score = ~score & ~(3<<30);
-
- if (rt_valuable(rt))
- score |= (1<<31);
-
- if (rt_is_output_route(rt) ||
- !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
- score |= (1<<30);
-
- return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
- return net->ipv4.current_rt_cache_rebuild_count <=
- net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
- const struct rtable *rt2)
-{
- return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
- ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
- (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
- return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
- ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
- (rt1->rt_mark ^ rt2->rt_mark) |
- (rt1->rt_key_tos ^ rt2->rt_key_tos) |
- (rt1->rt_route_iif ^ rt2->rt_route_iif) |
- (rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
- return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
static inline int rt_is_expired(struct rtable *rth)
{
return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
- unsigned int i;
- struct rtable *rth, *next;
-
- for (i = 0; i <= rt_hash_mask; i++) {
- struct rtable __rcu **pprev;
- struct rtable *list;
-
- if (process_context && need_resched())
- cond_resched();
- rth = rcu_access_pointer(rt_hash_table[i].chain);
- if (!rth)
- continue;
-
- spin_lock_bh(rt_hash_lock_addr(i));
-
- list = NULL;
- pprev = &rt_hash_table[i].chain;
- rth = rcu_dereference_protected(*pprev,
- lockdep_is_held(rt_hash_lock_addr(i)));
-
- while (rth) {
- next = rcu_dereference_protected(rth->dst.rt_next,
- lockdep_is_held(rt_hash_lock_addr(i)));
-
- if (!net ||
- net_eq(dev_net(rth->dst.dev), net)) {
- rcu_assign_pointer(*pprev, next);
- rcu_assign_pointer(rth->dst.rt_next, list);
- list = rth;
- } else {
- pprev = &rth->dst.rt_next;
- }
- rth = next;
- }
-
- spin_unlock_bh(rt_hash_lock_addr(i));
-
- for (; list; list = next) {
- next = rcu_dereference_protected(list->dst.rt_next, 1);
- rt_free(list);
- }
- }
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- * rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
- const struct rtable *aux = head;
-
- while (aux != rth) {
- if (compare_hash_inputs(aux, rth))
- return 0;
- aux = rcu_dereference_protected(aux->dst.rt_next, 1);
- }
- return ONE;
-}
-
-static void rt_check_expire(void)
-{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long samples = 0;
- unsigned long sum = 0, sum2 = 0;
- unsigned long delta;
- u64 mult;
-
- delta = jiffies - expires_ljiffies;
- expires_ljiffies = jiffies;
- mult = ((u64)delta) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask)
- goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
- unsigned long length;
-
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
-
- if (need_resched())
- cond_resched();
-
- samples++;
-
- if (rcu_dereference_raw(*rthp) == NULL)
- continue;
- length = 0;
- spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
- prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth) ||
- rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
-
- /* We only count entries on a chain with equal
- * hash inputs once so that entries for
- * different QOS levels, and other non-hash
- * input attributes don't unfairly skew the
- * length computation
- */
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- length += has_noalias(rt_hash_table[i].chain, rth);
- }
- spin_unlock_bh(rt_hash_lock_addr(i));
- sum += length;
- sum2 += length*length;
- }
- if (samples) {
- unsigned long avg = sum / samples;
- unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
- rt_chain_length_max = max_t(unsigned long,
- ip_rt_gc_elasticity,
- (avg + 4*sd) >> FRACT_BITS);
- }
- rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
- rt_check_expire();
- schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
* Perturbation of rt_genid by a small quantity [1..256]
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
* many times (2^24) without giving recent rt_genid.
@@ -902,167 +470,6 @@ static void rt_cache_invalidate(struct net *net)
void rt_cache_flush(struct net *net, int delay)
{
rt_cache_invalidate(net);
- if (delay >= 0)
- rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
- rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
- net_warn_ratelimited("Route hash chain too long!\n");
- rt_cache_invalidate(net);
-}
-
-/*
- Short description of GC goals.
-
- We want to build algorithm, which will keep routing cache
- at some equilibrium point, when number of aged off entries
- is kept approximately equal to newly generated ones.
-
- Current expiration strength is variable "expire".
- We try to adjust it dynamically, so that if networking
- is idle expires is large enough to keep enough of warm entries,
- and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
- static unsigned long expire = RT_GC_TIMEOUT;
- static unsigned long last_gc;
- static int rover;
- static int equilibrium;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long now = jiffies;
- int goal;
- int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
- /*
- * Garbage collection is pretty expensive,
- * do not make it too frequently.
- */
-
- RT_CACHE_STAT_INC(gc_total);
-
- if (now - last_gc < ip_rt_gc_min_interval &&
- entries < ip_rt_max_size) {
- RT_CACHE_STAT_INC(gc_ignored);
- goto out;
- }
-
- entries = dst_entries_get_slow(&ipv4_dst_ops);
- /* Calculate number of entries, which we want to expire now. */
- goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
- if (goal <= 0) {
- if (equilibrium < ipv4_dst_ops.gc_thresh)
- equilibrium = ipv4_dst_ops.gc_thresh;
- goal = entries - equilibrium;
- if (goal > 0) {
- equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- goal = entries - equilibrium;
- }
- } else {
- /* We are in dangerous area. Try to reduce cache really
- * aggressively.
- */
- goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- equilibrium = entries - goal;
- }
-
- if (now - last_gc >= ip_rt_gc_min_interval)
- last_gc = now;
-
- if (goal <= 0) {
- equilibrium += goal;
- goto work_done;
- }
-
- do {
- int i, k;
-
- for (i = rt_hash_mask, k = rover; i >= 0; i--) {
- unsigned long tmo = expire;
-
- k = (k + 1) & rt_hash_mask;
- rthp = &rt_hash_table[k].chain;
- spin_lock_bh(rt_hash_lock_addr(k));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
- if (!rt_is_expired(rth) &&
- !rt_may_expire(rth, tmo, expire)) {
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- continue;
- }
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- goal--;
- }
- spin_unlock_bh(rt_hash_lock_addr(k));
- if (goal <= 0)
- break;
- }
- rover = k;
-
- if (goal <= 0)
- goto work_done;
-
- /* Goal is not achieved. We stop process if:
-
- - if expire reduced to zero. Otherwise, expire is halfed.
- - if table is not full.
-