diff options
Diffstat (limited to 'net/core/neighbour.c')
| -rw-r--r-- | net/core/neighbour.c | 2589 |
1 files changed, 1529 insertions, 1060 deletions
diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e68700f950a..ef31fef25e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -15,54 +15,50 @@ * Harald Welte Add neighbour cache statistics like rtstat */ -#include <linux/config.h> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> -#include <linux/sched.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/times.h> +#include <net/net_namespace.h> #include <net/neighbour.h> #include <net/dst.h> #include <net/sock.h> +#include <net/netevent.h> +#include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/string.h> +#include <linux/log2.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#define DEBUG #define NEIGH_DEBUG 1 - -#define NEIGH_PRINTK(x...) printk(x) -#define NEIGH_NOPRINTK(x...) do { ; } while(0) -#define NEIGH_PRINTK0 NEIGH_PRINTK -#define NEIGH_PRINTK1 NEIGH_NOPRINTK -#define NEIGH_PRINTK2 NEIGH_NOPRINTK - -#if NEIGH_DEBUG >= 1 -#undef NEIGH_PRINTK1 -#define NEIGH_PRINTK1 NEIGH_PRINTK -#endif -#if NEIGH_DEBUG >= 2 -#undef NEIGH_PRINTK2 -#define NEIGH_PRINTK2 NEIGH_PRINTK -#endif +#define neigh_dbg(level, fmt, ...) \ +do { \ + if (level <= NEIGH_DEBUG) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(unsigned long arg); -#ifdef CONFIG_ARPD -static void neigh_app_notify(struct neighbour *n); -#endif +static void __neigh_notify(struct neighbour *n, int type, int flags); +static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS -static struct file_operations neigh_stat_seq_fops; +static const struct file_operations neigh_stat_seq_fops; #endif /* @@ -98,12 +94,21 @@ static struct file_operations neigh_stat_seq_fops; static DEFINE_RWLOCK(neigh_tbl_lock); -static int neigh_blackhole(struct sk_buff *skb) +static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } +static void neigh_cleanup_and_release(struct neighbour *neigh) +{ + if (neigh->parms->neigh_cleanup) + neigh->parms->neigh_cleanup(neigh); + + __neigh_notify(neigh, RTM_DELNEIGH, 0); + neigh_release(neigh); +} + /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, @@ -112,23 +117,29 @@ static int neigh_blackhole(struct sk_buff *skb) unsigned long neigh_rand_reach_time(unsigned long base) { - return (base ? (net_random() % base) + (base >> 1) : 0); + return base ? (prandom_u32() % base) + (base >> 1) : 0; } +EXPORT_SYMBOL(neigh_rand_reach_time); static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct neighbour *n; + struct neighbour __rcu **np; - np = &tbl->hash_buckets[i]; - while ((n = *np) != NULL) { + np = &nht->hash_buckets[i]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. * - it is not permanent @@ -136,11 +147,13 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if (atomic_read(&n->refcnt) == 1 && !(n->nud_state & NUD_PERMANENT)) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; shrunk = 1; write_unlock(&n->lock); - neigh_release(n); + neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); @@ -155,6 +168,16 @@ static int neigh_forced_gc(struct neigh_table *tbl) return shrunk; } +static void neigh_add_timer(struct neighbour *n, unsigned long when) +{ + neigh_hold(n); + if (unlikely(mod_timer(&n->timer, when))) { + printk("NEIGH: BUG, double timer add, state is %x\n", + n->nud_state); + dump_stack(); + } +} + static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && @@ -178,16 +201,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list) static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) { int i; + struct neigh_hash_table *nht; - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np = &tbl->hash_buckets[i]; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - while ((n = *np) != NULL) { + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { if (dev && n->dev != dev) { np = &n->next; continue; } - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); n->dead = 1; @@ -202,16 +233,17 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) we must kill timers etc. and move it to safe state. */ - skb_queue_purge(&n->arp_queue); + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; - NEIGH_PRINTK2("neigh %p is stray.\n", n); + neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); - neigh_release(n); + neigh_cleanup_and_release(n); } } } @@ -222,6 +254,7 @@ void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) neigh_flush_dev(tbl, dev); write_unlock_bh(&tbl->lock); } +EXPORT_SYMBOL(neigh_changeaddr); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { @@ -234,8 +267,9 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) pneigh_queue_purge(&tbl->proxy_queue); return 0; } +EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -250,21 +284,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) goto out_entries; } - n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC); + n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; - memset(n, 0, tbl->entry_size); - - skb_queue_head_init(&n->arp_queue); + __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; + seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); - init_timer(&n->timer); - n->timer.function = neigh_timer_handler; - n->timer.data = (unsigned long)n; + setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; @@ -278,67 +310,93 @@ out_entries: goto out; } -static struct neighbour **neigh_hash_alloc(unsigned int entries) +static void neigh_get_hash_rnd(u32 *x) { - unsigned long size = entries * sizeof(struct neighbour *); - struct neighbour **ret; + get_random_bytes(x, sizeof(*x)); + *x |= 1; +} - if (size <= PAGE_SIZE) { - ret = kmalloc(size, GFP_ATOMIC); - } else { - ret = (struct neighbour **) - __get_free_pages(GFP_ATOMIC, get_order(size)); - } - if (ret) - memset(ret, 0, size); +static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) +{ + size_t size = (1 << shift) * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour __rcu **buckets; + int i; + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour __rcu **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; + } + ret->hash_buckets = buckets; + ret->hash_shift = shift; + for (i = 0; i < NEIGH_NUM_HASH_RND; i++) + neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } -static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +static void neigh_hash_free_rcu(struct rcu_head *head) { - unsigned long size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); + struct neighbour __rcu **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) - kfree(hash); + kfree(buckets); else - free_pages((unsigned long)hash, get_order(size)); + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); } -static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_shift) { - struct neighbour **new_hash, **old_hash; - unsigned int i, new_hash_mask, old_entries; + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); - BUG_ON(new_entries & (new_entries - 1)); - new_hash = neigh_hash_alloc(new_entries); - if (!new_hash) - return; - - old_entries = tbl->hash_mask + 1; - new_hash_mask = new_entries - 1; - old_hash = tbl->hash_buckets; + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_shift); + if (!new_nht) + return old_nht; - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - for (i = 0; i < old_entries; i++) { + for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct neighbour *n, *next; - for (n = old_hash[i]; n; n = next) { - unsigned int hash_val = tbl->hash(n->primary_key, n->dev); - - hash_val &= new_hash_mask; - next = n->next; - - n->next = new_hash[hash_val]; - new_hash[hash_val] = n; + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); + + hash >>= (32 - new_nht->hash_shift); + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } - tbl->hash_buckets = new_hash; - tbl->hash_mask = new_hash_mask; - neigh_hash_free(old_hash, old_entries); + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, @@ -346,49 +404,70 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, { struct neighbour *n; int key_len = tbl->key_len; - u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; - + u32 hash_val; + struct neigh_hash_table *nht; + NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + + rcu_read_unlock_bh(); return n; } +EXPORT_SYMBOL(neigh_lookup); -struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, + const void *pkey) { struct neighbour *n; int key_len = tbl->key_len; - u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; + u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { - if (!memcmp(n->primary_key, pkey, key_len)) { - neigh_hold(n); + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift); + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!memcmp(n->primary_key, pkey, key_len) && + net_eq(dev_net(n->dev), net)) { + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + + rcu_read_unlock_bh(); return n; } +EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev) +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) { u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); + struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); @@ -405,6 +484,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_neigh_release; } + if (dev->netdev_ops->ndo_neigh_construct) { + error = dev->netdev_ops->ndo_neigh_construct(n); + if (error < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + } + /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { @@ -412,34 +499,44 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_neigh_release; } - n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) - neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) + nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } - for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { - neigh_hold(n1); + if (want_ref) + neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } - n->next = tbl->hash_buckets[hash_val]; - tbl->hash_buckets[hash_val] = n; n->dead = 0; - neigh_hold(n); + if (want_ref) + neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); - NEIGH_PRINTK2("neigh %p is created.\n", n); + neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; @@ -449,37 +546,68 @@ out_neigh_release: neigh_release(n); goto out; } +EXPORT_SYMBOL(__neigh_create); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, int creat) +static u32 pneigh_hash(const void *pkey, int key_len) { - struct pneigh_entry *n; - int key_len = tbl->key_len; u32 hash_val = *(u32 *)(pkey + key_len - 4); - hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; + return hash_val; +} - read_lock_bh(&tbl->lock); - - for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { +static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, + struct net *net, + const void *pkey, + int key_len, + struct net_device *dev) +{ + while (n) { if (!memcmp(n->key, pkey, key_len) && - (n->dev == dev || !n->dev)) { - read_unlock_bh(&tbl->lock); - goto out; - } + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; + n = n->next; } + return NULL; +} + +struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, struct net_device *dev) +{ + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); +} +EXPORT_SYMBOL_GPL(__pneigh_lookup); + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev, int creat) +{ + struct pneigh_entry *n; + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); - n = NULL; - if (!creat) + + if (n || !creat) goto out; + ASSERT_RTNL(); + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; + write_pnet(&n->net, hold_net(net)); memcpy(n->key, pkey, key_len); n->dev = dev; if (dev) @@ -488,6 +616,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, if (tbl->pconstructor && tbl->pconstructor(n)) { if (dev) dev_put(dev); + release_net(net); kfree(n); n = NULL; goto out; @@ -500,30 +629,28 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, out: return n; } +EXPORT_SYMBOL(pneigh_lookup); -int pneigh_delete(struct neigh_table *tbl, const void *pkey, +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; int key_len = tbl->key_len; - u32 hash_val = *(u32 *)(pkey + key_len - 4); - - hash_val ^= (hash_val >> 16); - hash_val ^= hash_val >> 8; - hash_val ^= hash_val >> 4; - hash_val &= PNEIGH_HASHMASK; + u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; np = &n->next) { - if (!memcmp(n->key, pkey, key_len) && n->dev == dev) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev && + net_eq(pneigh_net(n), net)) { *np = n->next; write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); + release_net(pneigh_net(n)); kfree(n); return 0; } @@ -546,6 +673,7 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); + release_net(pneigh_net(n)); kfree(n); continue; } @@ -555,6 +683,13 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) return -ENOENT; } +static void neigh_parms_destroy(struct neigh_parms *parms); + +static inline void neigh_parms_put(struct neigh_parms *parms) +{ + if (atomic_dec_and_test(&parms->refcnt)) + neigh_parms_destroy(parms); +} /* * neighbour must already be out of the table; @@ -562,43 +697,36 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) */ void neigh_destroy(struct neighbour *neigh) { - struct hh_cache *hh; + struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { - printk(KERN_WARNING - "Destroying alive neighbour %p\n", neigh); + pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) - printk(KERN_WARNING "Impossible event.\n"); + pr_warn("Impossible event\n"); - while ((hh = neigh->hh) != NULL) { - neigh->hh = hh->hh_next; - hh->hh_next = NULL; - write_lock_bh(&hh->hh_lock); - hh->hh_output = neigh_blackhole; - write_unlock_bh(&hh->hh_lock); - if (atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); - } - - if (neigh->ops && neigh->ops->destructor) - (neigh->ops->destructor)(neigh); + write_lock_bh(&neigh->lock); + __skb_queue_purge(&neigh->arp_queue); + write_unlock_bh(&neigh->lock); + neigh->arp_queue_len_bytes = 0; - skb_queue_purge(&neigh->arp_queue); + if (dev->netdev_ops->ndo_neigh_destroy) + dev->netdev_ops->ndo_neigh_destroy(neigh); - dev_put(neigh->dev); + dev_put(dev); neigh_parms_put(neigh->parms); - NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); + kfree_rcu(neigh, rcu); } +EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. @@ -607,14 +735,9 @@ void neigh_destroy(struct neighbour *neigh) */ static void neigh_suspect(struct neighbour *neigh) { - struct hh_cache *hh; - - NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->output = neigh->ops->output; - - for (hh = neigh->hh; hh; hh = hh->hh_next) - hh->hh_output = neigh->ops->output; } /* Neighbour state is OK; @@ -624,99 +747,137 @@ static void neigh_suspect(struct neighbour *neigh) */ static void neigh_connect(struct neighbour *neigh) { - struct hh_cache *hh; - - NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + neigh_dbg(2, "neigh %p is connected\n", neigh); neigh->output = neigh->ops->connected_output; - - for (hh = neigh->hh; hh; hh = hh->hh_next) - hh->hh_output = neigh->ops->hh_output; } -static void neigh_periodic_timer(unsigned long arg) +static void neigh_periodic_work(struct work_struct *work) { - struct neigh_table *tbl = (struct neigh_table *)arg; - struct neighbour *n, **np; - unsigned long expire, now = jiffies; + struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neighbour *n; + struct neighbour __rcu **np; + unsigned int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock(&tbl->lock); + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ - if (time_after(now, tbl->last_rand + 300 * HZ)) { + if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; - tbl->last_rand = now; + tbl->last_rand = jiffies; for (p = &tbl->parms; p; p = p->next) p->reachable_time = - neigh_rand_reach_time(p->base_reachable_time); + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } - np = &tbl->hash_buckets[tbl->hash_chain_gc]; - tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask); + if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + goto out; + + for (i = 0 ; i < (1 << nht->hash_shift); i++) { + np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { - unsigned int state; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + unsigned int state; - write_lock(&n->lock); + write_lock(&n->lock); - state = n->nud_state; - if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { - write_unlock(&n->lock); - goto next_elt; - } + state = n->nud_state; + if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + write_unlock(&n->lock); + goto next_elt; + } - if (time_before(n->used, n->confirmed)) - n->used = n->confirmed; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; - if (atomic_read(&n->refcnt) == 1 && - (state == NUD_FAILED || - time_after(now, n->used + n->parms->gc_staletime))) { - *np = n->next; - n->dead = 1; + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } write_unlock(&n->lock); - neigh_release(n); - continue; - } - write_unlock(&n->lock); next_elt: - np = &n->next; + np = &n->next; + } + /* + * It's fine to release lock here, even if hash table + * grows while we are preempted. + */ + write_unlock_bh(&tbl->lock); + cond_resched(); + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); } - - /* Cycle through all hash buckets every base_reachable_time/2 ticks. - * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 - * base_reachable_time. +out: + /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. + * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 + * BASE_REACHABLE_TIME. */ - expire = tbl->parms.base_reachable_time >> 1; - expire /= (tbl->hash_mask + 1); - if (!expire) - expire = 1; - - mod_timer(&tbl->gc_timer, now + expire); - - write_unlock(&tbl->lock); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); + write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE ? - p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes); + int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); + if (!(n->nud_state & NUD_PROBE)) + max_probes += NEIGH_VAR(p, MCAST_PROBES); + return max_probes; } -static inline void neigh_add_timer(struct neighbour *n, unsigned long when) +static void neigh_invalidate(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) { - if (unlikely(mod_timer(&n->timer, when))) { - printk("NEIGH: BUG, double timer add, state is %x\n", - n->nud_state); - dump_stack(); + struct sk_buff *skb; + + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + neigh_dbg(2, "neigh %p is failed\n", neigh); + neigh->updated = jiffies; + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); } + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + +static void neigh_probe(struct neighbour *neigh) + __releases(neigh->lock) +{ + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb = skb_copy(skb, GFP_ATOMIC); + write_unlock(&neigh->lock); + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + kfree_skb(skb); } /* Called when a timer expires for a neighbour entry. */ @@ -725,7 +886,7 @@ static void neigh_timer_handler(unsigned long arg) { unsigned long now, next; struct neighbour *neigh = (struct neighbour *)arg; - unsigned state; + unsigned int state; int notify = 0; write_lock(&neigh->lock); @@ -734,68 +895,57 @@ static void neigh_timer_handler(unsigned long arg) now = jiffies; next = now + HZ; - if (!(state & NUD_IN_TIMER)) { -#ifndef CONFIG_SMP - printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); -#endif + if (!(state & NUD_IN_TIMER)) goto out; - } if (state & NUD_REACHABLE) { - if (time_before_eq(now, + if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { - NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, - neigh->used + neigh->parms->delay_probe_time)) { - NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->used + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { + neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; + neigh->updated = jiffies; neigh_suspect(neigh); - next = now + neigh->parms->delay_probe_time; + next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { - NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->nud_state = NUD_STALE; + neigh->updated = jiffies; neigh_suspect(neigh); + notify = 1; } } else if (state & NUD_DELAY) { - if (time_before_eq(now, - neigh->confirmed + neigh->parms->delay_probe_time)) { - NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + if (time_before_eq(now, + neigh->confirmed + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { + neigh_dbg(2, "neigh %p is now reachable\n", neigh); neigh->nud_state = NUD_REACHABLE; + neigh->updated = jiffies; neigh_connect(neigh); + notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { - NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh_dbg(2, "neigh %p is probed\n", neigh); neigh->nud_state = NUD_PROBE; + neigh->updated = jiffies; atomic_set(&neigh->probes, 0); - next = now + neigh->parms->retrans_time; + next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + neigh->parms->retrans_time; + next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - struct sk_buff *skb; - neigh->nud_state = NUD_FAILED; notify = 1; - NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); - NEIGH_PRINTK2("neigh %p is failed.\n", neigh); - - /* It is very thin place. report_unreachable is very complicated - routine. Particularly, it can hit the same neighbour entry! - - So that, we try to be accurate and avoid dead loop. --ANK - */ - while (neigh->nud_state == NUD_FAILED && - (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - write_unlock(&neigh->lock); - neigh->ops->error_report(neigh, skb); - write_lock(&neigh->lock); - } - skb_queue_purge(&neigh->arp_queue); + neigh_invalidate(neigh); + goto out; } if (neigh->nud_state & NUD_IN_TIMER) { @@ -805,31 +955,22 @@ static void neigh_timer_handler(unsigned long arg) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); - /* keep skb alive even if arp_queue overflows */ - if (skb) - skb_get(skb); - write_unlock(&neigh->lock); - neigh->ops->solicit(neigh, skb); - atomic_inc(&neigh->probes); - if (skb) - kfree_skb(skb); + neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } -#ifdef CONFIG_ARPD - if (notify && neigh->parms->app_probes) - neigh_app_notify(neigh); -#endif + if (notify) + neigh_update_notify(neigh); + neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { int rc; - unsigned long now; + bool immediate_probe = false; write_lock_bh(&neigh->lock); @@ -837,59 +978,79 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; - now = jiffies; - if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { - if (neigh->parms->mcast_probes + neigh->parms->app_probes) { - atomic_set(&neigh->probes, neigh->parms->ucast_probes); + if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + + NEIGH_VAR(neigh->parms, APP_PROBES)) { + unsigned long next, now = jiffies; + + atomic_set(&neigh->probes, + NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh->nud_state = NUD_INCOMPLETE; - neigh_hold(neigh); - neigh_add_timer(neigh, now + 1); + neigh->updated = now; + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/2); + neigh_add_timer(neigh, next); + immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; + neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - if (skb) - kfree_skb(skb); + kfree_skb(skb); return 1; } } else if (neigh->nud_state & NUD_STALE) { - NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); - neigh_hold(neigh); + neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; - neigh_add_timer(neigh, - jiffies + neigh->parms->delay_probe_time); + neigh->updated = jiffies; + neigh_add_timer(neigh, jiffies + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { - if (skb_queue_len(&neigh->arp_queue) >= - neigh->parms->queue_len) { + while (neigh->arp_queue_len_bytes + skb->truesize > + NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; - buff = neigh->arp_queue.next; - __skb_unlink(buff, &neigh->arp_queue); + + buff = __skb_dequeue(&neigh->arp_queue); + if (!buff) + break; + neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb(buff); + NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } + skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); + neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: - write_unlock_bh(&neigh->lock); + if (immediate_probe) + neigh_probe(neigh); + else + write_unlock(&neigh->lock); + local_bh_enable(); return rc; } +EXPORT_SYMBOL(__neigh_event_send); -static __inline__ void neigh_update_hhs(struct neighbour *neigh) +static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; - void (*update)(struct hh_cache*, struct net_device*, unsigned char *) = - neigh->dev->header_cache_update; + void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) + = NULL; + + if (neigh->dev->header_ops) + update = neigh->dev->header_ops->cache_update; if (update) { - for (hh = neigh->hh; hh; hh = hh->hh_next) { - write_lock_bh(&hh->hh_lock); + hh = &neigh->hh; + if (hh->hh_len) { + write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); - write_unlock_bh(&hh->hh_lock); + write_sequnlock_bh(&hh->hh_lock); } } } @@ -903,13 +1064,13 @@ static __inline__ void neigh_update_hhs(struct neighbour *neigh) NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" - lladdr instead of overriding it + lladdr instead of overriding it if it is different. It also allows to retain current state if lladdr is unchanged. NEIGH_UPDATE_F_ADMIN means that the change is administrative. - NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing + NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. @@ -922,9 +1083,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, { u8 old; int err; -#ifdef CONFIG_ARPD int notify = 0; -#endif struct net_device *dev; int update_isrouter = 0; @@ -934,7 +1093,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, old = neigh->nud_state; err = -EPERM; - if (!(flags & NEIGH_UPDATE_F_ADMIN) && + if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; @@ -944,9 +1103,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh_suspect(neigh); neigh->nud_state = new; err = 0; -#ifdef CONFIG_ARPD notify = old & NUD_VALID; -#endif + if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + (new & NUD_FAILED)) { + neigh_invalidate(neigh); + notify = 1; + } goto out; } @@ -960,7 +1122,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - compare new & old - if they are different, check override flag */ - if ((old & NUD_VALID) && + if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { @@ -1002,25 +1164,24 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (new != old) { neigh_del_timer(neigh); - if (new & NUD_IN_TIMER) { - neigh_hold(neigh); - neigh_add_timer(neigh, (jiffies + - ((new & NUD_REACHABLE) ? + if (new & NUD_IN_TIMER) + neigh_add_timer(neigh, (jiffies + + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - } neigh->nud_state = new; + notify = 1; } if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - - (neigh->parms->base_reachable_time << 1); -#ifdef CONFIG_ARPD + (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; -#endif } if (new == old) goto out; @@ -1035,15 +1196,34 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - struct neighbour *n1 = neigh; + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); - /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (skb->dst && skb->dst->neighbour) - n1 = skb->dst->neighbour; - n1->output(skb); + + rcu_read_lock(); + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } + n1->output(n1, skb); + if (n2) + neigh_release(n2); + rcu_read_unlock(); + write_lock_bh(&neigh->lock); } - skb_queue_purge(&neigh->arp_queue); + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) { @@ -1052,12 +1232,28 @@ out: (neigh->flags & ~NTF_ROUTER); } write_unlock_bh(&neigh->lock); -#ifdef CONFIG_ARPD - if (notify && neigh->parms->app_probes) - neigh_app_notify(neigh); -#endif + + if (notify) + neigh_update_notify(neigh); + return err; } +EXPORT_SYMBOL(neigh_update); + +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. + */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ + neigh->updated = jiffies; + if (!(neigh->nud_state & NUD_FAILED)) + return; + neigh->nud_state = NUD_INCOMPLETE; + atomic_set(&neigh->probes, neigh_max_probes(neigh)); + neigh_add_timer(neigh, + jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, @@ -1066,158 +1262,145 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) - neigh_update(neigh, lladdr, NUD_STALE, + neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE); return neigh; } +EXPORT_SYMBOL(neigh_event_ns); -static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, - u16 protocol) +/* called with read_lock_bh(&n->lock); */ +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst) { - struct hh_cache *hh; struct net_device *dev = dst->dev; + __be16 prot = dst->ops->protocol; + struct hh_cache *hh = &n->hh; - for (hh = n->hh; hh; hh = hh->hh_next) - if (hh->hh_type == protocol) - break; + write_lock_bh(&n->lock); - if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { - memset(hh, 0, sizeof(struct hh_cache)); - rwlock_init(&hh->hh_lock); - hh->hh_type = protocol; - atomic_set(&hh->hh_refcnt, 0); - hh->hh_next = NULL; - if (dev->hard_header_cache(n, hh)) { - kfree(hh); - hh = NULL; - } else { - atomic_inc(&hh->hh_refcnt); - hh->hh_next = n->hh; - n->hh = hh; - if (n->nud_state & NUD_CONNECTED) - hh->hh_output = n->ops->hh_output; - else - hh->hh_output = n->ops->output; - } - } - if (hh) { - atomic_inc(&hh->hh_refcnt); - dst->hh = hh; - } + /* Only one thread can come in here and initialize the + * hh_cache entry. + */ + if (!hh->hh_len) + dev->header_ops->cache(n, hh, prot); + + write_unlock_bh(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit - worked, f.e. if you want to override normal output path (eql, shaper), - but resolution is not made yet. + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. */ -int neigh_compat_output(struct sk_buff *skb) +int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = skb->dev; - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); - if (dev->hard_header && - dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, - skb->len) < 0 && - dev->rebuild_header(skb)) + if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, + skb->len) < 0 && + dev_rebuild_header(skb)) return 0; return dev_queue_xmit(skb); } +EXPORT_SYMBOL(neigh_compat_output); /* Slow and careful. */ -int neigh_resolve_output(struct sk_buff *skb) +int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; - struct neighbour *neigh; + struct dst_entry *dst = skb_dst(skb); int rc = 0; - if (!dst || !(neigh = dst->neighbour)) + if (!dst) goto discard; - __skb_pull(skb, skb->nh.raw - skb->data); - if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; - if (dev->hard_header_cache && !dst->hh) { - write_lock_bh(&neigh->lock); - if (!dst->hh) - neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - write_unlock_bh(&neigh->lock); - } else { - read_lock_bh(&neigh->lock); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); - } + unsigned int seq; + + if (dev->header_ops->cache && !neigh->hh.hh_len) + neigh_hh_init(neigh, dst); + + do { + __skb_pull(skb, skb_network_offset(skb)); + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + if (err >= 0) - rc = neigh->ops->queue_xmit(skb); + rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; discard: - NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", - dst, dst ? dst->neighbour : NULL); + neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh); out_kfree_skb: rc = -EINVAL; kfree_skb(skb); goto out; } +EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ -int neigh_connected_output(struct sk_buff *skb) +int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { - int err; - struct dst_entry *dst = skb->dst; - struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; + unsigned int seq; + int err; - __skb_pull(skb, skb->nh.raw - skb->data); + do { + __skb_pull(skb, skb_network_offset(skb)); + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); - read_lock_bh(&neigh->lock); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); if (err >= 0) - err = neigh->ops->queue_xmit(skb); + err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb(skb); } return err; } +EXPORT_SYMBOL(neigh_connected_output); + +int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} +EXPORT_SYMBOL(neigh_direct_output); static void neigh_proxy_process(unsigned long arg) { struct neigh_table *tbl = (struct neigh_table *)arg; long sched_next = 0; unsigned long now = jiffies; - struct sk_buff *skb; + struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); - skb = tbl->proxy_queue.next; - - while (skb != (struct sk_buff *)&tbl->proxy_queue) { - struct sk_buff *back = skb; - long tdif = NEIGH_CB(back)->sched_next - now; + skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { + long tdif = NEIGH_CB(skb)->sched_next - now; - skb = skb->next; if (tdif <= 0) { - struct net_device *dev = back->dev; - __skb_unlink(back, &tbl->proxy_queue); - if (tbl->proxy_redo && netif_running(dev)) - tbl->proxy_redo(back); - else - kfree_skb(back); + struct net_device *dev = skb->dev; + + __skb_unlink(skb, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) { + rcu_read_lock(); + tbl->proxy_redo(skb); + rcu_read_unlock(); + } else { + kfree_skb(skb); + } dev_put(dev); } else if (!sched_next || tdif < sched_next) @@ -1233,9 +1416,11 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long now = jiffies; - unsigned long sched_next = now + (net_random() % p->proxy_delay); - if (tbl->proxy_queue.qlen > p->proxy_qlen) { + unsigned long sched_next = now + (prandom_u32() % + NEIGH_VAR(p, PROXY_DELAY)); + + if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } @@ -1248,44 +1433,63 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } +EXPORT_SYMBOL(pneigh_enqueue); + +static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, + struct net *net, int ifindex) +{ + struct neigh_parms *p; + + for (p = &tbl->parms; p; p = p->next) { + if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || + (!p->dev && !ifindex && net_eq(net, &init_net))) + return p; + } + return NULL; +} struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { - struct neigh_parms *p = kmalloc(sizeof(*p), GFP_KERNEL); + struct neigh_parms *p; + struct net *net = dev_net(dev); + const struct net_device_ops *ops = dev->netdev_ops; + p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { - memcpy(p, &tbl->parms, sizeof(*p)); p->tbl = tbl; atomic_set(&p->refcnt, 1); - INIT_RCU_HEAD(&p->rcu_head); p->reachable_time = - neigh_rand_reach_time(p->base_reachable_time); - if (dev) { - if (dev->neigh_setup && dev->neigh_setup(dev, p)) { - kfree(p); - return NULL; - } + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; - dev_hold(dev); - p->dev = dev; + if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + release_net(net); + dev_put(dev); + kfree(p); + return NULL; } - p->sysctl_table = NULL; + write_lock_bh(&tbl->lock); p->next = tbl->parms.next; tbl->parms.next = p; write_unlock_bh(&tbl->lock); + + neigh_parms_data_state_cleanall(p); } return p; } +EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { @@ -1314,90 +1518,97 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } } write_unlock_bh(&tbl->lock); - NEIGH_PRINTK1("neigh_parms_release: not found\n"); + neigh_dbg(1, "%s: not found\n", __func__); } +EXPORT_SYMBOL(neigh_parms_release); -void neigh_parms_destroy(struct neigh_parms *parms) +static void neigh_parms_destroy(struct neigh_parms *parms) { + release_net(neigh_parms_net(parms)); kfree(parms); } +static struct lock_class_key neigh_table_proxy_queue_class; -void neigh_table_init(struct neigh_table *tbl) +static void neigh_table_init_no_netlink(struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; + write_pnet(&tbl->parms.net, &init_net); atomic_set(&tbl->parms.refcnt, 1); - INIT_RCU_HEAD(&tbl->parms.rcu_head); tbl->parms.reachable_time = - neigh_rand_reach_time(tbl->parms.base_reachable_time); - - if (!tbl->kmem_cachep) - tbl->kmem_cachep = kmem_cache_create(tbl->id, - tbl->entry_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - - if (!tbl->kmem_cachep) - panic("cannot create neighbour cache"); + neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); - + #ifdef CONFIG_PROC_FS - tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); - if (!tbl->pde) + if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat, + &neigh_stat_seq_fops, tbl)) panic("cannot create neighbour proc dir entry"); - tbl->pde->proc_fops = &neigh_stat_seq_fops; - tbl->pde->data = tbl; #endif - tbl->hash_mask = 1; - tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); - tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL); + tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); - if (!tbl->hash_buckets || !tbl->phash_buckets) + if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); - memset(tbl->phash_buckets, 0, phsize); - - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + if (!tbl->entry_size) + tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + + tbl->key_len, NEIGH_PRIV_ALIGN); + else + WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); - init_timer(&tbl->gc_timer); - tbl->gc_timer.data = (unsigned long)tbl; - tbl->gc_timer.function = neigh_periodic_timer; - tbl->gc_timer.expires = now + 1; - add_timer(&tbl->gc_timer); - - init_timer(&tbl->proxy_timer); - tbl->proxy_timer.data = (unsigned long)tbl; - tbl->proxy_timer.function = neigh_proxy_process; - skb_queue_head_init(&tbl->proxy_queue); + INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + tbl->parms.reachable_time); + setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); + skb_queue_head_init_class(&tbl->proxy_queue, + &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; +} + +void neigh_table_init(struct neigh_table *tbl) +{ + struct neigh_table *tmp; + + neigh_table_init_no_netlink(tbl); write_lock(&neigh_tbl_lock); + for (tmp = neigh_tables; tmp; tmp = tmp->next) { + if (tmp->family == tbl->family) + break; + } tbl->next = neigh_tables; neigh_tables = tbl; write_unlock(&neigh_tbl_lock); + + if (unlikely(tmp)) { + pr_err("Registering multiple tables for family %d\n", + tbl->family); + dump_stack(); + } } +EXPORT_SYMBOL(neigh_table_init); int neigh_table_clear(struct neigh_table *tbl) { struct neigh_table **tp; /* It is not clean... Fix it to unload IPv6 module safely */ - del_timer_sync(&tbl->gc_timer); + cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) - printk(KERN_CRIT "neighbour leakage\n"); + pr_crit("neighbour leakage\n"); write_lock(&neigh_tbl_lock); for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { if (*tp == tbl) { @@ -1407,216 +1618,274 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); - tbl->hash_buckets = NULL; + call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, + neigh_hash_free_rcu); + tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; + remove_proc_entry(tbl->id, init_net.proc_net_stat); + + free_percpu(tbl->stats); + tbl->stats = NULL; + return 0; } +EXPORT_SYMBOL(neigh_table_clear); -int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct ndmsg *ndm = NLMSG_DATA(nlh); - struct rtattr **nda = arg; + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *dst_attr; struct neigh_table *tbl; struct net_device *dev = NULL; - int err = -ENODEV; + int err = -EINVAL; - if (ndm->ndm_ifindex && - (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + ASSERT_RTNL(); + if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; + dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); + if (dst_attr == NULL) + goto out; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + } + read_lock(&neigh_tbl_lock); for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct rtattr *dst_attr = nda[NDA_DST - 1]; - struct neighbour *n; + struct neighbour *neigh; if (tbl->family != ndm->ndm_family) continue; read_unlock(&neigh_tbl_lock); - err = -EINVAL; - if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) - goto out_dev_put; + if (nla_len(dst_attr) < tbl->key_len) + goto out; if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, RTA_DATA(dst_attr), dev); - goto out_dev_put; + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; } - if (!dev) + if (dev == NULL) goto out; - n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); - if (n) { - err = neigh_update(n, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_ADMIN); - neigh_release(n); + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; + goto out; } - goto out_dev_put; + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + goto out; } read_unlock(&neigh_tbl_lock); - err = -EADDRNOTAVAIL; -out_dev_put: - if (dev) - dev_put(dev); + err = -EAFNOSUPPORT; + out: return err; } -int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct ndmsg *ndm = NLMSG_DATA(nlh); - struct rtattr **nda = arg; + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; - int err = -ENODEV; + int err; + + ASSERT_RTNL(); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + goto out; - if (ndm->ndm_ifindex && - (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + err = -EINVAL; + if (tb[NDA_DST] == NULL) goto out; + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + goto out; + } + read_lock(&neigh_tbl_lock); for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct rtattr *lladdr_attr = nda[NDA_LLADDR - 1]; - struct rtattr *dst_attr = nda[NDA_DST - 1]; - int override = 1; - struct neighbour *n; + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; + struct neighbour *neigh; + void *dst, *lladdr; if (tbl->family != ndm->ndm_family) continue; read_unlock(&neigh_tbl_lock); - err = -EINVAL; - if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) - goto out_dev_put; + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + err = -ENOBUFS; - if (pneigh_lookup(tbl, RTA_DATA(dst_attr), dev, 1)) + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; err = 0; - goto out_dev_put; + } + goto out; } - err = -EINVAL; - if (!dev) + if (dev == NULL) goto out; - if (lladdr_attr && RTA_PAYLOAD(lladdr_attr) < dev->addr_len) - goto out_dev_put; - - n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); - if (n) { - if (nlh->nlmsg_flags & NLM_F_EXCL) { - err = -EEXIST; - neigh_release(n); - goto out_dev_put; + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; + goto out; + } + + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; } - - override = nlh->nlmsg_flags & NLM_F_REPLACE; - } else if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - err = -ENOENT; - goto out_dev_put; } else { - n = __neigh_lookup_errno(tbl, RTA_DATA(dst_attr), dev); - if (IS_ERR(n)) { - err = PTR_ERR(n); - goto out_dev_put; + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); + goto out; } - } - err = neigh_update(n, - lladdr_attr ? RTA_DATA(lladdr_attr) : NULL, - ndm->ndm_state, - (override ? NEIGH_UPDATE_F_OVERRIDE : 0) | - NEIGH_UPDATE_F_ADMIN); + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; + } - neigh_release(n); - goto out_dev_put; + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + goto out; } read_unlock(&neigh_tbl_lock); - err = -EADDRNOTAVAIL; -out_dev_put: - if (dev) - dev_put(dev); + err = -EAFNOSUPPORT; out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { - struct rtattr *nest = NULL; - - nest = RTA_NEST(skb, NDTA_PARMS); + struct nlattr *nest; - if (parms->dev) - RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); - - RTA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); - RTA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); - RTA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); - RTA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); - RTA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); - RTA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes); - RTA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time); - RTA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME, - parms->base_reachable_time); - RTA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime); - RTA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time); - RTA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time); - RTA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay); - RTA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay); - RTA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime); - - return RTA_NEST_END(skb, nest); - -rtattr_failure: - return RTA_NEST_CANCEL(skb, nest); -} + nest = nla_nest_start(skb, NDTA_PARMS); + if (nest == NULL) + return -ENOBUFS; -static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + if ((parms->dev && + nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || + nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || + nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, + NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || + /* approximative value for deprecated QUEUE_LEN (in packets) */ + nla_put_u32(skb, NDTPA_QUEUE_LEN, + NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || + nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || + nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || + nla_put_u32(skb, NDTPA_UCAST_PROBES, + NEIGH_VAR(parms, UCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_PROBES, + NEIGH_VAR(parms, MCAST_PROBES)) || + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || + nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, + NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || + nla_put_msecs(skb, NDTPA_GC_STALETIME, + NEIGH_VAR(parms, GC_STALETIME)) || + nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, + NEIGH_VAR(parms, DELAY_PROBE_TIME)) || + nla_put_msecs(skb, NDTPA_RETRANS_TIME, + NEIGH_VAR(parms, RETRANS_TIME)) || + nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, + NEIGH_VAR(parms, ANYCAST_DELAY)) || + nla_put_msecs(skb, NDTPA_PROXY_DELAY, + NEIGH_VAR(parms, PROXY_DELAY)) || + nla_put_msecs(skb, NDTPA_LOCKTIME, + NEIGH_VAR(parms, LOCKTIME))) + goto nla_put_failure; + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, + u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; - nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg), - NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; - ndtmsg = NLMSG_DATA(nlh); + ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; - RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); - RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); - RTA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1); - RTA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2); - RTA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3); - + if (nla_put_string(skb, NDTA_NAME, tbl->id) || + nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) || + nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || + nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || + nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) + goto nla_put_failure; { unsigned long now = jiffies; unsigned int flush_delta = now - tbl->last_flush; unsigned int rand_delta = now - tbl->last_rand; - + struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_hash_rnd = tbl->hash_rnd, - .ndtc_hash_mask = tbl->hash_mask, - .ndtc_hash_chain_gc = tbl->hash_chain_gc, .ndtc_proxy_qlen = tbl->proxy_queue.qlen, }; - RTA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd[0]; + ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); + rcu_read_unlock_bh(); + + if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) + goto nla_put_failure; } { @@ -1625,7 +1894,7 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb, memset(&ndst, 0, sizeof(ndst)); - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); @@ -1641,312 +1910,477 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb, ndst.ndts_forced_gc_runs += st->forced_gc_runs; } - RTA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst); + if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) + goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) - goto rtattr_failure; + goto nla_put_failure; read_unlock_bh(&tbl->lock); - return NLMSG_END(skb, nlh); + return nlmsg_end(skb, nlh); -rtattr_failure: +nla_put_failure: read_unlock_bh(&tbl->lock); - return NLMSG_CANCEL(skb, nlh); - -nlmsg_failure: - return -1; + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } -static int neightbl_fill_param_info(struct neigh_table *tbl, +static int neightbl_fill_param_info(struct sk_buff *skb, + struct neigh_table *tbl, struct neigh_parms *parms, - struct sk_buff *skb, - struct netlink_callback *cb) + u32 pid, u32 seq, int type, + unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; - nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg), - NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; - ndtmsg = NLMSG_DATA(nlh); + ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; - RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); - if (neightbl_fill_parms(skb, parms) < 0) - goto rtattr_failure; + if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || + neightbl_fill_parms(skb, parms) < 0) + goto errout; read_unlock_bh(&tbl->lock); - return NLMSG_END(skb, nlh); - -rtattr_failure: + return nlmsg_end(skb, nlh); +errout: read_unlock_bh(&tbl->lock); - return NLMSG_CANCEL(skb, nlh); - -nlmsg_failure: - return -1; + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } - -static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl, - int ifindex) -{ - struct neigh_parms *p; - - for (p = &tbl->parms; p; p = p->next) - if ((p->dev && p->dev->ifindex == ifindex) || - (!p->dev && !ifindex)) - return p; - return NULL; -} +static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { + [NDTA_NAME] = { .type = NLA_STRING }, + [NDTA_THRESH1] = { .type = NLA_U32 }, + [NDTA_THRESH2] = { .type = NLA_U32 }, + [NDTA_THRESH3] = { .type = NLA_U32 }, + [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, + [NDTA_PARMS] = { .type = NLA_NESTED }, +}; -int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { + [NDTPA_IFINDEX] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, + [NDTPA_APP_PROBES] = { .type = NLA_U32 }, + [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, + [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, + [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, + [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, + [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, + [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, + [NDTPA_LOCKTIME] = { .type = NLA_U64 }, +}; + +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = sock_net(skb->sk); struct neigh_table *tbl; - struct ndtmsg *ndtmsg = NLMSG_DATA(nlh); - struct rtattr **tb = arg; - int err = -EINVAL; + struct ndtmsg *ndtmsg; + struct nlattr *tb[NDTA_MAX+1]; + int err; - if (!tb[NDTA_NAME - 1] || !RTA_PAYLOAD(tb[NDTA_NAME - 1])) - return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, + nl_neightbl_policy); + if (err < 0) + goto errout; + + if (tb[NDTA_NAME] == NULL) { + err = -EINVAL; + goto errout; + } + ndtmsg = nlmsg_data(nlh); read_lock(&neigh_tbl_lock); for (tbl = neigh_tables; tbl; tbl = tbl->next) { if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; - if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id)) + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) break; } if (tbl == NULL) { err = -ENOENT; - goto errout; + goto errout_locked; } - /* + /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ write_lock_bh(&tbl->lock); - if (tb[NDTA_THRESH1 - 1]) - tbl->gc_thresh1 = RTA_GET_U32(tb[NDTA_THRESH1 - 1]); - - if (tb[NDTA_THRESH2 - 1]) - tbl->gc_thresh2 = RTA_GET_U32(tb[NDTA_THRESH2 - 1]); - - if (tb[NDTA_THRESH3 - 1]) - tbl->gc_thresh3 = RTA_GET_U32(tb[NDTA_THRESH3 - 1]); - - if (tb[NDTA_GC_INTERVAL - 1]) - tbl->gc_interval = RTA_GET_MSECS(tb[NDTA_GC_INTERVAL - 1]); - - if (tb[NDTA_PARMS - 1]) { - struct rtattr *tbp[NDTPA_MAX]; + if (tb[NDTA_PARMS]) { + struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; - u32 ifindex = 0; + int i, ifindex = 0; - if (rtattr_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS - 1]) < 0) - goto rtattr_failure; + err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], + nl_ntbl_parm_policy); + if (err < 0) + goto errout_tbl_lock; - if (tbp[NDTPA_IFINDEX - 1]) - ifindex = RTA_GET_U32(tbp[NDTPA_IFINDEX - 1]); + if (tbp[NDTPA_IFINDEX]) + ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); - p = lookup_neigh_params(tbl, ifindex); + p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; - goto rtattr_failure; + goto errout_tbl_lock; } - - if (tbp[NDTPA_QUEUE_LEN - 1]) - p->queue_len = RTA_GET_U32(tbp[NDTPA_QUEUE_LEN - 1]); - - if (tbp[NDTPA_PROXY_QLEN - 1]) - p->proxy_qlen = RTA_GET_U32(tbp[NDTPA_PROXY_QLEN - 1]); - - if (tbp[NDTPA_APP_PROBES - 1]) - p->app_probes = RTA_GET_U32(tbp[NDTPA_APP_PROBES - 1]); - if (tbp[NDTPA_UCAST_PROBES - 1]) - p->ucast_probes = - RTA_GET_U32(tbp[NDTPA_UCAST_PROBES - 1]); - - if (tbp[NDTPA_MCAST_PROBES - 1]) - p->mcast_probes = - RTA_GET_U32(tbp[NDTPA_MCAST_PROBES - 1]); - - if (tbp[NDTPA_BASE_REACHABLE_TIME - 1]) - p->base_reachable_time = - RTA_GET_MSECS(tbp[NDTPA_BASE_REACHABLE_TIME - 1]); + for (i = 1; i <= NDTPA_MAX; i++) { + if (tbp[i] == NULL) + continue; - if (tbp[NDTPA_GC_STALETIME - 1]) - p->gc_staletime = - RTA_GET_MSECS(tbp[NDTPA_GC_STALETIME - 1]); + switch (i) { + case NDTPA_QUEUE_LEN: + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN)); + break; + case NDTPA_QUEUE_LENBYTES: + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i])); + break; + case NDTPA_PROXY_QLEN: + NEIGH_VAR_SET(p, PROXY_QLEN, + nla_get_u32(tbp[i])); + break; + case NDTPA_APP_PROBES: + NEIGH_VAR_SET(p, APP_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_UCAST_PROBES: + NEIGH_VAR_SET(p, UCAST_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_MCAST_PROBES: + NEIGH_VAR_SET(p, MCAST_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_BASE_REACHABLE_TIME: + NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, + nla_get_msecs(tbp[i])); + break; + case NDTPA_GC_STALETIME: + NEIGH_VAR_SET(p, GC_STALETIME, + nla_get_msecs(tbp[i])); + break; + case NDTPA_DELAY_PROBE_TIME: + NEIGH_VAR_SET(p, DELAY_PROBE_TIME, + nla_get_msecs(tbp[i])); + break; + case NDTPA_RETRANS_TIME: + NEIGH_VAR_SET(p, RETRANS_TIME, + nla_get_msecs(tbp[i])); + break; + case NDTPA_ANYCAST_DELAY: + NEIGH_VAR_SET(p, ANYCAST_DELAY, + nla_get_msecs(tbp[i])); + break; + case NDTPA_PROXY_DELAY: + NEIGH_VAR_SET(p, PROXY_DELAY, + nla_get_msecs(tbp[i])); + break; + case NDTPA_LOCKTIME: + NEIGH_VAR_SET(p, LOCKTIME, + nla_get_msecs(tbp[i])); + break; + } + } + } - if (tbp[NDTPA_DELAY_PROBE_TIME - 1]) - p->delay_probe_time = - RTA_GET_MSECS(tbp[NDTPA_DELAY_PROBE_TIME - 1]); + err = -ENOENT; + if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || + tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && + !net_eq(net, &init_net)) + goto errout_tbl_lock; - if (tbp[NDTPA_RETRANS_TIME - 1]) - p->retrans_time = - RTA_GET_MSECS(tbp[NDTPA_RETRANS_TIME - 1]); + if (tb[NDTA_THRESH1]) + tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); - if (tbp[NDTPA_ANYCAST_DELAY - 1]) - p->anycast_delay = - RTA_GET_MSECS(tbp[NDTPA_ANYCAST_DELAY - 1]); + if (tb[NDTA_THRESH2]) + tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); - if (tbp[NDTPA_PROXY_DELAY - 1]) - p->proxy_delay = - RTA_GET_MSECS(tbp[NDTPA_PROXY_DELAY - 1]); + if (tb[NDTA_THRESH3]) + tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); - if (tbp[NDTPA_LOCKTIME - 1]) - p->locktime = RTA_GET_MSECS(tbp[NDTPA_LOCKTIME - 1]); - } + if (tb[NDTA_GC_INTERVAL]) + tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); err = 0; -rtattr_failure: +errout_tbl_lock: write_unlock_bh(&tbl->lock); -errout: +errout_locked: read_unlock(&neigh_tbl_lock); +errout: return err; } -int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { - int idx, family; - int s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family, tidx, nidx = 0; + int tbl_skip = cb->args[0]; + int neigh_skip = cb->args[1]; struct neigh_table *tbl; - family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; + family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables, idx = 0; tbl; tbl = tbl->next) { + for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { struct neigh_parms *p; - if (idx < s_idx || (family && tbl->family != family)) + if (tidx < tbl_skip || (family && tbl->family != family)) continue; - if (neightbl_fill_info(tbl, skb, cb) <= 0) + if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + NLM_F_MULTI) <= 0) break; - for (++idx, p = tbl->parms.next; p; p = p->next, idx++) { - if (idx < s_idx) + for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + if (!net_eq(neigh_parms_net(p), net)) continue; - if (neightbl_fill_param_info(tbl, p, skb, cb) <= 0) + if (nidx < neigh_skip) + goto next; + + if (neightbl_fill_param_info(skb, tbl, p, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGHTBL, + NLM_F_MULTI) <= 0) goto out; + next: + nidx++; } + neigh_skip = 0; } out: read_unlock(&neigh_tbl_lock); - cb->args[0] = idx; + cb->args[0] = tidx; + cb->args[1] = nidx; return skb->len; } -static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, - u32 pid, u32 seq, int event, unsigned int flags) +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) { unsigned long now = jiffies; - unsigned char *b = skb->tail; struct nda_cacheinfo ci; - int locked = 0; - u32 probes; - struct nlmsghdr *nlh = NLMSG_NEW(skb, pid, seq, event, - sizeof(struct ndmsg), flags); - struct ndmsg *ndm = NLMSG_DATA(nlh); + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; - ndm->ndm_family = n->ops->family; + ndm = nlmsg_data(nlh); + ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = n->flags; - ndm->ndm_type = n->type; - ndm->ndm_ifindex = n->dev->ifindex; - RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); - read_lock_bh(&n->lock); - locked = 1; - ndm->ndm_state = n->nud_state; - if (n->nud_state & NUD_VALID) - RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); - ci.ndm_used = now - n->used; - ci.ndm_confirmed = now - n->confirmed; - ci.ndm_updated = now - n->updated; - ci.ndm_refcnt = atomic_read(&n->refcnt) - 1; - probes = atomic_read(&n->probes); - read_unlock_bh(&n->lock); - locked = 0; - RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); - RTA_PUT(skb, NDA_PROBES, sizeof(probes), &probes); - nlh->nlmsg_len = skb->tail - b; - return skb->len; + ndm->ndm_flags = neigh->flags; + ndm->ndm_type = neigh->type; + ndm->ndm_ifindex = neigh->dev->ifindex; + + if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) + goto nla_put_failure; + + read_lock_bh(&neigh->lock); + ndm->ndm_state = neigh->nud_state; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } + } + + ci.ndm_used = jiffies_to_clock_t(now - neigh->used); + ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); + ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); + ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1; + read_unlock_bh(&neigh->lock); + + if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || + nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, + u32 pid, u32 seq, int type, unsigned int flags, + struct neigh_table *tbl) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; -nlmsg_failure: -rtattr_failure: - if (locked) - read_unlock_bh(&n->lock); - skb_trim(skb, b - skb->data); - return -1; + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = tbl->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = pn->flags | NTF_PROXY; + ndm->ndm_type = RTN_UNICAST; + ndm->ndm_ifindex = pn->dev->ifindex; + ndm->ndm_state = NUD_NONE; + + if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } +static void neigh_update_notify(struct neighbour *neigh) +{ + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + __neigh_notify(neigh, RTM_NEWNEIGH, 0); +} static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; - for (h = 0; h <= tbl->hash_mask; h++) { - if (h < s_h) - continue; + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { - if (idx < s_idx) + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!net_eq(dev_net(n->dev), net)) continue; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + if (idx < s_idx) + goto next; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) <= 0) { - read_unlock_bh(&tbl->lock); rc = -1; goto out; } +next: + idx++; } - read_unlock_bh(&tbl->lock); } rc = skb->len; out: + rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; return rc; } -int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct pneigh_entry *n; + struct net *net = sock_net(skb->sk); + int rc, h, s_h = cb->args[3]; + int idx, s_idx = idx = cb->args[4]; + + read_lock_bh(&tbl->lock); + + for (h = s_h; h <= PNEIGH_HASHMASK; h++) { + if (h > s_h) + s_idx = 0; + for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + if (dev_net(n->dev) != net) + continue; + if (idx < s_idx) + goto next; + if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, tbl) <= 0) { + read_unlock_bh(&tbl->lock); + rc = -1; + goto out; + } + next: + idx++; + } + } + + read_unlock_bh(&tbl->lock); + rc = skb->len; +out: + cb->args[3] = h; + cb->args[4] = idx; + return rc; + +} + +static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { struct neigh_table *tbl; int t, family, s_t; + int proxy = 0; + int err; read_lock(&neigh_tbl_lock); - family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; + family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + + /* check for full ndmsg structure presence, family member is + * the same for both structures + */ + if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && + ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) + proxy = 1; + s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { + for (tbl = neigh_tables, t = 0; tbl; + tbl = tbl->next, t++) { if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (neigh_dump_table(tbl, skb, cb) < 0) + if (proxy) + err = pneigh_dump_table(tbl, skb, cb); + else + err = neigh_dump_table(tbl, skb, cb); + if (err < 0) break; } read_unlock(&neigh_tbl_lock); @@ -1958,15 +2392,22 @@ int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (chain = 0; chain <= tbl->hash_mask; chain++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); /* avoid resizes */ + for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = tbl->hash_buckets[chain]; n; n = n->next) + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_for_each); @@ -1975,24 +2416,31 @@ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { int chain; + struct neigh_hash_table *nht; - for (chain = 0; chain <= tbl->hash_mask; chain++) { - struct neighbour *n, **np; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct neighbour *n; + struct neighbour __rcu **np; - np = &tbl->hash_buckets[chain]; - while ((n = *np) != NULL) { + np = &nht->hash_buckets[chain]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { int release; write_lock(&n->lock); release = cb(n); if (release) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; } else np = &n->next; write_unlock(&n->lock); if (release) - neigh_release(n); + neigh_cleanup_and_release(n); } } } @@ -2003,15 +2451,18 @@ EXPORT_SYMBOL(__neigh_for_each_release); static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; int bucket = state->bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { - n = tbl->hash_buckets[bucket]; + for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { + n = rcu_dereference_bh(nht->hash_buckets[bucket]); while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; @@ -2024,8 +2475,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) break; if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2041,17 +2492,20 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } - n = n->next; + n = rcu_dereference_bh(n->next); while (1) { while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) @@ -2063,17 +2517,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) break; - if (++state->bucket > tbl->hash_mask) + if (++state->bucket >= (1 << nht->hash_shift)) break; - n = tbl->hash_buckets[state->bucket]; + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } if (n && pos) @@ -2086,6 +2540,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) struct neighbour *n = neigh_get_first(seq); if (n) { + --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) @@ -2098,6 +2553,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket = state->bucket; @@ -2105,6 +2561,8 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = tbl->phash_buckets[bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; if (pn) break; } @@ -2118,13 +2576,19 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; - pn = pn->next; + do { + pn = pn->next; + } while (pn && !net_eq(pneigh_net(pn), net)); + while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = tbl->phash_buckets[state->bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; if (pn) break; } @@ -2140,6 +2604,7 @@ static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { + --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) @@ -2153,27 +2618,28 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; + loff_t idxpos = *pos; - rc = neigh_get_idx(seq, pos); + rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) - rc = pneigh_get_idx(seq, pos); + rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) + __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; - loff_t pos_minus_one; state->tbl = tbl; state->bucket = 0; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - read_lock_bh(&tbl->lock); + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); - pos_minus_one = *pos - 1; - return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN; + return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2183,7 +2649,7 @@ void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) void *rc; if (v == SEQ_START_TOKEN) { - rc = neigh_get_idx(seq, pos); + rc = neigh_get_first(seq); goto out; } @@ -2205,11 +2671,9 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) + __releases(rcu_bh) { - struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; - - read_unlock_bh(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -2217,14 +2681,13 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; int cpu; if (*pos == 0) return SEQ_START_TOKEN; - - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; @@ -2235,11 +2698,10 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; int cpu; - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; @@ -2255,17 +2717,16 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs\n"); + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx\n", + "%08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2281,13 +2742,14 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->rcv_probes_ucast, st->periodic_gc_runs, - st->forced_gc_runs + st->forced_gc_runs, + st->unres_discards ); return 0; } -static struct seq_operations neigh_stat_seq_ops = { +static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, @@ -2300,12 +2762,12 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode); + sf->private = PDE_DATA(inode); } return ret; }; -static struct file_operations neigh_stat_seq_fops = { +static const struct file_operations neigh_stat_seq_fops = { .owner = THIS_MODULE, .open = neigh_stat_seq_open, .read = seq_read, @@ -2315,357 +2777,364 @@ static struct file_operations neigh_stat_seq_fops = { #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_ARPD -void neigh_app_ns(struct neighbour *n) +static inline size_t neigh_nlmsg_size(void) { - struct nlmsghdr *nlh; - int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); - struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4); /* NDA_PROBES */ +} - if (!skb) - return; +static void __neigh_notify(struct neighbour *n, int type, int flags) +{ + struct net *net = dev_net(n->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; - if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH, 0) < 0) { + err = neigh_fill_info(skb, n, 0, 0, type, flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - return; + goto errout; } - nlh = (struct nlmsghdr *)skb->data; - nlh->nlmsg_flags = NLM_F_REQUEST; - NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; - netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -static void neigh_app_notify(struct neighbour *n) +void neigh_app_ns(struct neighbour *n) { - struct nlmsghdr *nlh; - int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); - struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); +} +EXPORT_SYMBOL(neigh_app_ns); - if (!skb) - return; +#ifdef CONFIG_SYSCTL +static int zero; +static int int_max = INT_MAX; +static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); - if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH, 0) < 0) { - kfree_skb(skb); - return; +static int proc_unres_qlen(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int size, ret; + struct ctl_table tmp = *ctl; + + tmp.extra1 = &zero; + tmp.extra2 = &unres_qlen_max; + tmp.data = &size; + + size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && !ret) + *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); + return ret; +} + +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); } - nlh = (struct nlmsghdr *)skb->data; - NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; - netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); + return NULL; } -#endif /* CONFIG_ARPD */ +static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, + int index) +{ + struct net_device *dev; + int family = neigh_parms_family(p); -#ifdef CONFIG_SYSCTL + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct neigh_parms *dst_p = + neigh_get_dev_parms_rcu(dev, family); + + if (dst_p && !test_bit(index, dst_p->data_state)) + dst_p->data[index] = p->data[index]; + } + rcu_read_unlock(); +} + +static void neigh_proc_update(struct ctl_table *ctl, int write) +{ + struct net_device *dev = ctl->extra1; + struct neigh_parms *p = ctl->extra2; + struct net *net = neigh_parms_net(p); + int index = (int *) ctl->data - p->data; + + if (!write) + return; + + set_bit(index, p->data_state); + if (!dev) /* NULL dev means this is default value */ + neigh_copy_dflt_parms(net, p, index); +} + +static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + tmp.extra1 = &zero; + tmp.extra2 = &int_max; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec); + +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); + +static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); + +static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +#define NEIGH_PARMS_DATA_OFFSET(index) \ + (&((struct neigh_parms *) 0)->data[index]) + +#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ + [NEIGH_VAR_ ## attr] = { \ + .procname = name, \ + .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ + .maxlen = sizeof(int), \ + .mode = mval, \ + .proc_handler = proc, \ + } + +#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) + +#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) + +#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table neigh_vars[__NET_NEIGH_MAX]; - ctl_table neigh_dev[2]; - ctl_table neigh_neigh_dir[2]; - ctl_table neigh_proto_dir[2]; - ctl_table neigh_root_dir[2]; -} neigh_sysctl_template = { + struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; +} neigh_sysctl_template __read_mostly = { .neigh_vars = { - { - .ctl_name = NET_NEIGH_MCAST_SOLICIT, - .procname = "mcast_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NEIGH_UCAST_SOLICIT, - .procname = "ucast_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NEIGH_APP_SOLICIT, - .procname = "app_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NEIGH_RETRANS_TIME, - .procname = "retrans_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, - }, - { - .ctl_name = NET_NEIGH_REACHABLE_TIME, - .procname = "base_reachable_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, - { - .ctl_name = NET_NEIGH_DELAY_PROBE_TIME, - .procname = "delay_first_probe_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, - { - .ctl_name = NET_NEIGH_GC_STALE_TIME, - .procname = "gc_stale_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, - { - .ctl_name = NET_NEIGH_UNRES_QLEN, - .procname = "unres_qlen", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NEIGH_PROXY_QLEN, - .procname = "proxy_qlen", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NEIGH_ANYCAST_DELAY, - .procname = "anycast_delay", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, - }, - { - .ctl_name = NET_NEIGH_PROXY_DELAY, - .procname = "proxy_delay", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, - }, - { - .ctl_name = NET_NEIGH_LOCKTIME, - .procname = "locktime", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, - }, - { - .ctl_name = NET_NEIGH_GC_INTERVAL, + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), + NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), + [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = NET_NEIGH_GC_THRESH1, + [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .extra1 = &zero, + .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, }, - { - .ctl_name = NET_NEIGH_GC_THRESH2, + [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .extra1 = &zero, + .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, }, - { - .ctl_name = NET_NEIGH_GC_THRESH3, + [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NEIGH_RETRANS_TIME_MS, - .procname = "retrans_time_ms", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_ms_jiffies, - .strategy = &sysctl_ms_jiffies, - }, - { - .ctl_name = NET_NEIGH_REACHABLE_TIME_MS, - .procname = "base_reachable_time_ms", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_ms_jiffies, - .strategy = &sysctl_ms_jiffies, - }, - }, - .neigh_dev = { - { - .ctl_name = NET_PROTO_CONF_DEFAULT, - .procname = "default", - .mode = 0555, - }, - }, - .neigh_neigh_dir = { - { - .procname = "neigh", - .mode = 0555, - }, - }, - .neigh_proto_dir = { - { - .mode = 0555, - }, - }, - .neigh_root_dir = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, + .extra1 = &zero, + .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, }, + {}, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, - int p_id, int pdev_id, char *p_name, - proc_handler *handler, ctl_handler *strategy) + proc_handler *handler) { - struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); - const char *dev_name_source = NULL; - char *dev_name = NULL; - int err = 0; + int i; + struct neigh_sysctl_table *t; + const char *dev_name_source; + char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; + char *p_name; + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); if (!t) - return -ENOBUFS; - memcpy(t, &neigh_sysctl_template, sizeof(*t)); - t->neigh_vars[0].data = &p->mcast_probes; - t->neigh_vars[1].data = &p->ucast_probes; - t->neigh_vars[2].data = &p->app_probes; - t->neigh_vars[3].data = &p->retrans_time; - t->neigh_vars[4].data = &p->base_reachable_time; - t->neigh_vars[5].data = &p->delay_probe_time; - t->neigh_vars[6].data = &p->gc_staletime; - t->neigh_vars[7].data = &p->queue_len; - t->neigh_vars[8].data = &p->proxy_qlen; - t->neigh_vars[9].data = &p->anycast_delay; - t->neigh_vars[10].data = &p->proxy_delay; - t->neigh_vars[11].data = &p->locktime; + goto err; + + for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { + t->neigh_vars[i].data += (long) p; + t->neigh_vars[i].extra1 = dev; + t->neigh_vars[i].extra2 = p; + } if (dev) { dev_name_source = dev->name; - t->neigh_dev[0].ctl_name = dev->ifindex; - t->neigh_vars[12].procname = NULL; - t->neigh_vars[13].procname = NULL; - t->neigh_vars[14].procname = NULL; - t->neigh_vars[15].procname = NULL; + /* Terminate the table early */ + memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, + sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { - dev_name_source = t->neigh_dev[0].procname; - t->neigh_vars[12].data = (int *)(p + 1); - t->neigh_vars[13].data = (int *)(p + 1) + 1; - t->neigh_vars[14].data = (int *)(p + 1) + 2; - t->neigh_vars[15].data = (int *)(p + 1) + 3; + struct neigh_table *tbl = p->tbl; + dev_name_source = "default"; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } - t->neigh_vars[16].data = &p->retrans_time; - t->neigh_vars[17].data = &p->base_reachable_time; - - if (handler || strategy) { + if (handler) { /* RetransTime */ - t->neigh_vars[3].proc_handler = handler; - t->neigh_vars[3].strategy = strategy; - t->neigh_vars[3].extra1 = dev; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ - t->neigh_vars[4].proc_handler = handler; - t->neigh_vars[4].strategy = strategy; - t->neigh_vars[4].extra1 = dev; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ - t->neigh_vars[16].proc_handler = handler; - t->neigh_vars[16].strategy = strategy; - t->neigh_vars[16].extra1 = dev; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ - t->neigh_vars[17].proc_handler = handler; - t->neigh_vars[17].strategy = strategy; - t->neigh_vars[17].extra1 = dev; - } - - dev_name = kstrdup(dev_name_source, GFP_KERNEL); - if (!dev_name) { - err = -ENOBUFS; - goto free; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } - t->neigh_dev[0].procname = dev_name; - - t->neigh_neigh_dir[0].ctl_name = pdev_id; + /* Don't export sysctls to unprivileged users */ + if (neigh_parms_net(p)->user_ns != &init_user_ns) + t->neigh_vars[0].procname = NULL; - t->neigh_proto_dir[0].procname = p_name; - t->neigh_proto_dir[0].ctl_name = p_id; + switch (neigh_parms_family(p)) { + case AF_INET: + p_name = "ipv4"; + break; + case AF_INET6: + p_name = "ipv6"; + break; + default: + BUG(); + } - t->neigh_dev[0].child = t->neigh_vars; - t->neigh_neigh_dir[0].child = t->neigh_dev; - t->neigh_proto_dir[0].child = t->neigh_neigh_dir; - t->neigh_root_dir[0].child = t->neigh_proto_dir; + snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", + p_name, dev_name_source); + t->sysctl_header = + register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars); + if (!t->sysctl_header) + goto free; - t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); - if (!t->sysctl_header) { - err = -ENOBUFS; - goto free_procname; - } p->sysctl_table = t; return 0; - /* error path */ - free_procname: - kfree(dev_name); - free: +free: kfree(t); - - return err; +err: + return -ENOBUFS; } +EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; - unregister_sysctl_table(t->sysctl_header); - kfree(t->neigh_dev[0].procname); + unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } +EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ -EXPORT_SYMBOL(__neigh_event_send); -EXPORT_SYMBOL(neigh_add); -EXPORT_SYMBOL(neigh_changeaddr); -EXPORT_SYMBOL(neigh_compat_output); -EXPORT_SYMBOL(neigh_connected_output); -EXPORT_SYMBOL(neigh_create); -EXPORT_SYMBOL(neigh_delete); -EXPORT_SYMBOL(neigh_destroy); -EXPORT_SYMBOL(neigh_dump_info); -EXPORT_SYMBOL(neigh_event_ns); -EXPORT_SYMBOL(neigh_ifdown); -EXPORT_SYMBOL(neigh_lookup); -EXPORT_SYMBOL(neigh_lookup_nodev); -EXPORT_SYMBOL(neigh_parms_alloc); -EXPORT_SYMBOL(neigh_parms_release); -EXPORT_SYMBOL(neigh_rand_reach_time); -EXPORT_SYMBOL(neigh_resolve_output); -EXPORT_SYMBOL(neigh_table_clear); -EXPORT_SYMBOL(neigh_table_init); -EXPORT_SYMBOL(neigh_update); -EXPORT_SYMBOL(neigh_update_hhs); -EXPORT_SYMBOL(pneigh_enqueue); -EXPORT_SYMBOL(pneigh_lookup); -EXPORT_SYMBOL(neightbl_dump_info); -EXPORT_SYMBOL(neightbl_set); +static int __init neigh_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, + NULL); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL); + + return 0; +} + +subsys_initcall(neigh_init); -#ifdef CONFIG_ARPD -EXPORT_SYMBOL(neigh_app_ns); -#endif -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(neigh_sysctl_register); -EXPORT_SYMBOL(neigh_sysctl_unregister); -#endif |
