diff options
Diffstat (limited to 'net/core')
33 files changed, 8427 insertions, 4176 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 796f46eece5..8a04dd22cf7 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -6,9 +6,8 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-$(CONFIG_HAS_DMA) += skb_dma_map.o -obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ +obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o obj-$(CONFIG_XFRM) += flow.o @@ -19,4 +18,4 @@ obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o - +obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 58abee1f1df..18ac112ea7a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -48,6 +48,7 @@ #include <linux/poll.h> #include <linux/highmem.h> #include <linux/spinlock.h> +#include <linux/slab.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -55,6 +56,7 @@ #include <net/checksum.h> #include <net/sock.h> #include <net/tcp_states.h> +#include <trace/events/skb.h> /* * Is a socket 'connection oriented' ? @@ -84,7 +86,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); - prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); @@ -113,7 +115,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) error = 0; *timeo_p = schedule_timeout(*timeo_p); out: - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return error; interrupted: error = sock_intr_errno(*timeo_p); @@ -175,7 +177,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, * interrupt level will suddenly eat the receive_queue. * * Look at current nfs client by the way... - * However, this function was corrent in any case. 8) + * However, this function was correct in any case. 8) */ unsigned long cpu_flags; @@ -217,12 +219,34 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), &peeked, err); } +EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); sk_mem_reclaim_partial(sk); } +EXPORT_SYMBOL(skb_free_datagram); + +void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +{ + bool slow; + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + + slow = lock_sock_fast(sk); + skb_orphan(skb); + sk_mem_reclaim_partial(sk); + unlock_sock_fast(sk, slow); + + /* skb is now orphaned, can be freed outside of locked section */ + trace_kfree_skb(skb, skb_free_datagram_locked); + __kfree_skb(skb); +} +EXPORT_SYMBOL(skb_free_datagram_locked); /** * skb_kill_datagram - Free a datagram skbuff forcibly @@ -261,11 +285,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } kfree_skb(skb); + atomic_inc(&sk->sk_drops); sk_mem_reclaim_partial(sk); return err; } - EXPORT_SYMBOL(skb_kill_datagram); /** @@ -284,6 +308,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, int i, copy = start - offset; struct sk_buff *frag_iter; + trace_skb_copy_datagram_iovec(skb, len); + /* Copy header. */ if (copy > 0) { if (copy > len) @@ -348,6 +374,7 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, fault: return -EFAULT; } +EXPORT_SYMBOL(skb_copy_datagram_iovec); /** * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. @@ -691,6 +718,7 @@ csum_error: fault: return -EFAULT; } +EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); /** * datagram_poll - generic datagram poll @@ -712,20 +740,19 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -745,9 +772,4 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, return mask; } - EXPORT_SYMBOL(datagram_poll); -EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); -EXPORT_SYMBOL(skb_copy_datagram_iovec); -EXPORT_SYMBOL(skb_free_datagram); -EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index baf2dc13a34..a3ef808b5e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,8 @@ #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/slab.h> #include <linux/sched.h> #include <linux/mutex.h> #include <linux/string.h> @@ -99,11 +101,10 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> -#include <linux/if_bridge.h> -#include <linux/if_macvlan.h> #include <net/dst.h> #include <net/pkt_sched.h> #include <net/checksum.h> +#include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/kmod.h> @@ -127,6 +128,10 @@ #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> +#include <trace/events/net.h> +#include <trace/events/skb.h> +#include <linux/pci.h> +#include <linux/inetdevice.h> #include "net-sysfs.h" @@ -175,7 +180,7 @@ static struct list_head ptype_all __read_mostly; /* Taps */ * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * - * Pure readers hold dev_base_lock for reading. + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the * dev_base_head list, and hold dev_base_lock for writing when they do the @@ -191,21 +196,31 @@ static struct list_head ptype_all __read_mostly; /* Taps */ * semaphore held. */ DEFINE_RWLOCK(dev_base_lock); - EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; +} + +static inline void rps_lock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_lock(&sd->input_pkt_queue.lock); +#endif +} + +static inline void rps_unlock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_unlock(&sd->input_pkt_queue.lock); +#endif } /* Device list insertion */ @@ -216,23 +231,26 @@ static int list_netdevice(struct net_device *dev) ASSERT_RTNL(); write_lock_bh(&dev_base_lock); - list_add_tail(&dev->dev_list, &net->dev_base_head); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); - hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); return 0; } -/* Device list removal */ +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ static void unlist_netdevice(struct net_device *dev) { ASSERT_RTNL(); /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); - list_del(&dev->dev_list); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); + list_del_rcu(&dev->dev_list); + hlist_del_rcu(&dev->name_hlist); + hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); } @@ -247,7 +265,8 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +EXPORT_PER_CPU_SYMBOL(softnet_data); #ifdef CONFIG_LOCKDEP /* @@ -269,10 +288,10 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, - ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_IEEE802154_PHY, + ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; -static const char *netdev_lock_name[] = +static const char *const netdev_lock_name[] = {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", @@ -287,7 +306,7 @@ static const char *netdev_lock_name[] = "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", - "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_IEEE802154_PHY", + "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -355,6 +374,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) * --ANK (980803) */ +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return &ptype_all; + else + return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + /** * dev_add_pack - add packet handler * @pt: packet type declaration @@ -370,17 +397,13 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) void dev_add_pack(struct packet_type *pt) { - int hash; + struct list_head *head = ptype_head(pt); - spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) - list_add_rcu(&pt->list, &ptype_all); - else { - hash = ntohs(pt->type) & PTYPE_HASH_MASK; - list_add_rcu(&pt->list, &ptype_base[hash]); - } - spin_unlock_bh(&ptype_lock); + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); } +EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler @@ -397,15 +420,10 @@ void dev_add_pack(struct packet_type *pt) */ void __dev_remove_pack(struct packet_type *pt) { - struct list_head *head; + struct list_head *head = ptype_head(pt); struct packet_type *pt1; - spin_lock_bh(&ptype_lock); - - if (pt->type == htons(ETH_P_ALL)) - head = &ptype_all; - else - head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { @@ -416,8 +434,10 @@ void __dev_remove_pack(struct packet_type *pt) printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); out: - spin_unlock_bh(&ptype_lock); + spin_unlock(&ptype_lock); } +EXPORT_SYMBOL(__dev_remove_pack); + /** * dev_remove_pack - remove packet handler * @pt: packet type declaration @@ -436,6 +456,7 @@ void dev_remove_pack(struct packet_type *pt) synchronize_net(); } +EXPORT_SYMBOL(dev_remove_pack); /****************************************************************************** @@ -499,6 +520,7 @@ int netdev_boot_setup_check(struct net_device *dev) } return 0; } +EXPORT_SYMBOL(netdev_boot_setup_check); /** @@ -582,15 +604,42 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each(p, dev_name_hash(net, name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); + hlist_for_each_entry(dev, p, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; - } + return NULL; } +EXPORT_SYMBOL(__dev_get_by_name); + +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); /** * dev_get_by_name - find a device by its name @@ -608,13 +657,14 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } +EXPORT_SYMBOL(dev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex @@ -631,15 +681,41 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each(p, dev_index_hash(net, ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); + hlist_for_each_entry(dev, p, head, index_hlist) if (dev->ifindex == ifindex) return dev; - } + + return NULL; +} +EXPORT_SYMBOL(__dev_get_by_index); + +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + return NULL; } +EXPORT_SYMBOL(dev_get_by_index_rcu); /** @@ -657,44 +733,41 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } +EXPORT_SYMBOL(dev_get_by_index); /** - * dev_getbyhwaddr - find a device by its hardware address + * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. The returned device has not had its ref count increased + * is not found or a pointer to the device. The caller must hold RCU + * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; - ASSERT_RTNL(); - - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } - -EXPORT_SYMBOL(dev_getbyhwaddr); +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { @@ -707,51 +780,50 @@ struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) return NULL; } - EXPORT_SYMBOL(__dev_getfirstbyhwtype); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { - struct net_device *dev; + struct net_device *dev, *ret = NULL; - rtnl_lock(); - dev = __dev_getfirstbyhwtype(net, type); - if (dev) - dev_hold(dev); - rtnl_unlock(); - return dev; + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + if (dev->type == type) { + dev_hold(dev); + ret = dev; + break; + } + rcu_read_unlock(); + return ret; } - EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags - find any device with given flags + * dev_get_by_flags_rcu - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * is not found or a pointer to the device. Must be called inside + * rcu_read_lock(), and result refcount is unchanged. */ -struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) +struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; ret = NULL; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + for_each_netdev_rcu(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { - dev_hold(dev); ret = dev; break; } } - read_unlock(&dev_base_lock); return ret; } +EXPORT_SYMBOL(dev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device @@ -777,6 +849,7 @@ int dev_valid_name(const char *name) } return 1; } +EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device @@ -832,7 +905,8 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) free_page((unsigned long) inuse); } - snprintf(buf, IFNAMSIZ, name, i); + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; @@ -870,7 +944,27 @@ int dev_alloc_name(struct net_device *dev, const char *name) strlcpy(dev->name, buf, IFNAMSIZ); return ret; } +EXPORT_SYMBOL(dev_alloc_name); +static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) +{ + struct net *net; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (fmt && strchr(name, '%')) + return dev_alloc_name(dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; +} /** * dev_change_name - change name of a device @@ -894,53 +988,45 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - if (!dev_valid_name(newname)) - return -EINVAL; - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) return 0; memcpy(oldname, dev->name, IFNAMSIZ); - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - } - else if (__dev_get_by_name(net, newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); + err = dev_get_valid_name(dev, newname, 1); + if (err < 0) + return err; rollback: - /* For now only devices in the initial network namespace - * are in sysfs. - */ - if (net == &init_net) { - ret = device_rename(&dev->dev, dev->name); - if (ret) { - memcpy(dev->name, oldname, IFNAMSIZ); - return ret; - } + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; } write_lock_bh(&dev_base_lock); hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { - if (err) { - printk(KERN_ERR - "%s: name change rollback failed: %d.\n", - dev->name, ret); - } else { + /* err >= 0 after dev_alloc_name() or stores the first errno */ + if (err >= 0) { err = ret; memcpy(dev->name, oldname, IFNAMSIZ); goto rollback; + } else { + printk(KERN_ERR + "%s: name change rollback failed: %d.\n", + dev->name, ret); } } @@ -970,7 +1056,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return 0; } - dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL); + dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); if (!dev->ifalias) return -ENOMEM; @@ -1006,10 +1092,11 @@ void netdev_state_change(struct net_device *dev) rtmsg_ifinfo(RTM_NEWLINK, dev, 0); } } +EXPORT_SYMBOL(netdev_state_change); -void netdev_bonding_change(struct net_device *dev) +int netdev_bonding_change(struct net_device *dev, unsigned long event) { - call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); + return call_netdevice_notifiers(event, dev); } EXPORT_SYMBOL(netdev_bonding_change); @@ -1027,27 +1114,16 @@ void dev_load(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); - read_unlock(&dev_base_lock); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); - if (!dev && capable(CAP_SYS_MODULE)) + if (!dev && capable(CAP_NET_ADMIN)) request_module("%s", name); } +EXPORT_SYMBOL(dev_load); -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1055,13 +1131,6 @@ int dev_open(struct net_device *dev) ASSERT_RTNL(); /* - * Is it already up? - */ - - if (dev->flags & IFF_UP) - return 0; - - /* * Is it even present? */ if (!netif_device_present(dev)) @@ -1109,81 +1178,156 @@ int dev_open(struct net_device *dev) * Wakeup transmit queue engine */ dev_activate(dev); - - /* - * ... and announce new interface. - */ - call_netdevice_notifiers(NETDEV_UP, dev); } return ret; } /** - * dev_close - shutdown an interface. - * @dev: device to shutdown + * dev_open - prepare an interface for use. + * @dev: device to open * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. */ -int dev_close(struct net_device *dev) +int dev_open(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; - ASSERT_RTNL(); - - might_sleep(); + int ret; - if (!(dev->flags & IFF_UP)) + /* + * Is it already up? + */ + if (dev->flags & IFF_UP) return 0; /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. + * Open device */ - call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); - - clear_bit(__LINK_STATE_START, &dev->state); + ret = __dev_open(dev); + if (ret < 0) + return ret; - /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(). - * - * dev->stop() will invoke napi_disable() on all of it's - * napi_struct instances on this device. + /* + * ... and announce new interface. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_UP, dev); - dev_deactivate(dev); + return ret; +} +EXPORT_SYMBOL(dev_open); - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - * - * We allow it to be called even after a DETACH hot-plug - * event. - */ - if (ops->ndo_stop) - ops->ndo_stop(dev); +static int __dev_close_many(struct list_head *head) +{ + struct net_device *dev; - /* - * Device is now down. - */ + ASSERT_RTNL(); + might_sleep(); - dev->flags &= ~IFF_UP; + list_for_each_entry(dev, head, unreg_list) { + /* + * Tell people we are going down, so that they can + * prepare to death, when device is still operating. + */ + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + } + + dev_deactivate_many(head); + + list_for_each_entry(dev, head, unreg_list) { + const struct net_device_ops *ops = dev->netdev_ops; + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); + + /* + * Device is now down. + */ + + dev->flags &= ~IFF_UP; + + /* + * Shutdown NET_DMA + */ + net_dmaengine_put(); + } + + return 0; +} + +static int __dev_close(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + return __dev_close_many(&single); +} + +int dev_close_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) + if (!(dev->flags & IFF_UP)) + list_move(&dev->unreg_list, &tmp_list); + + __dev_close_many(head); /* * Tell people we are down */ - call_netdevice_notifiers(NETDEV_DOWN, dev); + list_for_each_entry(dev, head, unreg_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); + } - /* - * Shutdown NET_DMA - */ - net_dmaengine_put(); + /* rollback_registered_many needs the complete original list */ + list_splice(&tmp_list, head); + return 0; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + dev_close_many(&single); return 0; } +EXPORT_SYMBOL(dev_close); /** @@ -1273,12 +1417,14 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } +EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block @@ -1299,6 +1445,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) rtnl_unlock(); return err; } +EXPORT_SYMBOL(unregister_netdevice_notifier); /** * call_netdevice_notifiers - call all network notifier blocks @@ -1311,6 +1458,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } @@ -1321,13 +1469,15 @@ void net_enable_timestamp(void) { atomic_inc(&netstamp_needed); } +EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { atomic_dec(&netstamp_needed); } +EXPORT_SYMBOL(net_disable_timestamp); -static inline void net_timestamp(struct sk_buff *skb) +static inline void net_timestamp_set(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) __net_timestamp(skb); @@ -1335,6 +1485,57 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +static inline void net_timestamp_check(struct sk_buff *skb) +{ + if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) + __net_timestamp(skb); +} + +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped, but freed) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + skb_orphan(skb); + nf_reset(skb); + + if (unlikely(!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + skb_set_dev(skb, dev); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, dev); + return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1343,13 +1544,8 @@ static inline void net_timestamp(struct sk_buff *skb) static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - -#ifdef CONFIG_NET_CLS_ACT - if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) - net_timestamp(skb); -#else - net_timestamp(skb); -#endif + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1359,10 +1555,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) if ((ptype->dev == dev || !ptype->dev) && (ptype->af_packet_priv == NULL || (struct sock *)ptype->af_packet_priv != skb->sk)) { - struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) break; + net_timestamp_set(skb2); + /* skb->nh should be correctly set by sender, so that the second statement is just protection against buggy protocols. @@ -1374,18 +1578,81 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) if (net_ratelimit()) printk(KERN_CRIT "protocol %04x is " "buggy, dev %s\n", - skb2->protocol, dev->name); + ntohs(skb2->protocol), + dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype, skb->dev); + pt_prev = ptype; } } + if (pt_prev) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + */ +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ + int rc; + + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (txq < dev->real_num_tx_queues) + qdisc_reset_all_tx_gt(dev, txq); + } + + dev->real_num_tx_queues = txq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); + +#ifdef CONFIG_RPS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it always + * succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif static inline void __netif_reschedule(struct Qdisc *q) { @@ -1394,8 +1661,9 @@ static inline void __netif_reschedule(struct Qdisc *q) local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - q->next_sched = sd->output_queue; - sd->output_queue = q; + q->next_sched = NULL; + *sd->output_queue_tailp = q; + sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -1464,31 +1732,35 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); -static bool can_checksum_protocol(unsigned long features, __be16 protocol) -{ - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_IP_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_IPV6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); -} - -static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) -{ - if (can_checksum_protocol(dev->features, skb->protocol)) - return true; - - if (skb->protocol == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - if (can_checksum_protocol(dev->features & dev->vlan_features, - veh->h_vlan_encapsulated_proto)) - return true; +/** + * skb_dev_set -- assign a new device to a buffer + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb_dst_drop(skb); + if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { + secpath_reset(skb); + nf_reset(skb); + skb_init_secmark(skb); + skb->mark = 0; + skb->priority = 0; + skb->nf_trace = 0; + skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#endif } - - return false; + skb->dev = dev; } +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */ /* * Invalidate hardware checksum when packet is to be mangled, and @@ -1507,7 +1779,7 @@ int skb_checksum_help(struct sk_buff *skb) goto out_set_summed; } - offset = skb->csum_start - skb_headroom(skb); + offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -1527,6 +1799,7 @@ out_set_summed: out: return ret; } +EXPORT_SYMBOL(skb_checksum_help); /** * skb_gso_segment - Perform segmentation on skb. @@ -1543,8 +1816,20 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; __be16 type = skb->protocol; + int vlan_depth = ETH_HLEN; int err; + while (type == htons(ETH_P_8021Q)) { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) + return ERR_PTR(-EINVAL); + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } + skb_reset_mac_header(skb); skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); @@ -1556,8 +1841,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) dev->ethtool_ops->get_drvinfo(dev, &info); - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " - "ip_summed=%d", + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", info.driver, dev ? dev->features : 0L, skb->sk ? skb->sk->sk_route_caps : 0L, skb->len, skb->data_len, skb->ip_summed); @@ -1589,7 +1873,6 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) return segs; } - EXPORT_SYMBOL(skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ @@ -1610,18 +1893,27 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + return 1; + } - if (dev->features & NETIF_F_HIGHDMA) - return 0; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) - return 1; + if (PCI_DMA_BUS_IS_PHYS) { + struct device *pdev = dev->dev.parent; + if (!pdev) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + return 1; + } + } #endif return 0; } @@ -1652,16 +1944,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) /** * dev_gso_segment - Perform emulated hardware segmentation on skb. * @skb: buffer to segment + * @features: device features as applicable to this skb * * This function segments the given skb and stores the list of segments * in skb->next. */ -static int dev_gso_segment(struct sk_buff *skb) +static int dev_gso_segment(struct sk_buff *skb, int features) { - struct net_device *dev = skb->dev; struct sk_buff *segs; - int features = dev->features & ~(illegal_highdma(dev, skb) ? - NETIF_F_SG : 0); segs = skb_gso_segment(skb, features); @@ -1679,22 +1969,97 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +/* + * Try to orphan skb early, right before transmission by the device. + * We cannot orphan skb if tx timestamp is requested or the sk-reference + * is needed on driver level for other reasons, e.g. see net/can/raw.c + */ +static inline void skb_orphan_try(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (sk && !skb_shinfo(skb)->tx_flags) { + /* skb_tx_hash() wont be able to get sk. + * We copy sk_hash into skb->rxhash + */ + if (!skb->rxhash) + skb->rxhash = sk->sk_hash; + skb_orphan(skb); + } +} + +static bool can_checksum_protocol(unsigned long features, __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_V4_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_V6_CSUM) && + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); +} + +static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features) +{ + if (!can_checksum_protocol(protocol, features)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } + + return features; +} + +int netif_skb_features(struct sk_buff *skb) +{ + __be16 protocol = skb->protocol; + int features = skb->dev->features; + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, protocol, features); + } + + features &= skb->dev->vlan_features; + + if (protocol != htons(ETH_P_8021Q)) { + return harmonize_features(skb, protocol, features); + } else { + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM; + return harmonize_features(skb, protocol, features); + } +} +EXPORT_SYMBOL(netif_skb_features); + +/* + * Returns true if either: + * 1. skb has frag_list and the device doesn't support FRAGLIST, or + * 2. skb is fragmented and the device does not support SG, or if + * at least one of fragments is in highmem and device does not + * support DMA from it. + */ +static inline int skb_needs_linearize(struct sk_buff *skb, + int features) +{ + return skb_is_nonlinear(skb) && + ((skb_has_frag_list(skb) && + !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && + !(features & NETIF_F_SG))); +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; - int rc; + int rc = NETDEV_TX_OK; if (likely(!skb->next)) { - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - - if (netif_needs_gso(dev, skb)) { - if (unlikely(dev_gso_segment(skb))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } + int features; /* * If device doesnt need skb->dst, release it right now while @@ -1703,23 +2068,49 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + skb_orphan_try(skb); + + features = netif_skb_features(skb); + + if (vlan_tx_tag_present(skb) && + !(features & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + goto out; + + skb->vlan_tci = 0; + } + + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; + } + } + rc = ops->ndo_start_xmit(skb, dev); - if (rc == 0) + trace_net_dev_xmit(skb, rc); + if (rc == NETDEV_TX_OK) txq_trans_update(txq); - /* - * TODO: if skb_orphan() was called by - * dev->hard_start_xmit() (for example, the unmodified - * igb driver does that; bnx2 doesn't), then - * skb_tx_software_timestamp() will be unable to send - * back the time stamp. - * - * How can this be prevented? Always create another - * reference to the socket before calling - * dev->hard_start_xmit()? Prevent that skb_orphan() - * does anything in dev->hard_start_xmit() by clearing - * the skb destructor before the call and restoring it - * afterwards, then doing the skb_orphan() ourselves? - */ return rc; } @@ -1729,8 +2120,19 @@ gso: skb->next = nskb->next; nskb->next = NULL; + + /* + * If device doesnt need nskb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(nskb); + rc = ops->ndo_start_xmit(nskb, dev); - if (unlikely(rc)) { + trace_net_dev_xmit(nskb, rc); + if (unlikely(rc != NETDEV_TX_OK)) { + if (rc & ~NETDEV_TX_MASK) + goto out_kfree_gso_skb; nskb->next = skb->next; skb->next = nskb; return rc; @@ -1740,52 +2142,195 @@ gso: return NETDEV_TX_BUSY; } while (skb->next); - skb->destructor = DEV_GSO_CB(skb)->destructor; - +out_kfree_gso_skb: + if (likely(skb->next == NULL)) + skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); - return 0; +out: + return rc; } -static u32 skb_tx_hashrnd; +static u32 hashrnd __read_mostly; -u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) { u32 hash; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - while (unlikely (hash >= dev->real_num_tx_queues)) - hash -= dev->real_num_tx_queues; + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; return hash; } if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else - hash = skb->protocol; + hash = (__force u16) skb->protocol ^ skb->rxhash; + hash = jhash_1word(hash, hashrnd); + + return (u16) (((u64) hash * num_tx_queues) >> 32); +} +EXPORT_SYMBOL(__skb_tx_hash); + +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + if (net_ratelimit()) { + pr_warning("%s selects TX queue %d, but " + "real number of TX queues is %d\n", + dev->name, queue_index, dev->real_num_tx_queues); + } + return 0; + } + return queue_index; +} - hash = jhash_1word(hash, skb_tx_hashrnd); +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); - return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); + return queue_index; +#else + return -1; +#endif } -EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { + int queue_index; const struct net_device_ops *ops = dev->netdev_ops; - u16 queue_index = 0; - if (ops->ndo_select_queue) + if (dev->real_num_tx_queues == 1) + queue_index = 0; + else if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb); - else if (dev->real_num_tx_queues > 1) - queue_index = skb_tx_hash(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } else { + struct sock *sk = skb->sk; + queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int old_index = queue_index; + + queue_index = get_xps_queue(dev, skb); + if (queue_index < 0) + queue_index = skb_tx_hash(dev, skb); + + if (queue_index != old_index && sk) { + struct dst_entry *dst = + rcu_dereference_check(sk->sk_dst_cache, 1); + + if (dst && skb_dst(skb) == dst) + sk_tx_queue_set(sk, queue_index); + } + } + } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } +static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, + struct netdev_queue *txq) +{ + spinlock_t *root_lock = qdisc_lock(q); + bool contended = qdisc_is_running(q); + int rc; + + /* + * Heuristic to force contended enqueues to serialize on a + * separate lock before trying to get qdisc main lock. + * This permits __QDISC_STATE_RUNNING owner to get the lock more often + * and dequeue packets faster. + */ + if (unlikely(contended)) + spin_lock(&q->busylock); + + spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + kfree_skb(skb); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + qdisc_run_begin(q)) { + /* + * This is a work-conserving queue; there are no old skbs + * waiting to be sent out; and the qdisc is not running - + * xmit the skb directly. + */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); + + qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_bstats_update(q, skb); + + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } else + qdisc_run_end(q); + + rc = NET_XMIT_SUCCESS; + } else { + skb_dst_force(skb); + rc = qdisc_enqueue_root(skb, q); + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } + } + spin_unlock(root_lock); + if (unlikely(contended)) + spin_unlock(&q->busylock); + return rc; +} + +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 10 + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -1818,60 +2363,20 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; - /* GSO will handle the following emulations directly. */ - if (netif_needs_gso(dev, skb)) - goto gso; - - if (skb_has_frags(skb) && - !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb)) - goto out_kfree_skb; - - /* Fragmented skb is linearized if device does not support SG, - * or if at least one of fragments is in highmem and device - * does not support DMA from it. - */ - if (skb_shinfo(skb)->nr_frags && - (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb)) - goto out_kfree_skb; - - /* If packet is not checksummed and device does not support - * checksumming for this protocol, complete checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, skb->csum_start - - skb_headroom(skb)); - if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) - goto out_kfree_skb; - } - -gso: /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); txq = dev_pick_tx(dev, skb); - q = rcu_dereference(txq->qdisc); + q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); #endif + trace_net_dev_queue(skb); if (q->enqueue) { - spinlock_t *root_lock = qdisc_lock(q); - - spin_lock(root_lock); - - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); - rc = NET_XMIT_DROP; - } else { - rc = qdisc_enqueue_root(skb, q); - qdisc_run(q); - } - spin_unlock(root_lock); - + rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } @@ -1892,11 +2397,16 @@ gso: if (txq->xmit_lock_owner != cpu) { + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { - rc = 0; - if (!dev_hard_start_xmit(skb, dev, txq)) { + __this_cpu_inc(xmit_recursion); + rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); + if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } @@ -1907,7 +2417,9 @@ gso: "queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, - * unfortunately */ + * unfortunately + */ +recursion_alert: if (net_ratelimit()) printk(KERN_CRIT "Dead loop on virtual device " "%s, fix it urgently!\n", dev->name); @@ -1917,13 +2429,13 @@ gso: rc = -ENETDOWN; rcu_read_unlock_bh(); -out_kfree_skb: kfree_skb(skb); return rc; out: rcu_read_unlock_bh(); return rc; } +EXPORT_SYMBOL(dev_queue_xmit); /*======================================================================= @@ -1931,11 +2443,267 @@ out: =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ -DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + +/* + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Returns a non-zero hash number on success + * and 0 on failure. + */ +__u32 __skb_get_rxhash(struct sk_buff *skb) +{ + int nhoff, hash = 0, poff; + struct ipv6hdr *ip6; + struct iphdr *ip; + u8 ip_proto; + u32 addr1, addr2, ihl; + union { + u32 v32; + u16 v16[2]; + } ports; + + nhoff = skb_network_offset(skb); + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) + goto done; + + ip = (struct iphdr *) (skb->data + nhoff); + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + else + ip_proto = ip->protocol; + addr1 = (__force u32) ip->saddr; + addr2 = (__force u32) ip->daddr; + ihl = ip->ihl; + break; + case __constant_htons(ETH_P_IPV6): + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) + goto done; + + ip6 = (struct ipv6hdr *) (skb->data + nhoff); + ip_proto = ip6->nexthdr; + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; + ihl = (40 >> 2); + break; + default: + goto done; + } + + ports.v32 = 0; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += ihl * 4 + poff; + if (pskb_may_pull(skb, nhoff + 4)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff); + if (ports.v16[1] < ports.v16[0]) + swap(ports.v16[0], ports.v16[1]); + } + } + + /* get a consistent hash (same value on both flow directions) */ + if (addr2 < addr1) + swap(addr1, addr2); + + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; + +done: + return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1) { + tcpu = map->cpus[0]; + if (cpu_online(tcpu)) + cpu = tcpu; + goto done; + } + } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { + goto done; + } + + skb_reset_network_header(skb); + if (!skb_get_rxhash(skb)) + goto done; + + flow_table = rcu_dereference(rxqueue->rps_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + u16 next_cpu; + struct rps_dev_flow *rflow; + + rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + tcpu = rflow->cpu; + + next_cpu = sock_flow_table->ents[skb->rxhash & + sock_flow_table->mask]; + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. + */ + if (unlikely(tcpu != next_cpu) && + (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) { + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) + rflow->last_qtail = per_cpu(softnet_data, + tcpu).input_queue_head; + } + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + + if (map) { + tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + + if (cpu_online(tcpu)) { + cpu = tcpu; + goto done; + } + } + +done: + return cpu; +} + +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) +{ + struct softnet_data *sd = data; + + ____napi_schedule(sd, &sd->backlog); + sd->received_rps++; +} + +#endif /* CONFIG_RPS */ + +/* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int rps_ipi_queued(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *mysd = &__get_cpu_var(softnet_data); + + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + return 0; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) +{ + struct softnet_data *sd; + unsigned long flags; + + sd = &per_cpu(softnet_data, cpu); + + local_irq_save(flags); + + rps_lock(sd); + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + if (skb_queue_len(&sd->input_pkt_queue)) { +enqueue: + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + rps_unlock(sd); + local_irq_restore(flags); + return NET_RX_SUCCESS; + } + + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { + if (!rps_ipi_queued(sd)) + ____napi_schedule(sd, &sd->backlog); + } + goto enqueue; + } + + sd->dropped++; + rps_unlock(sd); + + local_irq_restore(flags); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; +} /** * netif_rx - post buffer to the network code @@ -1954,42 +2722,43 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; int netif_rx(struct sk_buff *skb) { - struct softnet_data *queue; - unsigned long flags; + int ret; /* if netpoll wants it, pretend we never saw it */ if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); - /* - * The code is rearranged so that the path is the most - * short when CPU is congested, but is still operating. - */ - local_irq_save(flags); - queue = &__get_cpu_var(softnet_data); + trace_netif_rx(skb); +#ifdef CONFIG_RPS + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; - __get_cpu_var(netdev_rx_stat).total++; - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { -enqueue: - __skb_queue_tail(&queue->input_pkt_queue, skb); - local_irq_restore(flags); - return NET_RX_SUCCESS; - } + preempt_disable(); + rcu_read_lock(); - napi_schedule(&queue->backlog); - goto enqueue; - } + cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu < 0) + cpu = smp_processor_id(); - __get_cpu_var(netdev_rx_stat).dropped++; - local_irq_restore(flags); + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); - kfree_skb(skb); - return NET_RX_DROP; + rcu_read_unlock(); + preempt_enable(); + } +#else + { + unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); + put_cpu(); + } +#endif + return ret; } +EXPORT_SYMBOL(netif_rx); int netif_rx_ni(struct sk_buff *skb) { @@ -2003,7 +2772,6 @@ int netif_rx_ni(struct sk_buff *skb) return err; } - EXPORT_SYMBOL(netif_rx_ni); static void net_tx_action(struct softirq_action *h) @@ -2023,6 +2791,7 @@ static void net_tx_action(struct softirq_action *h) clist = clist->next; WARN_ON(atomic_read(&skb->users)); + trace_kfree_skb(skb, net_tx_action); __kfree_skb(skb); } } @@ -2033,6 +2802,7 @@ static void net_tx_action(struct softirq_action *h) local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); while (head) { @@ -2062,72 +2832,12 @@ static void net_tx_action(struct softirq_action *h) } } -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) -{ - atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} - -#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) - -#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ + (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; -EXPORT_SYMBOL(br_fdb_test_addr_hook); -#endif - -/* - * If bridge module is loaded call bridging hook. - * returns NULL if packet was consumed. - */ -struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, - struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL(br_handle_frame_hook); - -static inline struct sk_buff *handle_bridge(struct sk_buff *skb, - struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) -{ - struct net_bridge_port *port; - - if (skb->pkt_type == PACKET_LOOPBACK || - (port = rcu_dereference(skb->dev->br_port)) == NULL) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - return br_handle_frame_hook(port, skb); -} -#else -#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) -#endif - -#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) -struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); - -static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, - struct net_device *orig_dev) -{ - if (skb->dev->macvlan_port == NULL) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - return macvlan_handle_frame_hook(skb); -} -#else -#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) +EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif #ifdef CONFIG_NET_CLS_ACT @@ -2139,26 +2849,23 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, * the ingress scheduler, you just cant add policies on ingress. * */ -static int ing_filter(struct sk_buff *skb) +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) { struct net_device *dev = skb->dev; u32 ttl = G_TC_RTTL(skb->tc_verd); - struct netdev_queue *rxq; int result = TC_ACT_OK; struct Qdisc *q; - if (MAX_RED_LOOP < ttl++) { - printk(KERN_WARNING - "Redir loop detected Dropping packet (%d->%d)\n", - skb->iif, dev->ifindex); + if (unlikely(MAX_RED_LOOP < ttl++)) { + if (net_ratelimit()) + pr_warning( "Redir loop detected Dropping packet (%d->%d)\n", + skb->skb_iif, dev->ifindex); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->rx_queue; - q = rxq->qdisc; if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); @@ -2174,18 +2881,17 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->rx_queue.qdisc == &noop_qdisc) + struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + + if (!rxq || rxq->qdisc == &noop_qdisc) goto out; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; - } else { - /* Huh? Why does turning on AF_PACKET affect this? */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); } - switch (ing_filter(skb)) { + switch (ing_filter(skb, rxq)) { case TC_ACT_SHOT: case TC_ACT_STOLEN: kfree_skb(skb); @@ -2198,80 +2904,145 @@ out: } #endif -/* - * netif_nit_deliver - deliver received packets to network taps - * @skb: buffer +/** + * netdev_rx_handler_register - register receive handler + * @dev: device to register a handler for + * @rx_handler: receive handler to register + * @rx_handler_data: data pointer that is used by rx handler * - * This function is used to deliver incoming packets to network - * taps. It should be used when the normal netif_receive_skb path - * is bypassed, for example because of VLAN acceleration. + * Register a receive hander for a device. This handler will then be + * called from __netif_receive_skb. A negative errno code is returned + * on a failure. + * + * The caller must hold the rtnl_mutex. */ -void netif_nit_deliver(struct sk_buff *skb) +int netdev_rx_handler_register(struct net_device *dev, + rx_handler_func_t *rx_handler, + void *rx_handler_data) { - struct packet_type *ptype; + ASSERT_RTNL(); - if (list_empty(&ptype_all)) - return; + if (dev->rx_handler) + return -EBUSY; - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; + rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); + rcu_assign_pointer(dev->rx_handler, rx_handler); - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) - deliver_skb(skb, ptype, skb->dev); - } - rcu_read_unlock(); + return 0; } +EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** - * netif_receive_skb - process receive buffer from network - * @skb: buffer to process - * - * netif_receive_skb() is the main receive data processing function. - * It always succeeds. The buffer may be dropped during processing - * for congestion control or by the protocol layers. + * netdev_rx_handler_unregister - unregister receive handler + * @dev: device to unregister a handler from * - * This function may only be called from softirq context and interrupts - * should be enabled. + * Unregister a receive hander from a device. * - * Return values (usually ignored): - * NET_RX_SUCCESS: no congestion - * NET_RX_DROP: packet was dropped + * The caller must hold the rtnl_mutex. */ -int netif_receive_skb(struct sk_buff *skb) +void netdev_rx_handler_unregister(struct net_device *dev) +{ + + ASSERT_RTNL(); + rcu_assign_pointer(dev->rx_handler, NULL); + rcu_assign_pointer(dev->rx_handler_data, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, + struct net_device *master) +{ + if (skb->pkt_type == PACKET_HOST) { + u16 *dest = (u16 *) eth_hdr(skb)->h_dest; + + memcpy(dest, master->dev_addr, ETH_ALEN); + } +} + +/* On bonding slaves other than the currently active slave, suppress + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and + * ARP on active-backup slaves with arp_validate enabled. + */ +int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) +{ + struct net_device *dev = skb->dev; + + if (master->priv_flags & IFF_MASTER_ARPMON) + dev->last_rx = jiffies; + + if ((master->priv_flags & IFF_MASTER_ALB) && + (master->priv_flags & IFF_BRIDGE_PORT)) { + /* Do address unmangle. The local destination address + * will be always the one master has. Provides the right + * functionality in a bridge. + */ + skb_bond_set_mac_by_master(skb, master); + } + + if (dev->priv_flags & IFF_SLAVE_INACTIVE) { + if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && + skb->protocol == __cpu_to_be16(ETH_P_ARP)) + return 0; + + if (master->priv_flags & IFF_MASTER_ALB) { + if (skb->pkt_type != PACKET_BROADCAST && + skb->pkt_type != PACKET_MULTICAST) + return 0; + } + if (master->priv_flags & IFF_MASTER_8023AD && + skb->protocol == __cpu_to_be16(ETH_P_SLOW)) + return 0; + + return 1; + } + return 0; +} +EXPORT_SYMBOL(__skb_bond_should_drop); + +static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; + rx_handler_func_t *rx_handler; struct net_device *orig_dev; + struct net_device *master; struct net_device *null_or_orig; + struct net_device *orig_or_bond; int ret = NET_RX_DROP; __be16 type; - if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) - return NET_RX_SUCCESS; + if (!netdev_tstamp_prequeue) + net_timestamp_check(skb); + + trace_netif_receive_skb(skb); /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->tstamp.tv64) - net_timestamp(skb); - - if (!skb->iif) - skb->iif = skb->dev->ifindex; + if (!skb->skb_iif) + skb->skb_iif = skb->dev->ifindex; + /* + * bonding note: skbs received on inactive slaves should only + * be delivered to pkt handlers that are exact matches. Also + * the deliver_no_wcard flag will be set. If packet handlers + * are sensitive to duplicate packets these skbs will need to + * be dropped at the handler. + */ null_or_orig = NULL; orig_dev = skb->dev; - if (orig_dev->master) { - if (skb_bond_should_drop(skb)) + master = ACCESS_ONCE(orig_dev->master); + if (skb->deliver_no_wcard) + null_or_orig = orig_dev; + else if (master) { + if (skb_bond_should_drop(skb, master)) { + skb->deliver_no_wcard = 1; null_or_orig = orig_dev; /* deliver only exact match */ - else - skb->dev = orig_dev->master; + } else + skb->dev = master; } - __get_cpu_var(netdev_rx_stat).total++; - + __this_cpu_inc(softnet_data.processed); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; @@ -2303,21 +3074,48 @@ int netif_receive_skb(struct sk_buff *skb) ncls: #endif - skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; - skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; + /* Handle special case of bridge or macvlan */ + rx_handler = rcu_dereference(skb->dev->rx_handler); + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + skb = rx_handler(skb); + if (!skb) + goto out; + } - skb_orphan(skb); + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_hwaccel_do_receive(&skb)) { + ret = __netif_receive_skb(skb); + goto out; + } else if (unlikely(!skb)) + goto out; + } + + /* + * Make sure frames received on VLAN interfaces stacked on + * bonding interfaces still make their way to any base bonding + * device that may have registered for a specific ptype. The + * handler may have to adjust skb->dev and orig_dev. + */ + orig_or_bond = orig_dev; + if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && + (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { + orig_or_bond = vlan_dev_real_dev(skb->dev); + } type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { + if (ptype->type == type && (ptype->dev == null_or_orig || + ptype->dev == skb->dev || ptype->dev == orig_dev || + ptype->dev == orig_or_bond)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -2327,6 +3125,7 @@ ncls: if (pt_prev) { ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -2339,18 +3138,80 @@ out: return ret; } -/* Network device is going away, flush any packets still pending */ +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); + + if (skb_defer_rx_timestamp(skb)) + return NET_RX_SUCCESS; + +#ifdef CONFIG_RPS + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; + + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + ret = __netif_receive_skb(skb); + } + + return ret; + } +#else + return __netif_receive_skb(skb); +#endif +} +EXPORT_SYMBOL(netif_receive_skb); + +/* Network device is going away, flush any packets still pending + * Called with irqs disabled. + */ static void flush_backlog(void *arg) { struct net_device *dev = arg; - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &__get_cpu_var(softnet_data); struct sk_buff *skb, *tmp; - skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) + rps_lock(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->input_pkt_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } + rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev == dev) { - __skb_unlink(skb, &queue->input_pkt_queue); + __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); + input_queue_head_incr(sd); } + } } static int napi_gro_complete(struct sk_buff *skb) @@ -2385,7 +3246,7 @@ out: return netif_receive_skb(skb); } -void napi_gro_flush(struct napi_struct *napi) +inline void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -2400,7 +3261,7 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; @@ -2408,12 +3269,12 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; int same_flow; int mac_len; - int ret; + enum gro_result ret; - if (!(skb->dev->features & NETIF_F_GRO)) + if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) goto normal; - if (skb_is_gso(skb) || skb_has_frags(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; rcu_read_lock(); @@ -2479,7 +3340,7 @@ pull: put_page(skb_shinfo(skb)->frags[0].page); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags); + --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); } } @@ -2492,41 +3353,44 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static inline gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; - if (netpoll_rx_on(skb)) - return GRO_NORMAL; - for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) - && !compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } return dev_gro_receive(napi, skb); } -int napi_skb_finish(int ret, struct sk_buff *skb) +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: kfree_skb(skb); break; + + case GRO_HELD: + case GRO_MERGED: + break; } - return err; + return ret; } EXPORT_SYMBOL(napi_skb_finish); @@ -2546,7 +3410,7 @@ void skb_gro_reset_offset(struct sk_buff *skb) } EXPORT_SYMBOL(skb_gro_reset_offset); -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); @@ -2554,60 +3418,52 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_gro_receive); -void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; napi->skb = skb; } -EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { - struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); - if (!skb) - goto out; - - skb_reserve(skb, NET_IP_ALIGN); - - napi->skb = skb; + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + if (skb) + napi->skb = skb; } - -out: return skb; } EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: case GRO_HELD: - skb->protocol = eth_type_trans(skb, napi->dev); - - if (ret == GRO_NORMAL) - return netif_receive_skb(skb); + skb->protocol = eth_type_trans(skb, skb->dev); - skb_gro_pull(skb, -ETH_HLEN); + if (ret == GRO_HELD) + skb_gro_pull(skb, -ETH_HLEN); + else if (netif_receive_skb(skb)) + ret = GRO_DROP; break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; + + case GRO_MERGED: + break; } - return err; + return ret; } EXPORT_SYMBOL(napi_frags_finish); @@ -2648,38 +3504,98 @@ out: } EXPORT_SYMBOL(napi_frags_skb); -int napi_gro_frags(struct napi_struct *napi) +gro_result_t napi_gro_frags(struct napi_struct *napi) { struct sk_buff *skb = napi_frags_skb(napi); if (!skb) - return NET_RX_DROP; + return GRO_DROP; return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); +/* + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + __smp_call_function_single(remsd->cpu, + &remsd->csd, 0); + remsd = next; + } + } else +#endif + local_irq_enable(); +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); +#ifdef CONFIG_RPS + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd->rps_ipi_list) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } +#endif napi->weight = weight_p; - do { + local_irq_disable(); + while (work < quota) { struct sk_buff *skb; + unsigned int qlen; - local_irq_disable(); - skb = __skb_dequeue(&queue->input_pkt_queue); - if (!skb) { - __napi_complete(napi); + while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); - break; + __netif_receive_skb(skb); + local_irq_disable(); + input_queue_head_incr(sd); + if (++work >= quota) { + local_irq_enable(); + return work; + } } - local_irq_enable(); - netif_receive_skb(skb); - } while (++work < quota && jiffies == start_time); + rps_lock(sd); + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen) + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + + if (qlen < quota - work) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set on backlog. + * we can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + list_del(&napi->poll_list); + napi->state = 0; + + quota = work + qlen; + } + rps_unlock(sd); + } + local_irq_enable(); return work; } @@ -2695,8 +3611,7 @@ void __napi_schedule(struct napi_struct *n) unsigned long flags; local_irq_save(flags); - list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + ____napi_schedule(&__get_cpu_var(softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); @@ -2767,17 +3682,16 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); - static void net_rx_action(struct softirq_action *h) { - struct list_head *list = &__get_cpu_var(softnet_data).poll_list; + struct softnet_data *sd = &__get_cpu_var(softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; local_irq_disable(); - while (!list_empty(list)) { + while (!list_empty(&sd->poll_list)) { struct napi_struct *n; int work, weight; @@ -2795,7 +3709,7 @@ static void net_rx_action(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - n = list_entry(list->next, struct napi_struct, poll_list); + n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); have = netpoll_poll_lock(n); @@ -2825,16 +3739,18 @@ static void net_rx_action(struct softirq_action *h) * move the instance around on the list at-will. */ if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) - __napi_complete(n); - else - list_move_tail(&n->poll_list, list); + if (unlikely(napi_disable_pending(n))) { + local_irq_enable(); + napi_complete(n); + local_irq_disable(); + } else + list_move_tail(&n->poll_list, &sd->poll_list); } netpoll_poll_unlock(have); } out: - local_irq_enable(); + net_rps_action_and_irq_enable(sd); #ifdef CONFIG_NET_DMA /* @@ -2847,12 +3763,12 @@ out: return; softnet_break: - __get_cpu_var(netdev_rx_stat).time_squeeze++; + sd->time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } -static gifconf_func_t * gifconf_list [NPROTO]; +static gifconf_func_t *gifconf_list[NPROTO]; /** * register_gifconf - register a SIOCGIF handler @@ -2863,13 +3779,14 @@ static gifconf_func_t * gifconf_list [NPROTO]; * that is passed must not be freed or reused until it has been replaced * by another handler. */ -int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) { if (family >= NPROTO) return -EINVAL; gifconf_list[family] = gifconf; return 0; } +EXPORT_SYMBOL(register_gifconf); /* @@ -2895,15 +3812,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifr.ifr_ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); return -ENODEV; } strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; @@ -2973,18 +3890,18 @@ static int dev_ifconf(struct net *net, char __user *arg) * in detail. */ void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; - read_lock(&dev_base_lock); + rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (off++ == *pos) return dev; @@ -2993,24 +3910,27 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net *net = seq_file_net(seq); + struct net_device *dev = (v == SEQ_START_TOKEN) ? + first_net_device(seq_file_net(seq)) : + next_net_device((struct net_device *)v); + ++*pos; - return v == SEQ_START_TOKEN ? - first_net_device(net) : next_net_device((struct net_device *)v); + return rcu_dereference(dev); } void dev_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { - const struct net_device_stats *stats = dev_get_stats(dev); + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, @@ -3045,17 +3965,17 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } -static struct netif_rx_stats *softnet_get_online(loff_t *pos) +static struct softnet_data *softnet_get_online(loff_t *pos) { - struct netif_rx_stats *rc = NULL; + struct softnet_data *sd = NULL; while (*pos < nr_cpu_ids) if (cpu_online(*pos)) { - rc = &per_cpu(netdev_rx_stat, *pos); + sd = &per_cpu(softnet_data, *pos); break; } else ++*pos; - return rc; + return sd; } static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) @@ -3075,12 +3995,12 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { - struct netif_rx_stats *s = v; + struct softnet_data *sd = v; - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, 0, + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - s->cpu_collision ); + sd->cpu_collision, sd->received_rps); return 0; } @@ -3303,11 +4223,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) slave->master = master; - synchronize_net(); - - if (old) + if (old) { + synchronize_net(); dev_put(old); - + } if (master) slave->flags |= IFF_SLAVE; else @@ -3316,6 +4235,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); return 0; } +EXPORT_SYMBOL(netdev_set_master); static void dev_change_rx_flags(struct net_device *dev, int flags) { @@ -3394,6 +4314,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) dev_set_rx_mode(dev); return err; } +EXPORT_SYMBOL(dev_set_promiscuity); /** * dev_set_allmulti - update allmulti count on a device @@ -3437,6 +4358,7 @@ int dev_set_allmulti(struct net_device *dev, int inc) } return 0; } +EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and @@ -3461,10 +4383,10 @@ void __dev_set_rx_mode(struct net_device *dev) /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ - if (dev->uc.count > 0 && !dev->uc_promisc) { + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1); dev->uc_promisc = 1; - } else if (dev->uc.count == 0 && dev->uc_promisc) { + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1); dev->uc_promisc = 0; } @@ -3481,555 +4403,6 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } -/* hw addresses list handling functions */ - -static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - int alloc_size; - - if (addr_len > MAX_ADDR_LEN) - return -EINVAL; - - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - ha->type == addr_type) { - ha->refcount++; - return 0; - } - } - - - alloc_size = sizeof(*ha); - if (alloc_size < L1_CACHE_BYTES) - alloc_size = L1_CACHE_BYTES; - ha = kmalloc(alloc_size, GFP_ATOMIC); - if (!ha) - return -ENOMEM; - memcpy(ha->addr, addr, addr_len); - ha->type = addr_type; - ha->refcount = 1; - ha->synced = false; - list_add_tail_rcu(&ha->list, &list->list); - list->count++; - return 0; -} - -static void ha_rcu_free(struct rcu_head *head) -{ - struct netdev_hw_addr *ha; - - ha = container_of(head, struct netdev_hw_addr, rcu_head); - kfree(ha); -} - -static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) { - if (--ha->refcount) - return 0; - list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); - list->count--; - return 0; - } - } - return -ENOENT; -} - -static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, - unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; - } - return 0; - -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; -} - -static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, - unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, addr_type); - } -} - -static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) -{ - int err = 0; - struct netdev_hw_addr *ha, *tmp; - - list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (!ha->synced) { - err = __hw_addr_add(to_list, ha->addr, - addr_len, ha->type); - if (err) - break; - ha->synced = true; - ha->refcount++; - } else if (ha->refcount == 1) { - __hw_addr_del(to_list, ha->addr, addr_len, ha->type); - __hw_addr_del(from_list, ha->addr, addr_len, ha->type); - } - } - return err; -} - -static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) -{ - struct netdev_hw_addr *ha, *tmp; - - list_for_each_entry_safe(ha, tmp, &from_list->list, list) { - if (ha->synced) { - __hw_addr_del(to_list, ha->addr, - addr_len, ha->type); - ha->synced = false; - __hw_addr_del(from_list, ha->addr, - addr_len, ha->type); - } - } -} - -static void __hw_addr_flush(struct netdev_hw_addr_list *list) -{ - struct netdev_hw_addr *ha, *tmp; - - list_for_each_entry_safe(ha, tmp, &list->list, list) { - list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); - } - list->count = 0; -} - -static void __hw_addr_init(struct netdev_hw_addr_list *list) -{ - INIT_LIST_HEAD(&list->list); - list->count = 0; -} - -/* Device addresses handling functions */ - -static void dev_addr_flush(struct net_device *dev) -{ - /* rtnl_mutex must be held here */ - - __hw_addr_flush(&dev->dev_addrs); - dev->dev_addr = NULL; -} - -static int dev_addr_init(struct net_device *dev) -{ - unsigned char addr[MAX_ADDR_LEN]; - struct netdev_hw_addr *ha; - int err; - - /* rtnl_mutex must be held here */ - - __hw_addr_init(&dev->dev_addrs); - memset(addr, 0, sizeof(addr)); - err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), - NETDEV_HW_ADDR_T_LAN); - if (!err) { - /* - * Get the first (previously created) address from the list - * and set dev_addr pointer to this location. - */ - ha = list_first_entry(&dev->dev_addrs.list, - struct netdev_hw_addr, list); - dev->dev_addr = ha->addr; - } - return err; -} - -/** - * dev_addr_add - Add a device address - * @dev: device - * @addr: address to add - * @addr_type: address type - * - * Add a device address to the device or increase the reference count if - * it already exists. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_add(struct net_device *dev, unsigned char *addr, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add); - -/** - * dev_addr_del - Release a device address. - * @dev: device - * @addr: address to delete - * @addr_type: address type - * - * Release reference to a device address and remove it from the device - * if the reference count drops to zero. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del(struct net_device *dev, unsigned char *addr, - unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha; - - ASSERT_RTNL(); - - /* - * We can not remove the first address from the list because - * dev->dev_addr points to that. - */ - ha = list_first_entry(&dev->dev_addrs.list, - struct netdev_hw_addr, list); - if (ha->addr == dev->dev_addr && ha->refcount == 1) - return -ENOENT; - - err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, - addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; -} -EXPORT_SYMBOL(dev_addr_del); - -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex. - */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device by which addresses the addresses will be deleted - * @addr_type: address type - 0 means type will used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - -/* multicast addresses handling functions */ - -int __dev_addr_delete(struct dev_addr_list **list, int *count, - void *addr, int alen, int glbl) -{ - struct dev_addr_list *da; - - for (; (da = *list) != NULL; list = &da->next) { - if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && - alen == da->da_addrlen) { - if (glbl) { - int old_glbl = da->da_gusers; - da->da_gusers = 0; - if (old_glbl == 0) - break; - } - if (--da->da_users) - return 0; - - *list = da->next; - kfree(da); - (*count)--; - return 0; - } - } - return -ENOENT; -} - -int __dev_addr_add(struct dev_addr_list **list, int *count, - void *addr, int alen, int glbl) -{ - struct dev_addr_list *da; - - for (da = *list; da != NULL; da = da->next) { - if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && - da->da_addrlen == alen) { - if (glbl) { - int old_glbl = da->da_gusers; - da->da_gusers = 1; - if (old_glbl) - return 0; - } - da->da_users++; - return 0; - } - } - - da = kzalloc(sizeof(*da), GFP_ATOMIC); - if (da == NULL) - return -ENOMEM; - memcpy(da->da_addr, addr, alen); - da->da_addrlen = alen; - da->da_users = 1; - da->da_gusers = glbl ? 1 : 0; - da->next = *list; - *list = da; - (*count)++; - return 0; -} - -/** - * dev_unicast_delete - Release secondary unicast address. - * @dev: device - * @addr: address to delete - * - * Release reference to a secondary unicast address and remove it - * from the device if the reference count drops to zero. - * - * The caller must hold the rtnl_mutex. - */ -int dev_unicast_delete(struct net_device *dev, void *addr) -{ - int err; - - ASSERT_RTNL(); - - err = __hw_addr_del(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST); - if (!err) - __dev_set_rx_mode(dev); - return err; -} -EXPORT_SYMBOL(dev_unicast_delete); - -/** - * dev_unicast_add - add a secondary unicast address - * @dev: device - * @addr: address to add - * - * Add a secondary unicast address to the device or increase - * the reference count if it already exists. - * - * The caller must hold the rtnl_mutex. - */ -int dev_unicast_add(struct net_device *dev, void *addr) -{ - int err; - - ASSERT_RTNL(); - - err = __hw_addr_add(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST); - if (!err) - __dev_set_rx_mode(dev); - return err; -} -EXPORT_SYMBOL(dev_unicast_add); - -int __dev_addr_sync(struct dev_addr_list **to, int *to_count, - struct dev_addr_list **from, int *from_count) -{ - struct dev_addr_list *da, *next; - int err = 0; - - da = *from; - while (da != NULL) { - next = da->next; - if (!da->da_synced) { - err = __dev_addr_add(to, to_count, - da->da_addr, da->da_addrlen, 0); - if (err < 0) - break; - da->da_synced = 1; - da->da_users++; - } else if (da->da_users == 1) { - __dev_addr_delete(to, to_count, - da->da_addr, da->da_addrlen, 0); - __dev_addr_delete(from, from_count, - da->da_addr, da->da_addrlen, 0); - } - da = next; - } - return err; -} - -void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, - struct dev_addr_list **from, int *from_count) -{ - struct dev_addr_list *da, *next; - - da = *from; - while (da != NULL) { - next = da->next; - if (da->da_synced) { - __dev_addr_delete(to, to_count, - da->da_addr, da->da_addrlen, 0); - da->da_synced = 0; - __dev_addr_delete(from, from_count, - da->da_addr, da->da_addrlen, 0); - } - da = next; - } -} - -/** - * dev_unicast_sync - Synchronize device's unicast list to another device - * @to: destination device - * @from: source device - * - * Add newly added addresses to the destination device and release - * addresses that have no users left. - * - * This function is intended to be called from the dev->set_rx_mode - * function of layered software devices. - */ -int dev_unicast_sync(struct net_device *to, struct net_device *from) -{ - int err = 0; - - ASSERT_RTNL(); - - if (to->addr_len != from->addr_len) - return -EINVAL; - - err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); - if (!err) - __dev_set_rx_mode(to); - return err; -} -EXPORT_SYMBOL(dev_unicast_sync); - -/** - * dev_unicast_unsync - Remove synchronized addresses from the destination device - * @to: destination device - * @from: source device - * - * Remove all addresses that were added to the destination device by - * dev_unicast_sync(). This function is intended to be called from the - * dev->stop function of layered software devices. - */ -void dev_unicast_unsync(struct net_device *to, struct net_device *from) -{ - ASSERT_RTNL(); - - if (to->addr_len != from->addr_len) - return; - - __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); - __dev_set_rx_mode(to); -} -EXPORT_SYMBOL(dev_unicast_unsync); - -static void dev_unicast_flush(struct net_device *dev) -{ - /* rtnl_mutex must be held here */ - - __hw_addr_flush(&dev->uc); -} - -static void dev_unicast_init(struct net_device *dev) -{ - /* rtnl_mutex must be held here */ - - __hw_addr_init(&dev->uc); -} - - -static void __dev_addr_discard(struct dev_addr_list **list) -{ - struct dev_addr_list *tmp; - - while (*list != NULL) { - tmp = *list; - *list = tmp->next; - if (tmp->da_users > tmp->da_gusers) - printk("__dev_addr_discard: address leakage! " - "da_users=%d\n", tmp->da_users); - kfree(tmp); - } -} - -static void dev_addr_discard(struct net_device *dev) -{ - netif_addr_lock_bh(dev); - - __dev_addr_discard(&dev->mc_list); - dev->mc_count = 0; - - netif_addr_unlock_bh(dev); -} - /** * dev_get_flags - get flags reported to userspace * @dev: device @@ -4059,19 +4432,12 @@ unsigned dev_get_flags(const struct net_device *dev) return flags; } +EXPORT_SYMBOL(dev_get_flags); -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags) { - int ret, changes; int old_flags = dev->flags; + int ret; ASSERT_RTNL(); @@ -4102,19 +4468,15 @@ int dev_change_flags(struct net_device *dev, unsigned flags) ret = 0; if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); if (!ret) dev_set_rx_mode(dev); } - if (dev->flags & IFF_UP && - ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | - IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); - if ((flags ^ dev->gflags) & IFF_PROMISC) { - int inc = (flags & IFF_PROMISC) ? +1 : -1; + int inc = (flags & IFF_PROMISC) ? 1 : -1; + dev->gflags ^= IFF_PROMISC; dev_set_promiscuity(dev, inc); } @@ -4124,18 +4486,56 @@ int dev_change_flags(struct net_device *dev, unsigned flags) IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { - int inc = (flags & IFF_ALLMULTI) ? +1 : -1; + int inc = (flags & IFF_ALLMULTI) ? 1 : -1; + dev->gflags ^= IFF_ALLMULTI; dev_set_allmulti(dev, inc); } - /* Exclude state transition flags, already notified */ - changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) + call_netdevice_notifiers(NETDEV_CHANGE, dev); +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret, changes; + int old_flags = dev->flags; + + ret = __dev_change_flags(dev, flags); + if (ret < 0) + return ret; + + changes = old_flags ^ dev->flags; if (changes) rtmsg_ifinfo(RTM_NEWLINK, dev, changes); + __dev_notify_flags(dev, old_flags); return ret; } +EXPORT_SYMBOL(dev_change_flags); /** * dev_set_mtu - Change maximum transfer unit @@ -4169,6 +4569,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); return err; } +EXPORT_SYMBOL(dev_set_mtu); /** * dev_set_mac_address - Change Media Access Control Address @@ -4193,69 +4594,70 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } +EXPORT_SYMBOL(dev_set_mac_address); /* - * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); - return 0; + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (short) dev_get_flags(dev); + return 0; - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; - case SIOCGIFSLAVE: - err = -EINVAL; - break; + case SIOCGIFSLAVE: + err = -EINVAL; + break; - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; - default: - /* dev_ioctl() should ensure this case - * is never reached - */ - WARN_ON(1); - err = -EINVAL; - break; + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -EINVAL; + break; } return err; @@ -4276,92 +4678,89 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ops = dev->netdev_ops; switch (cmd) { - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return 0; + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; - case SIOCADDMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; + case SIOCSIFMAP: + if (ops->ndo_set_config) { if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); + return ops->ndo_set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; - case SIOCDELMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); + case SIOCADDMULTI: + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); - case SIOCSIFTXQLEN: - if (ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; + case SIOCDELMULTI: + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; - /* - * Unknown or private ioctl - */ + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; + /* + * Unknown or private ioctl + */ + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; } return err; @@ -4418,135 +4817,135 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); - read_lock(&dev_base_lock); - ret = dev_ifsioc_locked(net, &ifr, cmd); - read_unlock(&dev_base_lock); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr.ifr_name); + rcu_read_lock(); + ret = dev_ifsioc_locked(net, &ifr, cmd); + rcu_read_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; - case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, &ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; + case SIOCETHTOOL: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(net, &ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - case SIOCSHWTSTAMP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSHWTSTAMP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -EINVAL; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr.ifr_name); rtnl_lock(); ret = dev_ifsioc(net, &ifr, cmd); rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -EINVAL; - - /* - * Unknown or private ioctl. - */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); - return -EINVAL; + } + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(net, &ifr, cmd, arg); + return -EINVAL; } } @@ -4578,74 +4977,88 @@ static void net_set_todo(struct net_device *dev) list_add_tail(&dev->todo_list, &net_todo_list); } -static void rollback_registered(struct net_device *dev) +static void rollback_registered_many(struct list_head *head) { + struct net_device *dev, *tmp; + BUG_ON(dev_boot_phase); ASSERT_RTNL(); - /* Some devices call without registering for initialization unwind. */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " - "was registered\n", dev->name, dev); + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); - WARN_ON(1); - return; - } + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } - BUG_ON(dev->reg_state != NETREG_REGISTERED); + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } /* If device is running, close it first. */ - dev_close(dev); + dev_close_many(head); - /* And unlink it from device chain. */ - unlist_netdevice(dev); + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); - dev->reg_state = NETREG_UNREGISTERING; + dev->reg_state = NETREG_UNREGISTERING; + } synchronize_net(); - /* Shutdown queueing discipline. */ - dev_shutdown(dev); + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* - * Flush the unicast and multicast chains - */ - dev_unicast_flush(dev); - dev_addr_discard(dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); - synchronize_net(); + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + } - dev_put(dev); -} + /* Process any work delayed until the end of the batch */ + dev = list_first_entry(head, struct net_device, unreg_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); -static void __netdev_init_queue_locks_one(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - spin_lock_init(&dev_queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; + rcu_barrier(); + + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); } -static void netdev_init_queue_locks(struct net_device *dev) +static void rollback_registered(struct net_device *dev) { - netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -4668,10 +5081,13 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } if (features & NETIF_F_UFO) { - if (!(features & NETIF_F_GEN_CSUM)) { + /* maybe split UFO into V4 and V6? */ + if (!((features & NETIF_F_GEN_CSUM) || + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { if (name) printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_HW_CSUM feature.\n", + "since no checksum offload features.\n", name); features &= ~NETIF_F_UFO; } @@ -4689,6 +5105,86 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) EXPORT_SYMBOL(netdev_fix_features); /** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + +#ifdef CONFIG_RPS +static int netif_alloc_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + + BUG_ON(count < 1); + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", count); + return -ENOMEM; + } + dev->_rx = rx; + + for (i = 0; i < count; i++) + rx[i].dev = dev; + return 0; +} +#endif + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) +{ + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +} + +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + + BUG_ON(count < 1); + + tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + pr_err("netdev: Unable to allocate %u tx queues.\n", + count); + return -ENOMEM; + } + dev->_tx = tx; + + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); + + return 0; +} + +/** * register_netdevice - register a network device * @dev: device to register * @@ -4707,8 +5203,6 @@ EXPORT_SYMBOL(netdev_fix_features); int register_netdevice(struct net_device *dev) { - struct hlist_head *head; - struct hlist_node *p; int ret; struct net *net = dev_net(dev); @@ -4723,7 +5217,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - netdev_init_queue_locks(dev); dev->iflink = -1; @@ -4737,26 +5230,14 @@ int register_netdevice(struct net_device *dev) } } - if (!dev_valid_name(dev->name)) { - ret = -EINVAL; + ret = dev_get_valid_name(dev, dev->name, 0); + if (ret) goto err_uninit; - } dev->ifindex = dev_new_index(net); if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Check for existence of name */ - head = dev_name_hash(net, dev->name); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; - goto err_uninit; - } - } - /* Fix illegal checksum combinations */ if ((dev->features & NETIF_F_HW_CSUM) && (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { @@ -4778,7 +5259,17 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; - netdev_initialize_kobject(dev); + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, + * vlan_dev_init() will do the dev->features check, so these features + * are enabled only if supported by underlying device. + */ + dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + ret = netdev_register_kobject(dev); if (ret) goto err_uninit; @@ -4802,6 +5293,13 @@ int register_netdevice(struct net_device *dev) rollback_registered(dev); dev->reg_state = NETREG_UNREGISTERED; } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); out: return ret; @@ -4811,6 +5309,7 @@ err_uninit: dev->netdev_ops->ndo_uninit(dev); goto out; } +EXPORT_SYMBOL(register_netdevice); /** * init_dummy_netdev - init a dummy network device for NAPI @@ -4836,9 +5335,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -4846,6 +5342,11 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -4887,6 +5388,16 @@ out: } EXPORT_SYMBOL(register_netdev); +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -4901,14 +5412,21 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; + + linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users + * should have already handle it the first time */ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -4928,11 +5446,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -4973,7 +5493,7 @@ void netdev_run_todo(void) while (!list_empty(&list)) { struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); + = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { @@ -4990,9 +5510,9 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); - WARN_ON(dev->ip_ptr); - WARN_ON(dev->ip6_ptr); + BUG_ON(netdev_refcnt_read(dev)); + WARN_ON(rcu_dereference_raw(dev->ip_ptr)); + WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) @@ -5004,76 +5524,137 @@ void netdev_run_todo(void) } /** + * dev_txq_stats_fold - fold tx_queues stats + * @dev: device to get statistics from + * @stats: struct rtnl_link_stats64 to hold results + */ +void dev_txq_stats_fold(const struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + spin_lock_bh(&txq->_xmit_lock); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + spin_unlock_bh(&txq->_xmit_lock); + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } +} +EXPORT_SYMBOL(dev_txq_stats_fold); + +/* Convert net_device_stats to rtnl_link_stats64. They have the same + * fields in the same order, with only the type differing. + */ +static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, + const struct net_device_stats *netdev_stats) +{ +#if BITS_PER_LONG == 64 + BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + memcpy(stats64, netdev_stats, sizeof(*stats64)); +#else + size_t i, n = sizeof(*stats64) / sizeof(u64); + const unsigned long *src = (const unsigned long *)netdev_stats; + u64 *dst = (u64 *)stats64; + + BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != + sizeof(*stats64) / sizeof(u64)); + for (i = 0; i < n; i++) + dst[i] = src[i]; +#endif +} + +/** * dev_get_stats - get network device statistics * @dev: device to get statistics from + * @storage: place to store stats * - * Get network statistics from device. The device driver may provide - * its own method by setting dev->netdev_ops->get_stats; otherwise - * the internal statistics structure is used. + * Get network statistics from device. Return @storage. + * The device driver may provide its own method by setting + * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; + * otherwise the internal statistics structure is used. */ -const struct net_device_stats *dev_get_stats(struct net_device *dev) +struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_get_stats) - return ops->ndo_get_stats(dev); - else { - unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - struct net_device_stats *stats = &dev->stats; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } - return stats; + if (ops->ndo_get_stats64) { + memset(storage, 0, sizeof(*storage)); + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { + netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else { + netdev_stats_to_stats64(storage, &dev->stats); + dev_txq_stats_fold(dev, storage); } + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + return storage; } EXPORT_SYMBOL(dev_get_stats); -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { - queue->dev = dev; -} + struct netdev_queue *queue = dev_ingress_queue(dev); -static void netdev_init_queues(struct net_device *dev) -{ - netdev_init_one_queue(dev, &dev->rx_queue, NULL); - netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); - spin_lock_init(&dev->tx_global_lock); +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; } /** - * alloc_netdev_mq - allocate network device + * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device - * @queue_count: the number of subqueues to allocate + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subquue structs - * for each queue on the device at the end of the netdevice. + * for each queue on the device. */ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, - void (*setup)(struct net_device *), unsigned int queue_count) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) { - struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); + if (txqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero queues.\n"); + return NULL; + } + +#ifdef CONFIG_RPS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero RX queues.\n"); + return NULL; + } +#endif + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ @@ -5089,45 +5670,57 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return NULL; } - tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "tx qdiscs.\n"); - goto free_p; - } - dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) + goto free_p; + if (dev_addr_init(dev)) - goto free_tx; + goto free_pcpu; - dev_unicast_init(dev); + dev_mc_init(dev); + dev_uc_init(dev); dev_net_set(dev, &init_net); - dev->_tx = tx; - dev->num_tx_queues = queue_count; - dev->real_num_tx_queues = queue_count; + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_pcpu; - dev->gso_max_size = GSO_MAX_SIZE; +#ifdef CONFIG_RPS + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_pcpu; +#endif - netdev_init_queues(dev); + dev->gso_max_size = GSO_MAX_SIZE; + INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); + dev->ethtool_ntuple_list.count = 0; INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->link_watch_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); return dev; -free_tx: - kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif free_p: kfree(p); return NULL; } -EXPORT_SYMBOL(alloc_netdev_mq); +EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device @@ -5144,13 +5737,24 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + + kfree(rcu_dereference_raw(dev->ingress_queue)); /* Flush device addresses */ dev_addr_flush(dev); + /* Clear ethtool n-tuple list */ + ethtool_ntuple_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); @@ -5163,6 +5767,7 @@ void free_netdev(struct net_device *dev) /* will free via device release */ put_device(&dev->dev); } +EXPORT_SYMBOL(free_netdev); /** * synchronize_net - Synchronize with packet receive processing @@ -5175,26 +5780,50 @@ void synchronize_net(void) might_sleep(); synchronize_rcu(); } +EXPORT_SYMBOL(synchronize_net); /** - * unregister_netdevice - remove device from the kernel + * unregister_netdevice_queue - remove device from the kernel * @dev: device + * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ -void unregister_netdevice(struct net_device *dev) +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); + } +} +EXPORT_SYMBOL(unregister_netdevice_queue); + +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; + + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + } } +EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel @@ -5213,7 +5842,6 @@ void unregister_netdev(struct net_device *dev) unregister_netdevice(dev); rtnl_unlock(); } - EXPORT_SYMBOL(unregister_netdev); /** @@ -5232,8 +5860,6 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - char buf[IFNAMSIZ]; - const char *destname; int err; ASSERT_RTNL(); @@ -5243,15 +5869,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char if (dev->features & NETIF_F_NETNS_LOCAL) goto out; -#ifdef CONFIG_SYSFS - /* Don't allow real devices to be moved when sysfs - * is enabled. - */ - err = -EINVAL; - if (dev->dev.parent) - goto out; -#endif - /* Ensure the device has been registrered */ err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) @@ -5266,20 +5883,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * we can use it in the destination network namespace. */ err = -EEXIST; - destname = dev->name; - if (__dev_get_by_name(net, destname)) { + if (__dev_get_by_name(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; - if (!dev_valid_name(pat)) - goto out; - if (strchr(pat, '%')) { - if (__dev_alloc_name(net, pat, buf) < 0) - goto out; - destname = buf; - } else - destname = pat; - if (__dev_get_by_name(net, destname)) + if (dev_get_valid_name(dev, pat, 1)) goto out; } @@ -5301,24 +5909,23 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that we are about to destroy this device. They should clean all the things. + + Note that dev->reg_state stays at NETREG_REGISTERED. + This is wanted because this way 8021q and macvlan know + the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); /* * Flush the unicast and multicast chains */ - dev_unicast_flush(dev); - dev_addr_discard(dev); - - netdev_unregister_kobject(dev); + dev_uc_flush(dev); + dev_mc_flush(dev); /* Actually switch the network namespace */ dev_net_set(dev, net); - /* Assign the new device name */ - if (destname != dev->name) - strcpy(dev->name, destname); - /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) { int iflink = (dev->iflink == dev->ifindex); @@ -5328,7 +5935,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char } /* Fixup kobjects */ - err = netdev_register_kobject(dev); + err = device_rename(&dev->dev, dev->name); WARN_ON(err); /* Add the device back in the hashes */ @@ -5337,18 +5944,24 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + synchronize_net(); err = 0; out: return err; } +EXPORT_SYMBOL_GPL(dev_change_net_namespace); static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) { struct sk_buff **list_skb; - struct Qdisc **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; @@ -5369,20 +5982,26 @@ static int dev_cpu_callback(struct notifier_block *nfb, *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; - /* Find end of our output_queue. */ - list_net = &sd->output_queue; - while (*list_net) - list_net = &(*list_net)->next_sched; /* Append output queue from offline CPU. */ - *list_net = oldsd->output_queue; - oldsd->output_queue = NULL; + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + while ((skb = __skb_dequeue(&oldsd->process_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); + input_queue_head_incr(oldsd); + } return NOTIFY_OK; } @@ -5402,7 +6021,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one, unsigned long mask) { /* If device needs checksumming, downgrade to it. */ - if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) + if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); else if (mask & NETIF_F_ALL_CSUM) { /* If one device supports v4/v6 checksumming, set for all. */ @@ -5422,7 +6041,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one, one |= NETIF_F_ALL_CSUM; one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO; + all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; all |= one & mask & NETIF_F_ONE_FOR_ALL; return all; @@ -5491,6 +6110,68 @@ char *netdev_drivername(const struct net_device *dev, char *buffer, int len) return buffer; } +static int __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) +{ + int r; + + if (dev && dev->dev.parent) + r = dev_printk(level, dev->dev.parent, "%s: %pV", + netdev_name(dev), vaf); + else if (dev) + r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + else + r = printk("%s(NULL net_device): %pV", level, vaf); + + return r; +} + +int netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) +{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + r = __netdev_printk(level, dev, &vaf); + va_end(args); + + return r; +} +EXPORT_SYMBOL(netdev_printk); + +#define define_netdev_printk_level(func, level) \ +int func(const struct net_device *dev, const char *fmt, ...) \ +{ \ + int r; \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __netdev_printk(level, dev, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ +EXPORT_SYMBOL(func); + +define_netdev_printk_level(netdev_emerg, KERN_EMERG); +define_netdev_printk_level(netdev_alert, KERN_ALERT); +define_netdev_printk_level(netdev_crit, KERN_CRIT); +define_netdev_printk_level(netdev_err, KERN_ERR); +define_netdev_printk_level(netdev_warn, KERN_WARNING); +define_netdev_printk_level(netdev_notice, KERN_NOTICE); +define_netdev_printk_level(netdev_info, KERN_INFO); + static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); @@ -5504,14 +6185,13 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit(struct net *net) { - struct net_device *dev; + struct net_device *dev, *aux; /* - * Push all migratable of the network devices back to the + * Push all migratable network devices back to the * initial network namespace */ rtnl_lock(); -restart: - for_each_netdev(net, dev) { + for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -5519,11 +6199,9 @@ restart: if (dev->features & NETIF_F_NETNS_LOCAL) continue; - /* Delete virtual devices */ - if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev); - goto restart; - } + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; /* Push remaing network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); @@ -5533,13 +6211,37 @@ restart: __func__, dev->name, err); BUG(); } - goto restart; } rtnl_unlock(); } +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registeration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); +} + static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, + .exit_batch = default_device_exit_batch, }; /* @@ -5577,17 +6279,26 @@ static int __init net_dev_init(void) */ for_each_possible_cpu(i) { - struct softnet_data *queue; + struct softnet_data *sd = &per_cpu(softnet_data, i); - queue = &per_cpu(softnet_data, i); - skb_queue_head_init(&queue->input_pkt_queue); - queue->completion_queue = NULL; - INIT_LIST_HEAD(&queue->poll_list); + memset(sd, 0, sizeof(*sd)); + skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + sd->completion_queue = NULL; + INIT_LIST_HEAD(&sd->poll_list); + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; +#ifdef CONFIG_RPS + sd->csd.func = rps_trigger_softirq; + sd->csd.info = sd; + sd->csd.flags = 0; + sd->cpu = i; +#endif - queue->backlog.poll = process_backlog; - queue->backlog.weight = weight_p; - queue->backlog.gro_list = NULL; - queue->backlog.gro_count = 0; + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; + sd->backlog.gro_list = NULL; + sd->backlog.gro_count = 0; } dev_boot_phase = 0; @@ -5622,47 +6333,9 @@ subsys_initcall(net_dev_init); static int __init initialize_hashrnd(void) { - get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); + get_random_bytes(&hashrnd, sizeof(hashrnd)); return 0; } late_initcall_sync(initialize_hashrnd); -EXPORT_SYMBOL(__dev_get_by_index); -EXPORT_SYMBOL(__dev_get_by_name); -EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(dev_valid_name); -EXPORT_SYMBOL(dev_add_pack); -EXPORT_SYMBOL(dev_alloc_name); -EXPORT_SYMBOL(dev_close); -EXPORT_SYMBOL(dev_get_by_flags); -EXPORT_SYMBOL(dev_get_by_index); -EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_open); -EXPORT_SYMBOL(dev_queue_xmit); -EXPORT_SYMBOL(dev_remove_pack); -EXPORT_SYMBOL(dev_set_allmulti); -EXPORT_SYMBOL(dev_set_promiscuity); -EXPORT_SYMBOL(dev_change_flags); -EXPORT_SYMBOL(dev_set_mtu); -EXPORT_SYMBOL(dev_set_mac_address); -EXPORT_SYMBOL(free_netdev); -EXPORT_SYMBOL(netdev_boot_setup_check); -EXPORT_SYMBOL(netdev_set_master); -EXPORT_SYMBOL(netdev_state_change); -EXPORT_SYMBOL(netif_receive_skb); -EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(register_gifconf); -EXPORT_SYMBOL(register_netdevice); -EXPORT_SYMBOL(register_netdevice_notifier); -EXPORT_SYMBOL(skb_checksum_help); -EXPORT_SYMBOL(synchronize_net); -EXPORT_SYMBOL(unregister_netdevice); -EXPORT_SYMBOL(unregister_netdevice_notifier); -EXPORT_SYMBOL(net_enable_timestamp); -EXPORT_SYMBOL(net_disable_timestamp); -EXPORT_SYMBOL(dev_get_flags); - -EXPORT_SYMBOL(dev_load); - -EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c new file mode 100644 index 00000000000..508f9c18992 --- /dev/null +++ b/net/core/dev_addr_lists.c @@ -0,0 +1,741 @@ +/* + * net/core/dev_addr_lists.c - Functions for handling net device lists + * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> + * + * This file contains functions for working with unicast, multicast and device + * addresses lists. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/list.h> +#include <linux/proc_fs.h> + +/* + * General list handling functions + */ + +static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + if (global) { + /* check if addr is already used as global */ + if (ha->global_use) + return 0; + else + ha->global_use = true; + } + ha->refcount++; + return 0; + } + } + + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + ha->refcount = 1; + ha->global_use = global; + ha->synced = false; + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + return 0; +} + +static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); +} + +static void ha_rcu_free(struct rcu_head *head) +{ + struct netdev_hw_addr *ha; + + ha = container_of(head, struct netdev_hw_addr, rcu_head); + kfree(ha); +} + +static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) +{ + struct netdev_hw_addr *ha; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + if (global) { + if (!ha->global_use) + break; + else + ha->global_use = false; + } + if (--ha->refcount) + return 0; + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + list->count--; + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); +} + +int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, &from_list->list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del(to_list, ha2->addr, addr_len, type); + } + return err; +} +EXPORT_SYMBOL(__hw_addr_add_multiple); + +void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del(to_list, ha->addr, addr_len, addr_type); + } +} +EXPORT_SYMBOL(__hw_addr_del_multiple); + +int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, ha->addr, addr_len, ha->type); + __hw_addr_del(from_list, ha->addr, addr_len, ha->type); + } + } + return err; +} +EXPORT_SYMBOL(__hw_addr_sync); + +void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->synced) { + __hw_addr_del(to_list, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, ha->addr, + addr_len, ha->type); + } + } +} +EXPORT_SYMBOL(__hw_addr_unsync); + +void __hw_addr_flush(struct netdev_hw_addr_list *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + } + list->count = 0; +} +EXPORT_SYMBOL(__hw_addr_flush); + +void __hw_addr_init(struct netdev_hw_addr_list *list) +{ + INIT_LIST_HEAD(&list->list); + list->count = 0; +} +EXPORT_SYMBOL(__hw_addr_init); + +/* + * Device addresses handling functions + */ + +/** + * dev_addr_flush - Flush device address list + * @dev: device + * + * Flush device address list and reset ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addrs); + dev->dev_addr = NULL; +} +EXPORT_SYMBOL(dev_addr_flush); + +/** + * dev_addr_init - Init device address list + * @dev: device + * + * Init device address list and create the first element, + * used by ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + __hw_addr_init(&dev->dev_addrs); + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} +EXPORT_SYMBOL(dev_addr_init); + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha; + + ASSERT_RTNL(); + + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device by which addresses the addresses will be deleted + * @addr_type: address type - 0 means type will used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* + * Unicast list handling functions + */ + +/** + * dev_uc_add - Add a secondary unicast address + * @dev: device + * @addr: address to add + * + * Add a secondary unicast address to the device or increase + * the reference count if it already exists. + */ +int dev_uc_add(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_add); + +/** + * dev_uc_del - Release secondary unicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a secondary unicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_uc_del(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_del); + +/** + * dev_uc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_tx_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. + */ +int dev_uc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_bh(to); + err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync); + +/** + * dev_uc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_uc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_uc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_uc_unsync); + +/** + * dev_uc_flush - Flush unicast addresses + * @dev: device + * + * Flush unicast addresses. + */ +void dev_uc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->uc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_uc_flush); + +/** + * dev_uc_flush - Init unicast address list + * @dev: device + * + * Init unicast address list. + */ +void dev_uc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->uc); +} +EXPORT_SYMBOL(dev_uc_init); + +/* + * Multicast list handling functions + */ + +static int __dev_mc_add(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +/** + * dev_mc_add - Add a multicast address + * @dev: device + * @addr: address to add + * + * Add a multicast address to the device or increase + * the reference count if it already exists. + */ +int dev_mc_add(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_add); + +/** + * dev_mc_add_global - Add a global multicast address + * @dev: device + * @addr: address to add + * + * Add a global multicast address to the device. + */ +int dev_mc_add_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_add_global); + +static int __dev_mc_del(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} + +/** + * dev_mc_del - Delete a multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_del); + +/** + * dev_mc_del_global - Delete a global multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_del_global); + +/** + * dev_mc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_tx_lock_bh. + * + * This function is intended to be called from the dev->set_multicast_list + * or dev->set_rx_mode function of layered software devices. + */ +int dev_mc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_bh(to); + err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync); + +/** + * dev_mc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_mc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_mc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_mc_unsync); + +/** + * dev_mc_flush - Flush multicast addresses + * @dev: device + * + * Flush multicast addresses. + */ +void dev_mc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->mc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_mc_flush); + +/** + * dev_mc_flush - Init multicast address list + * @dev: device + * + * Init multicast address list. + */ +void dev_mc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->mc); +} +EXPORT_SYMBOL(dev_mc_init); + +#ifdef CONFIG_PROC_FS +#include <linux/seq_file.h> + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, ha->refcount, ha->global_use); + + for (i = 0; i < dev->addr_len; i++) + seq_printf(seq, "%02x", ha->addr[i]); + + seq_putc(seq, '\n'); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + proc_net_remove(net, "dev_mcast"); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +void __init dev_mcast_init(void) +{ + register_pernet_subsys(&dev_mc_net_ops); +} + diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c deleted file mode 100644 index 9e2fa39f22a..00000000000 --- a/net/core/dev_mcast.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Linux NET3: Multicast List maintenance. - * - * Authors: - * Tim Kordas <tjk@nostromo.eeap.cwru.edu> - * Richard Underwood <richard@wuzz.demon.co.uk> - * - * Stir fried together from the IP multicast and CAP patches above - * Alan Cox <alan@lxorguk.ukuu.org.uk> - * - * Fixes: - * Alan Cox : Update the device on a real delete - * rather than any time but... - * Alan Cox : IFF_ALLMULTI support. - * Alan Cox : New format set_multicast_list() calls. - * Gleb Natapov : Remove dev_mc_lock. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/init.h> -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/route.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/arp.h> - - -/* - * Device multicast list maintenance. - * - * This is used both by IP and by the user level maintenance functions. - * Unlike BSD we maintain a usage count on a given multicast address so - * that a casual user application can add/delete multicasts used by - * protocols without doing damage to the protocols when it deletes the - * entries. It also helps IP as it tracks overlapping maps. - * - * Device mc lists are changed by bh at least if IPv6 is enabled, - * so that it must be bh protected. - * - * We block accesses to device mc filters with netif_tx_lock. - */ - -/* - * Delete a device level multicast - */ - -int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) -{ - int err; - - netif_addr_lock_bh(dev); - err = __dev_addr_delete(&dev->mc_list, &dev->mc_count, - addr, alen, glbl); - if (!err) { - /* - * We have altered the list, so the card - * loaded filter is now wrong. Fix it - */ - - __dev_set_rx_mode(dev); - } - netif_addr_unlock_bh(dev); - return err; -} - -/* - * Add a device level multicast - */ - -int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) -{ - int err; - - netif_addr_lock_bh(dev); - err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); - if (!err) - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); - return err; -} - -/** - * dev_mc_sync - Synchronize device's multicast list to another device - * @to: destination device - * @from: source device - * - * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. - * - * This function is intended to be called from the dev->set_multicast_list - * or dev->set_rx_mode function of layered software devices. - */ -int dev_mc_sync(struct net_device *to, struct net_device *from) -{ - int err = 0; - - netif_addr_lock_bh(to); - err = __dev_addr_sync(&to->mc_list, &to->mc_count, - &from->mc_list, &from->mc_count); - if (!err) - __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); - - return err; -} -EXPORT_SYMBOL(dev_mc_sync); - - -/** - * dev_mc_unsync - Remove synchronized addresses from the destination - * device - * @to: destination device - * @from: source device - * - * Remove all addresses that were added to the destination device by - * dev_mc_sync(). This function is intended to be called from the - * dev->stop function of layered software devices. - */ -void dev_mc_unsync(struct net_device *to, struct net_device *from) -{ - netif_addr_lock_bh(from); - netif_addr_lock(to); - - __dev_addr_unsync(&to->mc_list, &to->mc_count, - &from->mc_list, &from->mc_count); - __dev_set_rx_mode(to); - - netif_addr_unlock(to); - netif_addr_unlock_bh(from); -} -EXPORT_SYMBOL(dev_mc_unsync); - -#ifdef CONFIG_PROC_FS -static int dev_mc_seq_show(struct seq_file *seq, void *v) -{ - struct dev_addr_list *m; - struct net_device *dev = v; - - if (v == SEQ_START_TOKEN) - return 0; - - netif_addr_lock_bh(dev); - for (m = dev->mc_list; m; m = m->next) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, m->dmi_users, m->dmi_gusers); - - for (i = 0; i < m->dmi_addrlen; i++) - seq_printf(seq, "%02x", m->dmi_addr[i]); - - seq_putc(seq, '\n'); - } - netif_addr_unlock_bh(dev); - return 0; -} - -static const struct seq_operations dev_mc_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_mc_seq_show, -}; - -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, - .open = dev_mc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif - -static int __net_init dev_mc_net_init(struct net *net) -{ - if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) - return -ENOMEM; - return 0; -} - -static void __net_exit dev_mc_net_exit(struct net *net) -{ - proc_net_remove(net, "dev_mcast"); -} - -static struct pernet_operations __net_initdata dev_mc_net_ops = { - .init = dev_mc_net_init, - .exit = dev_mc_net_exit, -}; - -void __init dev_mcast_init(void) -{ - register_pernet_subsys(&dev_mc_net_ops); -} - -EXPORT_SYMBOL(dev_mc_add); -EXPORT_SYMBOL(dev_mc_delete); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9d66fa953ab..36e603c78ce 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> +#include <linux/slab.h> #include <net/genetlink.h> #include <net/netevent.h> @@ -41,7 +42,7 @@ static void send_dm_alert(struct work_struct *unused); * netlink alerts */ static int trace_state = TRACE_OFF; -static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(trace_state_lock); struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -52,6 +53,7 @@ struct per_cpu_dm_data { struct dm_hw_stat_delta { struct net_device *dev; + unsigned long last_rx; struct list_head list; struct rcu_head rcu; unsigned long last_drop_val; @@ -170,27 +172,35 @@ out: return; } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) { trace_drop_common(skb, location); } -static void trace_napi_poll_hit(struct napi_struct *napi) +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) { struct dm_hw_stat_delta *new_stat; /* - * Ratelimit our check time to dm_hw_check_delta jiffies + * Don't check napi structures with no associated device */ - if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + if (!napi->dev) return; rcu_read_lock(); list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + /* + * only add a note to our monitor buffer if: + * 1) this is the dev we received on + * 2) its after the last_rx delta + * 3) our rx_dropped count has gone up + */ if ((new_stat->dev == napi->dev) && + (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { trace_drop_common(NULL, NULL); new_stat->last_drop_val = napi->dev->stats.rx_dropped; + new_stat->last_rx = jiffies; break; } } @@ -213,14 +223,19 @@ static int set_all_monitor_traces(int state) spin_lock(&trace_state_lock); + if (state == trace_state) { + rc = -EAGAIN; + goto out_unlock; + } + switch (state) { case TRACE_ON: - rc |= register_trace_kfree_skb(trace_kfree_skb_hit); - rc |= register_trace_napi_poll(trace_napi_poll_hit); + rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL); break; case TRACE_OFF: - rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); - rc |= unregister_trace_napi_poll(trace_napi_poll_hit); + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); tracepoint_synchronize_unregister(); @@ -241,11 +256,12 @@ static int set_all_monitor_traces(int state) if (!rc) trace_state = state; + else + rc = -EINPROGRESS; +out_unlock: spin_unlock(&trace_state_lock); - if (rc) - return -EINPROGRESS; return rc; } @@ -286,7 +302,7 @@ static int dropmon_net_event(struct notifier_block *ev_block, goto out; new_stat->dev = dev; - INIT_RCU_HEAD(&new_stat->rcu); + new_stat->last_rx = jiffies; spin_lock(&trace_state_lock); list_add_rcu(&new_stat->list, &hw_stats_list); spin_unlock(&trace_state_lock); @@ -331,9 +347,9 @@ static struct notifier_block dropmon_net_notifier = { static int __init init_net_drop_monitor(void) { - int cpu; - int rc, i, ret; struct per_cpu_dm_data *data; + int cpu, rc; + printk(KERN_INFO "Initalizing network drop monitor service\n"); if (sizeof(void *) > 8) { @@ -341,21 +357,12 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - if (genl_register_family(&net_drop_monitor_family) < 0) { + rc = genl_register_family_with_ops(&net_drop_monitor_family, + dropmon_ops, + ARRAY_SIZE(dropmon_ops)); + if (rc) { printk(KERN_ERR "Could not create drop monitor netlink family\n"); - return -EFAULT; - } - - rc = -EFAULT; - - for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) { - ret = genl_register_ops(&net_drop_monitor_family, - &dropmon_ops[i]); - if (ret) { - printk(KERN_CRIT "Failed to register operation %d\n", - dropmon_ops[i].cmd); - goto out_unreg; - } + return rc; } rc = register_netdevice_notifier(&dropmon_net_notifier); diff --git a/net/core/dst.c b/net/core/dst.c index 57bc4d5b8d0..b99c7c7ffce 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -12,11 +12,13 @@ #include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> +#include <linux/sched.h> #include <net/dst.h> @@ -42,7 +44,7 @@ static atomic_t dst_total = ATOMIC_INIT(0); */ static struct { spinlock_t lock; - struct dst_entry *list; + struct dst_entry *list; unsigned long timer_inc; unsigned long timer_expires; } dst_garbage = { @@ -50,7 +52,7 @@ static struct { .timer_inc = DST_GC_MAX, }; static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry * dst); +static void ___dst_free(struct dst_entry *dst); static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); @@ -79,6 +81,7 @@ loop: while ((dst = next) != NULL) { next = dst->next; prefetch(&next->next); + cond_resched(); if (likely(atomic_read(&dst->__refcnt))) { last->next = dst; last = dst; @@ -133,8 +136,8 @@ loop: } expires = dst_garbage.timer_expires; /* - * if the next desired timer is more than 4 seconds in the future - * then round the timer to whole seconds + * if the next desired timer is more than 4 seconds in the + * future then round the timer to whole seconds */ if (expires > 4*HZ) expires = round_jiffies_relative(expires); @@ -149,7 +152,8 @@ loop: " expires: %lu elapsed: %lu us\n", atomic_read(&dst_total), delayed, work_performed, expires, - elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC); + elapsed.tv_sec * USEC_PER_SEC + + elapsed.tv_nsec / NSEC_PER_USEC); #endif } @@ -160,11 +164,11 @@ int dst_discard(struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard); -void * dst_alloc(struct dst_ops * ops) +void *dst_alloc(struct dst_ops *ops) { - struct dst_entry * dst; + struct dst_entry *dst; - if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) return NULL; } @@ -179,22 +183,22 @@ void * dst_alloc(struct dst_ops * ops) #if RT_CACHE_DEBUG >= 2 atomic_inc(&dst_total); #endif - atomic_inc(&ops->entries); + dst_entries_add(ops, 1); return dst; } +EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry * dst) +static void ___dst_free(struct dst_entry *dst) { /* The first case (dev==NULL) is required, when protocol module is unloaded. */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) dst->input = dst->output = dst_discard; - } dst->obsolete = 2; } -void __dst_free(struct dst_entry * dst) +void __dst_free(struct dst_entry *dst) { spin_lock_bh(&dst_garbage.lock); ___dst_free(dst); @@ -208,6 +212,7 @@ void __dst_free(struct dst_entry * dst) } spin_unlock_bh(&dst_garbage.lock); } +EXPORT_SYMBOL(__dst_free); struct dst_entry *dst_destroy(struct dst_entry * dst) { @@ -223,15 +228,15 @@ again: child = dst->child; dst->hh = NULL; - if (hh && atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + if (hh) + hh_cache_put(hh); if (neigh) { dst->neighbour = NULL; neigh_release(neigh); } - atomic_dec(&dst->ops->entries); + dst_entries_add(dst->ops, -1); if (dst->ops->destroy) dst->ops->destroy(dst); @@ -259,19 +264,47 @@ again: } return NULL; } +EXPORT_SYMBOL(dst_destroy); void dst_release(struct dst_entry *dst) { if (dst) { - int newrefcnt; - - smp_mb__before_atomic_dec(); - newrefcnt = atomic_dec_return(&dst->__refcnt); - WARN_ON(newrefcnt < 0); + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + WARN_ON(newrefcnt < 0); + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } EXPORT_SYMBOL(dst_release); +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + /* If dst not in cache, we must take a reference, because + * dst_release() will destroy dst as soon as its refcount becomes zero + */ + if (unlikely(dst->flags & DST_NOCACHE)) { + dst_hold(dst); + skb_dst_set(skb, dst); + } else { + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; + } +} +EXPORT_SYMBOL(skb_dst_set_noref); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice @@ -280,8 +313,8 @@ EXPORT_SYMBOL(dst_release); * * Commented and originally written by Alexey. */ -static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) +static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) { if (dst->ops->ifdown) dst->ops->ifdown(dst, dev, unregister); @@ -303,7 +336,8 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, } } -static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +static int dst_dev_event(struct notifier_block *this, unsigned long event, + void *ptr) { struct net_device *dev = ptr; struct dst_entry *dst, *last = NULL; @@ -326,9 +360,8 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void last->next = dst; else dst_busy_list = dst; - for (; dst; dst = dst->next) { + for (; dst; dst = dst->next) dst_ifdown(dst, dev, event != NETDEV_DOWN); - } mutex_unlock(&dst_gc_mutex); break; } @@ -337,13 +370,10 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void static struct notifier_block dst_dev_notifier = { .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ }; void __init dst_init(void) { register_netdevice_notifier(&dst_dev_notifier); } - -EXPORT_SYMBOL(__dst_free); -EXPORT_SYMBOL(dst_alloc); -EXPORT_SYMBOL(dst_destroy); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d9d5160610d..17741782a34 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -17,7 +17,10 @@ #include <linux/errno.h> #include <linux/ethtool.h> #include <linux/netdevice.h> -#include <asm/uaccess.h> +#include <linux/bitops.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> /* * Some useful ethtool_ops methods that're device independent. @@ -29,11 +32,19 @@ u32 ethtool_op_get_link(struct net_device *dev) { return netif_carrier_ok(dev) ? 1 : 0; } +EXPORT_SYMBOL(ethtool_op_get_link); + +u32 ethtool_op_get_rx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_ALL_CSUM) != 0; +} +EXPORT_SYMBOL(ethtool_op_get_rx_csum); u32 ethtool_op_get_tx_csum(struct net_device *dev) { return (dev->features & NETIF_F_ALL_CSUM) != 0; } +EXPORT_SYMBOL(ethtool_op_get_tx_csum); int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) { @@ -54,6 +65,7 @@ int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) { @@ -64,11 +76,13 @@ int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); u32 ethtool_op_get_sg(struct net_device *dev) { return (dev->features & NETIF_F_SG) != 0; } +EXPORT_SYMBOL(ethtool_op_get_sg); int ethtool_op_set_sg(struct net_device *dev, u32 data) { @@ -79,11 +93,13 @@ int ethtool_op_set_sg(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_sg); u32 ethtool_op_get_tso(struct net_device *dev) { return (dev->features & NETIF_F_TSO) != 0; } +EXPORT_SYMBOL(ethtool_op_get_tso); int ethtool_op_set_tso(struct net_device *dev, u32 data) { @@ -94,11 +110,13 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data) return 0; } +EXPORT_SYMBOL(ethtool_op_set_tso); u32 ethtool_op_get_ufo(struct net_device *dev) { return (dev->features & NETIF_F_UFO) != 0; } +EXPORT_SYMBOL(ethtool_op_get_ufo); int ethtool_op_set_ufo(struct net_device *dev, u32 data) { @@ -108,12 +126,14 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data) dev->features &= ~NETIF_F_UFO; return 0; } +EXPORT_SYMBOL(ethtool_op_set_ufo); /* the following list of flags are the same as their associated * NETIF_F_xxx values in include/linux/netdevice.h */ static const u32 flags_dup_features = - ETH_FLAG_LRO; + (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | + ETH_FLAG_RXHASH); u32 ethtool_op_get_flags(struct net_device *dev) { @@ -124,22 +144,36 @@ u32 ethtool_op_get_flags(struct net_device *dev) return dev->features & flags_dup_features; } +EXPORT_SYMBOL(ethtool_op_get_flags); -int ethtool_op_set_flags(struct net_device *dev, u32 data) +int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) { - if (data & ETH_FLAG_LRO) - dev->features |= NETIF_F_LRO; - else - dev->features &= ~NETIF_F_LRO; + if (data & ~supported) + return -EINVAL; + dev->features = ((dev->features & ~flags_dup_features) | + (data & flags_dup_features)); return 0; } +EXPORT_SYMBOL(ethtool_op_set_flags); + +void ethtool_ntuple_flush(struct net_device *dev) +{ + struct ethtool_rx_ntuple_flow_spec_container *fsc, *f; + + list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) { + list_del(&fsc->list); + kfree(fsc); + } + dev->ethtool_ntuple_list.count = 0; +} +EXPORT_SYMBOL(ethtool_ntuple_flush); /* Handlers for each ethtool command */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; + struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; int err; if (!dev->ethtool_ops->get_settings) @@ -167,19 +201,30 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_settings(dev, &cmd); } -static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, + void __user *useraddr) { struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; - if (!ops->get_drvinfo) - return -EOPNOTSUPP; - memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; - ops->get_drvinfo(dev, &info); + if (ops && ops->get_drvinfo) { + ops->get_drvinfo(dev, &info); + } else if (dev->dev.parent && dev->dev.parent->driver) { + strlcpy(info.bus_info, dev_name(dev->dev.parent), + sizeof(info.bus_info)); + strlcpy(info.driver, dev->dev.parent->driver->name, + sizeof(info.driver)); + } else { + return -EOPNOTSUPP; + } - if (ops->get_sset_count) { + /* + * this method of obtaining string set info is deprecated; + * Use ETHTOOL_GSSET_INFO instead. + */ + if (ops && ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -191,17 +236,10 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) info.n_priv_flags = rc; - } else { - /* code path for obsolete hooks */ - - if (ops->self_test_count) - info.testinfo_len = ops->self_test_count(dev); - if (ops->get_stats_count) - info.n_stats = ops->get_stats_count(dev); } - if (ops->get_regs_len) + if (ops && ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); - if (ops->get_eeprom_len) + if (ops && ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); if (copy_to_user(useraddr, &info, sizeof(info))) @@ -209,22 +247,94 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, + void __user *useraddr) { - struct ethtool_rxnfc cmd; + struct ethtool_sset_info info; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 sset_mask; + int i, idx = 0, n_bits = 0, ret, rc; + u32 *info_buf = NULL; + + if (!ops->get_sset_count) + return -EOPNOTSUPP; + + if (copy_from_user(&info, useraddr, sizeof(info))) + return -EFAULT; + + /* store copy of mask, because we zero struct later on */ + sset_mask = info.sset_mask; + if (!sset_mask) + return 0; + + /* calculate size of return buffer */ + n_bits = hweight64(sset_mask); + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GSSET_INFO; + + info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); + if (!info_buf) + return -ENOMEM; + + /* + * fill return buffer based on input bitmask and successful + * get_sset_count return + */ + for (i = 0; i < 64; i++) { + if (!(sset_mask & (1ULL << i))) + continue; + + rc = ops->get_sset_count(dev, i); + if (rc >= 0) { + info.sset_mask |= (1ULL << i); + info_buf[idx++] = rc; + } + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, sizeof(info))) + goto out; + + useraddr += offsetof(struct ethtool_sset_info, data); + if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + goto out; + + ret = 0; + +out: + kfree(info_buf); + return ret; +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; - return dev->ethtool_ops->set_rxnfc(dev, &cmd); + return dev->ethtool_ops->set_rxnfc(dev, &info); } -static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) { struct ethtool_rxnfc info; + size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; int ret; void *rule_buf = NULL; @@ -232,13 +342,22 @@ static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) if (!ops->get_rxnfc) return -EOPNOTSUPP; - if (copy_from_user(&info, useraddr, sizeof(info))) + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { - rule_buf = kmalloc(info.rule_cnt * sizeof(u32), - GFP_USER); + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kzalloc(info.rule_cnt * sizeof(u32), + GFP_USER); if (!rule_buf) return -ENOMEM; } @@ -249,7 +368,7 @@ static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) goto err_out; ret = -EFAULT; - if (copy_to_user(useraddr, &info, sizeof(info))) + if (copy_to_user(useraddr, &info, info_size)) goto err_out; if (rule_buf) { @@ -266,6 +385,421 @@ err_out: return ret; } +static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rxfh_indir *indir; + u32 table_size; + size_t full_size; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir) + return -EOPNOTSUPP; + + if (copy_from_user(&table_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(table_size))) + return -EFAULT; + + if (table_size > + (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) + return -ENOMEM; + full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; + indir = kzalloc(full_size, GFP_USER); + if (!indir) + return -ENOMEM; + + indir->cmd = ETHTOOL_GRXFHINDIR; + indir->size = table_size; + ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); + if (ret) + goto out; + + if (copy_to_user(useraddr, indir, full_size)) + ret = -EFAULT; + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rxfh_indir *indir; + u32 table_size; + size_t full_size; + int ret; + + if (!dev->ethtool_ops->set_rxfh_indir) + return -EOPNOTSUPP; + + if (copy_from_user(&table_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(table_size))) + return -EFAULT; + + if (table_size > + (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) + return -ENOMEM; + full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; + indir = kmalloc(full_size, GFP_USER); + if (!indir) + return -ENOMEM; + + if (copy_from_user(indir, useraddr, full_size)) { + ret = -EFAULT; + goto out; + } + + ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); + +out: + kfree(indir); + return ret; +} + +static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, + struct ethtool_rx_ntuple_flow_spec *spec, + struct ethtool_rx_ntuple_flow_spec_container *fsc) +{ + + /* don't add filters forever */ + if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) { + /* free the container */ + kfree(fsc); + return; + } + + /* Copy the whole filter over */ + fsc->fs.flow_type = spec->flow_type; + memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u)); + memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u)); + + fsc->fs.vlan_tag = spec->vlan_tag; + fsc->fs.vlan_tag_mask = spec->vlan_tag_mask; + fsc->fs.data = spec->data; + fsc->fs.data_mask = spec->data_mask; + fsc->fs.action = spec->action; + + /* add to the list */ + list_add_tail_rcu(&fsc->list, &list->list); + list->count++; +} + +/* + * ethtool does not (or did not) set masks for flow parameters that are + * not specified, so if both value and mask are 0 then this must be + * treated as equivalent to a mask with all bits set. Implement that + * here rather than in drivers. + */ +static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec; + struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec; + + if (fs->flow_type != TCP_V4_FLOW && + fs->flow_type != UDP_V4_FLOW && + fs->flow_type != SCTP_V4_FLOW) + return; + + if (!(entry->ip4src | mask->ip4src)) + mask->ip4src = htonl(0xffffffff); + if (!(entry->ip4dst | mask->ip4dst)) + mask->ip4dst = htonl(0xffffffff); + if (!(entry->psrc | mask->psrc)) + mask->psrc = htons(0xffff); + if (!(entry->pdst | mask->pdst)) + mask->pdst = htons(0xffff); + if (!(entry->tos | mask->tos)) + mask->tos = 0xff; + if (!(fs->vlan_tag | fs->vlan_tag_mask)) + fs->vlan_tag_mask = 0xffff; + if (!(fs->data | fs->data_mask)) + fs->data_mask = 0xffffffffffffffffULL; +} + +static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rx_ntuple cmd; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; + int ret; + + if (!(dev->features & NETIF_F_NTUPLE)) + return -EINVAL; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + rx_ntuple_fix_masks(&cmd.fs); + + /* + * Cache filter in dev struct for GET operation only if + * the underlying driver doesn't have its own GET operation, and + * only if the filter was added successfully. First make sure we + * can allocate the filter, then continue if successful. + */ + if (!ops->get_rx_ntuple) { + fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC); + if (!fsc) + return -ENOMEM; + } + + ret = ops->set_rx_ntuple(dev, &cmd); + if (ret) { + kfree(fsc); + return ret; + } + + if (!ops->get_rx_ntuple) + __rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc); + + return ret; +} + +static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rx_ntuple_flow_spec_container *fsc; + u8 *data; + char *p; + int ret, i, num_strings = 0; + + if (!ops->get_sset_count) + return -EOPNOTSUPP; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; + + data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + if (ops->get_rx_ntuple) { + /* driver-specific filter grab */ + ret = ops->get_rx_ntuple(dev, gstrings.string_set, data); + goto copy; + } + + /* default ethtool filter grab */ + i = 0; + p = (char *)data; + list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) { + sprintf(p, "Filter %d:\n", i); + p += ETH_GSTRING_LEN; + num_strings++; + + switch (fsc->fs.flow_type) { + case TCP_V4_FLOW: + sprintf(p, "\tFlow Type: TCP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case UDP_V4_FLOW: + sprintf(p, "\tFlow Type: UDP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case SCTP_V4_FLOW: + sprintf(p, "\tFlow Type: SCTP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case AH_ESP_V4_FLOW: + sprintf(p, "\tFlow Type: AH ESP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case ESP_V4_FLOW: + sprintf(p, "\tFlow Type: ESP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IP_USER_FLOW: + sprintf(p, "\tFlow Type: Raw IP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IPV4_FLOW: + sprintf(p, "\tFlow Type: IPv4\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + default: + sprintf(p, "\tFlow Type: Unknown\n"); + p += ETH_GSTRING_LEN; + num_strings++; + goto unknown_filter; + } + + /* now the rest of the filters */ + switch (fsc->fs.flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.tcp_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.tcp_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc Port: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.psrc, + fsc->fs.m_u.tcp_ip4_spec.psrc); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest Port: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.pdst, + fsc->fs.m_u.tcp_ip4_spec.pdst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.tos, + fsc->fs.m_u.tcp_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case AH_ESP_V4_FLOW: + case ESP_V4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.ah_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.ah_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSPI: %d, mask: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.spi, + fsc->fs.m_u.ah_ip4_spec.spi); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.tos, + fsc->fs.m_u.ah_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IP_USER_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IPV4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, + fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.tos, + fsc->fs.m_u.usr_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tIP Version: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip_ver, + fsc->fs.m_u.usr_ip4_spec.ip_ver); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tProtocol: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.proto, + fsc->fs.m_u.usr_ip4_spec.proto); + p += ETH_GSTRING_LEN; + num_strings++; + break; + } + sprintf(p, "\tVLAN: %d, mask: 0x%x\n", + fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask); + p += ETH_GSTRING_LEN; + num_strings++; + if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP) + sprintf(p, "\tAction: Drop\n"); + else + sprintf(p, "\tAction: Direct to queue %d\n", + fsc->fs.action); + p += ETH_GSTRING_LEN; + num_strings++; +unknown_filter: + i++; + } +copy: + /* indicate to userspace how many strings we actually have */ + gstrings.len = num_strings; + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; @@ -283,7 +817,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = kmalloc(reglen, GFP_USER); + regbuf = vmalloc(reglen); if (!regbuf) return -ENOMEM; @@ -298,13 +832,33 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) ret = 0; out: - kfree(regbuf); + vfree(regbuf); return ret; } +static int ethtool_reset(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value reset; + int ret; + + if (!dev->ethtool_ops->reset) + return -EOPNOTSUPP; + + if (copy_from_user(&reset, useraddr, sizeof(reset))) + return -EFAULT; + + ret = dev->ethtool_ops->reset(dev, &reset.data); + if (ret) + return ret; + + if (copy_to_user(useraddr, &reset, sizeof(reset))) + return -EFAULT; + return 0; +} + static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { - struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; @@ -337,6 +891,20 @@ static int ethtool_nway_reset(struct net_device *dev) return dev->ethtool_ops->nway_reset(dev); } +static int ethtool_get_link(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; @@ -436,9 +1004,10 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) return ret; } -static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, + void __user *useraddr) { - struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; @@ -450,7 +1019,8 @@ static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, + void __user *useraddr) { struct ethtool_coalesce coalesce; @@ -465,7 +1035,7 @@ static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; + struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; @@ -554,6 +1124,7 @@ static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_tx_csum(dev, edata.data); } +EXPORT_SYMBOL(ethtool_op_set_tx_csum); static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) { @@ -614,7 +1185,9 @@ static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) return -EFAULT; if (edata.data && !(dev->features & NETIF_F_SG)) return -EINVAL; - if (edata.data && !(dev->features & NETIF_F_HW_CSUM)) + if (edata.data && !((dev->features & NETIF_F_GEN_CSUM) || + (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) return -EINVAL; return dev->ethtool_ops->set_ufo(dev, edata.data); } @@ -625,7 +1198,7 @@ static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) edata.data = dev->features & NETIF_F_GSO; if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; + return -EFAULT; return 0; } @@ -648,7 +1221,7 @@ static int ethtool_get_gro(struct net_device *dev, char __user *useraddr) edata.data = dev->features & NETIF_F_GRO; if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; + return -EFAULT; return 0; } @@ -660,8 +1233,11 @@ static int ethtool_set_gro(struct net_device *dev, char __user *useraddr) return -EFAULT; if (edata.data) { - if (!dev->ethtool_ops->get_rx_csum || - !dev->ethtool_ops->get_rx_csum(dev)) + u32 rxcsum = dev->ethtool_ops->get_rx_csum ? + dev->ethtool_ops->get_rx_csum(dev) : + ethtool_op_get_rx_csum(dev); + + if (!rxcsum) return -EINVAL; dev->features |= NETIF_F_GRO; } else @@ -677,16 +1253,10 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) u64 *data; int ret, test_len; - if (!ops->self_test) - return -EOPNOTSUPP; - if (!ops->get_sset_count && !ops->self_test_count) + if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; - if (ops->get_sset_count) - test_len = ops->get_sset_count(dev, ETH_SS_TEST); - else - /* code path for obsolete hook */ - test_len = ops->self_test_count(dev); + test_len = ops->get_sset_count(dev, ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); @@ -721,36 +1291,17 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) u8 *data; int ret; - if (!ops->get_strings) + if (!ops->get_strings || !ops->get_sset_count) return -EOPNOTSUPP; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; - if (ops->get_sset_count) { - ret = ops->get_sset_count(dev, gstrings.string_set); - if (ret < 0) - return ret; - - gstrings.len = ret; - } else { - /* code path for obsolete hooks */ + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; - switch (gstrings.string_set) { - case ETH_SS_TEST: - if (!ops->self_test_count) - return -EOPNOTSUPP; - gstrings.len = ops->self_test_count(dev); - break; - case ETH_SS_STATS: - if (!ops->get_stats_count) - return -EOPNOTSUPP; - gstrings.len = ops->get_stats_count(dev); - break; - default: - return -EINVAL; - } - } + gstrings.len = ret; data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); if (!data) @@ -791,16 +1342,10 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) u64 *data; int ret, n_stats; - if (!ops->get_ethtool_stats) - return -EOPNOTSUPP; - if (!ops->get_sset_count && !ops->get_stats_count) + if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; - if (ops->get_sset_count) - n_stats = ops->get_sset_count(dev, ETH_SS_STATS); - else - /* code path for obsolete hook */ - n_stats = ops->get_stats_count(dev); + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; WARN_ON(n_stats == 0); @@ -850,7 +1395,7 @@ static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) static int ethtool_get_value(struct net_device *dev, char __user *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { - struct ethtool_value edata = { cmd }; + struct ethtool_value edata = { .cmd = cmd }; if (!actor) return -EOPNOTSUPP; @@ -891,6 +1436,20 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr, return actor(dev, edata.data); } +static noinline_for_stack int ethtool_flash_device(struct net_device *dev, + char __user *useraddr) +{ + struct ethtool_flash efl; + + if (copy_from_user(&efl, useraddr, sizeof(efl))) + return -EFAULT; + + if (!dev->ethtool_ops->flash_device) + return -EOPNOTSUPP; + + return dev->ethtool_ops->flash_device(dev, &efl); +} + /* The main entry point in this file. Called from net/core/dev.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -904,14 +1463,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (!dev || !netif_device_present(dev)) return -ENODEV; - if (!dev->ethtool_ops) - return -EOPNOTSUPP; - - if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (!dev->ethtool_ops) { + /* ETHTOOL_GDRVINFO does not require any driver support. + * It is also unprivileged and does not change anything, + * so we can take a shortcut to it. */ + if (ethcmd == ETHTOOL_GDRVINFO) + return ethtool_get_drvinfo(dev, useraddr); + else + return -EOPNOTSUPP; + } + /* Allow some commands to be done by anyone */ - switch(ethcmd) { + switch (ethcmd) { + case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GCOALESCE: @@ -925,6 +1492,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: + case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: @@ -938,10 +1506,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return -EPERM; } - if (dev->ethtool_ops->begin) - if ((rc = dev->ethtool_ops->begin(dev)) < 0) + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) return rc; - + } old_features = dev->features; switch (ethcmd) { @@ -975,8 +1544,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: - rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_link); + rc = ethtool_get_link(dev, useraddr); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); @@ -1004,7 +1572,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) break; case ETHTOOL_GRXCSUM: rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_rx_csum); + (dev->ethtool_ops->get_rx_csum ? + dev->ethtool_ops->get_rx_csum : + ethtool_op_get_rx_csum)); break; case ETHTOOL_SRXCSUM: rc = ethtool_set_rx_csum(dev, useraddr); @@ -1068,7 +1638,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_flags); + (dev->ethtool_ops->get_flags ? + dev->ethtool_ops->get_flags : + ethtool_op_get_flags)); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, @@ -1087,12 +1659,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: - rc = ethtool_get_rxnfc(dev, useraddr); + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_SRXFH: case ETHTOOL_SRXCLSRLDEL: case ETHTOOL_SRXCLSRLINS: - rc = ethtool_set_rxnfc(dev, useraddr); + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_GGRO: rc = ethtool_get_gro(dev, useraddr); @@ -1100,6 +1672,27 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SGRO: rc = ethtool_set_gro(dev, useraddr); break; + case ETHTOOL_FLASHDEV: + rc = ethtool_flash_device(dev, useraddr); + break; + case ETHTOOL_RESET: + rc = ethtool_reset(dev, useraddr); + break; + case ETHTOOL_SRXNTUPLE: + rc = ethtool_set_rx_ntuple(dev, useraddr); + break; + case ETHTOOL_GRXNTUPLE: + rc = ethtool_get_rx_ntuple(dev, useraddr); + break; + case ETHTOOL_GSSET_INFO: + rc = ethtool_get_sset_info(dev, useraddr); + break; + case ETHTOOL_GRXFHINDIR: + rc = ethtool_get_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_SRXFHINDIR: + rc = ethtool_set_rxfh_indir(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } @@ -1112,17 +1705,3 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return rc; } - -EXPORT_SYMBOL(ethtool_op_get_link); -EXPORT_SYMBOL(ethtool_op_get_sg); -EXPORT_SYMBOL(ethtool_op_get_tso); -EXPORT_SYMBOL(ethtool_op_get_tx_csum); -EXPORT_SYMBOL(ethtool_op_set_sg); -EXPORT_SYMBOL(ethtool_op_set_tso); -EXPORT_SYMBOL(ethtool_op_set_tx_csum); -EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); -EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); -EXPORT_SYMBOL(ethtool_op_set_ufo); -EXPORT_SYMBOL(ethtool_op_get_ufo); -EXPORT_SYMBOL(ethtool_op_set_flags); -EXPORT_SYMBOL(ethtool_op_get_flags); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bd309384f8b..a20e5d3bbfa 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -38,6 +39,24 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); +u32 fib_default_rule_pref(struct fib_rules_ops *ops) +{ + struct list_head *pos; + struct fib_rule *rule; + + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { + rule = list_entry(pos->next, struct fib_rule, list); + if (rule->pref) + return rule->pref - 1; + } + } + + return 0; +} +EXPORT_SYMBOL(fib_default_rule_pref); + static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); @@ -72,7 +91,7 @@ static void flush_route_cache(struct fib_rules_ops *ops) ops->flush_cache(ops); } -int fib_rules_register(struct fib_rules_ops *ops) +static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; @@ -102,9 +121,30 @@ errout: return err; } +struct fib_rules_ops * +fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) +{ + struct fib_rules_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + err = __fib_rules_register(ops); + if (err) { + kfree(ops); + ops = ERR_PTR(err); + } + + return ops; +} EXPORT_SYMBOL_GPL(fib_rules_register); -void fib_rules_cleanup_ops(struct fib_rules_ops *ops) +static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; @@ -113,7 +153,15 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops) fib_rule_put(rule); } } -EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops); + +static void fib_rules_put_rcu(struct rcu_head *head) +{ + struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); + struct net *net = ops->fro_net; + + release_net(net); + kfree(ops); +} void fib_rules_unregister(struct fib_rules_ops *ops) { @@ -124,10 +172,8 @@ void fib_rules_unregister(struct fib_rules_ops *ops) fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); - synchronize_rcu(); - release_net(net); + call_rcu(&ops->rcu, fib_rules_put_rcu); } - EXPORT_SYMBOL_GPL(fib_rules_unregister); static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, @@ -135,7 +181,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->ifindex && (rule->ifindex != fl->iif)) + if (rule->iifindex && (rule->iifindex != fl->iif)) + goto out; + + if (rule->oifindex && (rule->oifindex != fl->oif)) goto out; if ((rule->mark ^ fl->mark) & rule->mark_mask) @@ -175,9 +224,12 @@ jumped: err = ops->action(rule, fl, flags, arg); if (err != -EAGAIN) { - fib_rule_get(rule); - arg->rule = rule; - goto out; + if ((arg->flags & FIB_LOOKUP_NOREF) || + likely(atomic_inc_not_zero(&rule->refcnt))) { + arg->rule = rule; + goto out; + } + break; } } @@ -187,7 +239,6 @@ out: return err; } - EXPORT_SYMBOL_GPL(fib_rules_lookup); static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, @@ -248,14 +299,24 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (tb[FRA_PRIORITY]) rule->pref = nla_get_u32(tb[FRA_PRIORITY]); - if (tb[FRA_IFNAME]) { + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + rule->iifindex = -1; + nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->iifname); + if (dev) + rule->iifindex = dev->ifindex; + } + + if (tb[FRA_OIFNAME]) { struct net_device *dev; - rule->ifindex = -1; - nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->ifname); + rule->oifindex = -1; + nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->oifname); if (dev) - rule->ifindex = dev->ifindex; + rule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -274,7 +335,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); - if (!rule->pref && ops->default_pref) + if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); err = -EINVAL; @@ -289,12 +350,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { - rule->ctarget = r; + RCU_INIT_POINTER(rule->ctarget, r); break; } } - if (rule->ctarget == NULL) + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; } else if (rule->action == FR_ACT_GOTO) goto errout_free; @@ -311,6 +372,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) fib_rule_get(rule); + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if @@ -319,7 +385,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref) { - BUG_ON(r->ctarget != NULL); + BUG_ON(rtnl_dereference(r->ctarget) != NULL); rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; @@ -333,11 +399,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (unresolved) ops->unresolved_rules++; - if (last) - list_add_rcu(&rule->list, &last->list); - else - list_add_rcu(&rule->list, &ops->rules_list); - notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); flush_route_cache(ops); rules_ops_put(ops); @@ -388,8 +449,12 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) continue; - if (tb[FRA_IFNAME] && - nla_strcmp(tb[FRA_IFNAME], rule->ifname)) + if (tb[FRA_IIFNAME] && + nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) + continue; + + if (tb[FRA_OIFNAME] && + nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) continue; if (tb[FRA_FWMARK] && @@ -421,14 +486,13 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) */ if (ops->nr_goto_rules > 0) { list_for_each_entry(tmp, &ops->rules_list, list) { - if (tmp->ctarget == rule) { + if (rtnl_dereference(tmp->ctarget) == rule) { rcu_assign_pointer(tmp->ctarget, NULL); ops->unresolved_rules++; } } } - synchronize_rcu(); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); fib_rule_put(rule); @@ -447,7 +511,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) - + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_FWMARK */ @@ -471,6 +536,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, return -EMSGSIZE; frh = nlmsg_data(nlh); + frh->family = ops->family; frh->table = rule->table; NLA_PUT_U32(skb, FRA_TABLE, rule->table); frh->res1 = 0; @@ -478,14 +544,22 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->action = rule->action; frh->flags = rule->flags; - if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) + if (rule->action == FR_ACT_GOTO && + rcu_dereference_raw(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; - if (rule->ifname[0]) { - NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname); + if (rule->iifname[0]) { + NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname); + + if (rule->iifindex == -1) + frh->flags |= FIB_RULE_IIF_DETACHED; + } + + if (rule->oifname[0]) { + NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname); - if (rule->ifindex == -1) - frh->flags |= FIB_RULE_DEV_DETACHED; + if (rule->oifindex == -1) + frh->flags |= FIB_RULE_OIF_DETACHED; } if (rule->pref) @@ -558,7 +632,7 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) break; cb->args[1] = 0; - skip: +skip: idx++; } rcu_read_unlock(); @@ -600,9 +674,12 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->ifindex == -1 && - strcmp(dev->name, rule->ifname) == 0) - rule->ifindex = dev->ifindex; + if (rule->iifindex == -1 && + strcmp(dev->name, rule->iifname) == 0) + rule->iifindex = dev->ifindex; + if (rule->oifindex == -1 && + strcmp(dev->name, rule->oifname) == 0) + rule->oifindex = dev->ifindex; } } @@ -610,9 +687,12 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; - list_for_each_entry(rule, rules, list) - if (rule->ifindex == dev->ifindex) - rule->ifindex = -1; + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == dev->ifindex) + rule->iifindex = -1; + if (rule->oifindex == dev->ifindex) + rule->oifindex = -1; + } } @@ -624,7 +704,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct fib_rules_ops *ops; ASSERT_RTNL(); - rcu_read_lock(); switch (event) { case NETDEV_REGISTER: @@ -638,8 +717,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, break; } - rcu_read_unlock(); - return NOTIFY_DONE; } @@ -647,7 +724,7 @@ static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; -static int fib_rules_net_init(struct net *net) +static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); diff --git a/net/core/filter.c b/net/core/filter.c index d1d779ca096..afc58374ca9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -25,6 +25,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> +#include <linux/gfp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> @@ -36,9 +37,69 @@ #include <asm/uaccess.h> #include <asm/unaligned.h> #include <linux/filter.h> +#include <linux/reciprocal_div.h> + +enum { + BPF_S_RET_K = 1, + BPF_S_RET_A, + BPF_S_ALU_ADD_K, + BPF_S_ALU_ADD_X, + BPF_S_ALU_SUB_K, + BPF_S_ALU_SUB_X, + BPF_S_ALU_MUL_K, + BPF_S_ALU_MUL_X, + BPF_S_ALU_DIV_X, + BPF_S_ALU_AND_K, + BPF_S_ALU_AND_X, + BPF_S_ALU_OR_K, + BPF_S_ALU_OR_X, + BPF_S_ALU_LSH_K, + BPF_S_ALU_LSH_X, + BPF_S_ALU_RSH_K, + BPF_S_ALU_RSH_X, + BPF_S_ALU_NEG, + BPF_S_LD_W_ABS, + BPF_S_LD_H_ABS, + BPF_S_LD_B_ABS, + BPF_S_LD_W_LEN, + BPF_S_LD_W_IND, + BPF_S_LD_H_IND, + BPF_S_LD_B_IND, + BPF_S_LD_IMM, + BPF_S_LDX_W_LEN, + BPF_S_LDX_B_MSH, + BPF_S_LDX_IMM, + BPF_S_MISC_TAX, + BPF_S_MISC_TXA, + BPF_S_ALU_DIV_K, + BPF_S_LD_MEM, + BPF_S_LDX_MEM, + BPF_S_ST, + BPF_S_STX, + BPF_S_JMP_JA, + BPF_S_JMP_JEQ_K, + BPF_S_JMP_JEQ_X, + BPF_S_JMP_JGE_K, + BPF_S_JMP_JGE_X, + BPF_S_JMP_JGT_K, + BPF_S_JMP_JGT_X, + BPF_S_JMP_JSET_K, + BPF_S_JMP_JSET_X, + /* Ancillary data */ + BPF_S_ANC_PROTOCOL, + BPF_S_ANC_PKTTYPE, + BPF_S_ANC_IFINDEX, + BPF_S_ANC_NLATTR, + BPF_S_ANC_NLATTR_NEST, + BPF_S_ANC_MARK, + BPF_S_ANC_QUEUE, + BPF_S_ANC_HATYPE, + BPF_S_ANC_RXHASH, + BPF_S_ANC_CPU, +}; /* No hurry in this branch */ -static void *__load_pointer(struct sk_buff *skb, int k) +static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size) { u8 *ptr = NULL; @@ -47,21 +108,17 @@ static void *__load_pointer(struct sk_buff *skb, int k) else if (k >= SKF_LL_OFF) ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr < skb_tail_pointer(skb)) + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; return NULL; } -static inline void *load_pointer(struct sk_buff *skb, int k, +static inline void *load_pointer(const struct sk_buff *skb, int k, unsigned int size, void *buffer) { if (k >= 0) return skb_header_pointer(skb, k, size, buffer); - else { - if (k >= SKF_AD_OFF) - return NULL; - return __load_pointer(skb, k); - } + return __load_pointer(skb, k, size); } /** @@ -86,10 +143,10 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) return err; rcu_read_lock_bh(); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_bh(sk->sk_filter); if (filter) { - unsigned int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); + unsigned int pkt_len = sk_run_filter(skb, filter->insns); + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } rcu_read_unlock_bh(); @@ -101,209 +158,222 @@ EXPORT_SYMBOL(sk_filter); /** * sk_run_filter - run a filter on a socket * @skb: buffer to run the filter on - * @filter: filter to apply - * @flen: length of filter + * @fentry: filter to apply * * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. skb is the data we are - * filtering, filter is the array of filter instructions, and - * len is the number of filter blocks in the array. + * Return length to keep, 0 for none. @skb is the data we are + * filtering, @filter is the array of filter instructions. + * Because all jumps are guaranteed to be before last instruction, + * and last instruction guaranteed to be a RET, we dont need to check + * flen. (We used to pass to this function the length of filter) */ -unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +unsigned int sk_run_filter(const struct sk_buff *skb, + const struct sock_filter *fentry) { - struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ u32 tmp; int k; - int pc; /* * Process array of filter instructions. */ - for (pc = 0; pc < flen; pc++) { - fentry = &filter[pc]; + for (;; fentry++) { +#if defined(CONFIG_X86_32) +#define K (fentry->k) +#else + const u32 K = fentry->k; +#endif switch (fentry->code) { - case BPF_ALU|BPF_ADD|BPF_X: + case BPF_S_ALU_ADD_X: A += X; continue; - case BPF_ALU|BPF_ADD|BPF_K: - A += fentry->k; + case BPF_S_ALU_ADD_K: + A += K; continue; - case BPF_ALU|BPF_SUB|BPF_X: + case BPF_S_ALU_SUB_X: A -= X; continue; - case BPF_ALU|BPF_SUB|BPF_K: - A -= fentry->k; + case BPF_S_ALU_SUB_K: + A -= K; continue; - case BPF_ALU|BPF_MUL|BPF_X: + case BPF_S_ALU_MUL_X: A *= X; continue; - case BPF_ALU|BPF_MUL|BPF_K: - A *= fentry->k; + case BPF_S_ALU_MUL_K: + A *= K; continue; - case BPF_ALU|BPF_DIV|BPF_X: + case BPF_S_ALU_DIV_X: if (X == 0) return 0; A /= X; continue; - case BPF_ALU|BPF_DIV|BPF_K: - A /= fentry->k; + case BPF_S_ALU_DIV_K: + A = reciprocal_divide(A, K); continue; - case BPF_ALU|BPF_AND|BPF_X: + case BPF_S_ALU_AND_X: A &= X; continue; - case BPF_ALU|BPF_AND|BPF_K: - A &= fentry->k; + case BPF_S_ALU_AND_K: + A &= K; continue; - case BPF_ALU|BPF_OR|BPF_X: + case BPF_S_ALU_OR_X: A |= X; continue; - case BPF_ALU|BPF_OR|BPF_K: - A |= fentry->k; + case BPF_S_ALU_OR_K: + A |= K; continue; - case BPF_ALU|BPF_LSH|BPF_X: + case BPF_S_ALU_LSH_X: A <<= X; continue; - case BPF_ALU|BPF_LSH|BPF_K: - A <<= fentry->k; + case BPF_S_ALU_LSH_K: + A <<= K; continue; - case BPF_ALU|BPF_RSH|BPF_X: + case BPF_S_ALU_RSH_X: A >>= X; continue; - case BPF_ALU|BPF_RSH|BPF_K: - A >>= fentry->k; + case BPF_S_ALU_RSH_K: + A >>= K; continue; - case BPF_ALU|BPF_NEG: + case BPF_S_ALU_NEG: A = -A; continue; - case BPF_JMP|BPF_JA: - pc += fentry->k; + case BPF_S_JMP_JA: + fentry += K; continue; - case BPF_JMP|BPF_JGT|BPF_K: - pc += (A > fentry->k) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JGT_K: + fentry += (A > K) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JGE|BPF_K: - pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JGE_K: + fentry += (A >= K) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JEQ|BPF_K: - pc += (A == fentry->k) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JEQ_K: + fentry += (A == K) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JSET|BPF_K: - pc += (A & fentry->k) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JSET_K: + fentry += (A & K) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JGT|BPF_X: - pc += (A > X) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JGT_X: + fentry += (A > X) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JGE|BPF_X: - pc += (A >= X) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JGE_X: + fentry += (A >= X) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JEQ|BPF_X: - pc += (A == X) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JEQ_X: + fentry += (A == X) ? fentry->jt : fentry->jf; continue; - case BPF_JMP|BPF_JSET|BPF_X: - pc += (A & X) ? fentry->jt : fentry->jf; + case BPF_S_JMP_JSET_X: + fentry += (A & X) ? fentry->jt : fentry->jf; continue; - case BPF_LD|BPF_W|BPF_ABS: - k = fentry->k; + case BPF_S_LD_W_ABS: + k = K; load_w: ptr = load_pointer(skb, k, 4, &tmp); if (ptr != NULL) { A = get_unaligned_be32(ptr); continue; } - break; - case BPF_LD|BPF_H|BPF_ABS: - k = fentry->k; + return 0; + case BPF_S_LD_H_ABS: + k = K; load_h: ptr = load_pointer(skb, k, 2, &tmp); if (ptr != NULL) { A = get_unaligned_be16(ptr); continue; } - break; - case BPF_LD|BPF_B|BPF_ABS: - k = fentry->k; + return 0; + case BPF_S_LD_B_ABS: + k = K; load_b: ptr = load_pointer(skb, k, 1, &tmp); if (ptr != NULL) { A = *(u8 *)ptr; continue; } - break; - case BPF_LD|BPF_W|BPF_LEN: + return 0; + case BPF_S_LD_W_LEN: A = skb->len; continue; - case BPF_LDX|BPF_W|BPF_LEN: + case BPF_S_LDX_W_LEN: X = skb->len; continue; - case BPF_LD|BPF_W|BPF_IND: - k = X + fentry->k; + case BPF_S_LD_W_IND: + k = X + K; goto load_w; - case BPF_LD|BPF_H|BPF_IND: - k = X + fentry->k; + case BPF_S_LD_H_IND: + k = X + K; goto load_h; - case BPF_LD|BPF_B|BPF_IND: - k = X + fentry->k; + case BPF_S_LD_B_IND: + k = X + K; goto load_b; - case BPF_LDX|BPF_B|BPF_MSH: - ptr = load_pointer(skb, fentry->k, 1, &tmp); + case BPF_S_LDX_B_MSH: + ptr = load_pointer(skb, K, 1, &tmp); if (ptr != NULL) { X = (*(u8 *)ptr & 0xf) << 2; continue; } return 0; - case BPF_LD|BPF_IMM: - A = fentry->k; + case BPF_S_LD_IMM: + A = K; continue; - case BPF_LDX|BPF_IMM: - X = fentry->k; + case BPF_S_LDX_IMM: + X = K; continue; - case BPF_LD|BPF_MEM: - A = mem[fentry->k]; + case BPF_S_LD_MEM: + A = mem[K]; continue; - case BPF_LDX|BPF_MEM: - X = mem[fentry->k]; + case BPF_S_LDX_MEM: + X = mem[K]; continue; - case BPF_MISC|BPF_TAX: + case BPF_S_MISC_TAX: X = A; continue; - case BPF_MISC|BPF_TXA: + case BPF_S_MISC_TXA: A = X; continue; - case BPF_RET|BPF_K: - return fentry->k; - case BPF_RET|BPF_A: + case BPF_S_RET_K: + return K; + case BPF_S_RET_A: return A; - case BPF_ST: - mem[fentry->k] = A; + case BPF_S_ST: + mem[K] = A; continue; - case BPF_STX: - mem[fentry->k] = X; + case BPF_S_STX: + mem[K] = X; continue; - default: - WARN_ON(1); - return 0; - } - - /* - * Handle ancillary data, which are impossible - * (or very difficult) to get parsing packet contents. - */ - switch (k-SKF_AD_OFF) { - case SKF_AD_PROTOCOL: + case BPF_S_ANC_PROTOCOL: A = ntohs(skb->protocol); continue; - case SKF_AD_PKTTYPE: + case BPF_S_ANC_PKTTYPE: A = skb->pkt_type; continue; - case SKF_AD_IFINDEX: + case BPF_S_ANC_IFINDEX: + if (!skb->dev) + return 0; A = skb->dev->ifindex; continue; - case SKF_AD_NLATTR: { + case BPF_S_ANC_MARK: + A = skb->mark; + continue; + case BPF_S_ANC_QUEUE: + A = skb->queue_mapping; + continue; + case BPF_S_ANC_HATYPE: + if (!skb->dev) + return 0; + A = skb->dev->type; + continue; + case BPF_S_ANC_RXHASH: + A = skb->rxhash; + continue; + case BPF_S_ANC_CPU: + A = raw_smp_processor_id(); + continue; + case BPF_S_ANC_NLATTR: { struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -319,7 +389,7 @@ load_b: A = 0; continue; } - case SKF_AD_NLATTR_NEST: { + case BPF_S_ANC_NLATTR_NEST: { struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -339,6 +409,7 @@ load_b: continue; } default: + WARN_ON(1); return 0; } } @@ -347,6 +418,66 @@ load_b: } EXPORT_SYMBOL(sk_run_filter); +/* + * Security : + * A BPF program is able to use 16 cells of memory to store intermediate + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * As we dont want to clear mem[] array for each packet going through + * sk_run_filter(), we check that filter loaded by user never try to read + * a cell if not previously written, and we check all branches to be sure + * a malicious user doesnt try to abuse us. + */ +static int check_load_and_stores(struct sock_filter *filter, int flen) +{ + u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + int pc, ret = 0; + + BUILD_BUG_ON(BPF_MEMWORDS > 16); + masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + if (!masks) + return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); + + for (pc = 0; pc < flen; pc++) { + memvalid &= masks[pc]; + + switch (filter[pc].code) { + case BPF_S_ST: + case BPF_S_STX: + memvalid |= (1 << filter[pc].k); + break; + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + if (!(memvalid & (1 << filter[pc].k))) { + ret = -EINVAL; + goto error; + } + break; + case BPF_S_JMP_JA: + /* a jump must set masks on target */ + masks[pc + 1 + filter[pc].k] &= memvalid; + memvalid = ~0; + break; + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: + /* a jump must set masks on targets */ + masks[pc + 1 + filter[pc].jt] &= memvalid; + masks[pc + 1 + filter[pc].jf] &= memvalid; + memvalid = ~0; + break; + } + } +error: + kfree(masks); + return ret; +} + /** * sk_chk_filter - verify socket filter code * @filter: filter to verify @@ -363,7 +494,57 @@ EXPORT_SYMBOL(sk_run_filter); */ int sk_chk_filter(struct sock_filter *filter, int flen) { - struct sock_filter *ftest; + /* + * Valid instructions are initialized to non-0. + * Invalid instructions are initialized to 0. + */ + static const u8 codes[] = { + [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, + [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, + [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, + [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, + [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, + [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, + [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, + [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, + [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, + [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, + [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, + [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, + [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, + [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, + [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, + [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, + [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, + [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, + [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, + [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, + [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, + [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, + [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, + [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, + [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, + [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, + [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, + [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, + [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, + [BPF_RET|BPF_K] = BPF_S_RET_K, + [BPF_RET|BPF_A] = BPF_S_RET_A, + [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, + [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, + [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, + [BPF_ST] = BPF_S_ST, + [BPF_STX] = BPF_S_STX, + [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, + [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, + [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, + [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, + [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, + [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, + [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, + [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, + [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, + }; int pc; if (flen == 0 || flen > BPF_MAXINSNS) @@ -371,61 +552,31 @@ int sk_chk_filter(struct sock_filter *filter, int flen) /* check the filter code now */ for (pc = 0; pc < flen; pc++) { - ftest = &filter[pc]; - - /* Only allow valid instructions */ - switch (ftest->code) { - case BPF_ALU|BPF_ADD|BPF_K: - case BPF_ALU|BPF_ADD|BPF_X: - case BPF_ALU|BPF_SUB|BPF_K: - case BPF_ALU|BPF_SUB|BPF_X: - case BPF_ALU|BPF_MUL|BPF_K: - case BPF_ALU|BPF_MUL|BPF_X: - case BPF_ALU|BPF_DIV|BPF_X: - case BPF_ALU|BPF_AND|BPF_K: - case BPF_ALU|BPF_AND|BPF_X: - case BPF_ALU|BPF_OR|BPF_K: - case BPF_ALU|BPF_OR|BPF_X: - case BPF_ALU|BPF_LSH|BPF_K: - case BPF_ALU|BPF_LSH|BPF_X: - case BPF_ALU|BPF_RSH|BPF_K: - case BPF_ALU|BPF_RSH|BPF_X: - case BPF_ALU|BPF_NEG: - case BPF_LD|BPF_W|BPF_ABS: - case BPF_LD|BPF_H|BPF_ABS: - case BPF_LD|BPF_B|BPF_ABS: - case BPF_LD|BPF_W|BPF_LEN: - case BPF_LD|BPF_W|BPF_IND: - case BPF_LD|BPF_H|BPF_IND: - case BPF_LD|BPF_B|BPF_IND: - case BPF_LD|BPF_IMM: - case BPF_LDX|BPF_W|BPF_LEN: - case BPF_LDX|BPF_B|BPF_MSH: - case BPF_LDX|BPF_IMM: - case BPF_MISC|BPF_TAX: - case BPF_MISC|BPF_TXA: - case BPF_RET|BPF_K: - case BPF_RET|BPF_A: - break; + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + if (code >= ARRAY_SIZE(codes)) + return -EINVAL; + code = codes[code]; + if (!code) + return -EINVAL; /* Some instructions need special checks */ - - case BPF_ALU|BPF_DIV|BPF_K: + switch (code) { + case BPF_S_ALU_DIV_K: /* check for division by zero */ if (ftest->k == 0) return -EINVAL; + ftest->k = reciprocal_value(ftest->k); break; - - case BPF_LD|BPF_MEM: - case BPF_LDX|BPF_MEM: - case BPF_ST: - case BPF_STX: + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + case BPF_S_ST: + case BPF_S_STX: /* check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; - - case BPF_JMP|BPF_JA: + case BPF_S_JMP_JA: /* * Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, @@ -434,48 +585,62 @@ int sk_chk_filter(struct sock_filter *filter, int flen) if (ftest->k >= (unsigned)(flen-pc-1)) return -EINVAL; break; - - case BPF_JMP|BPF_JEQ|BPF_K: - case BPF_JMP|BPF_JEQ|BPF_X: - case BPF_JMP|BPF_JGE|BPF_K: - case BPF_JMP|BPF_JGE|BPF_X: - case BPF_JMP|BPF_JGT|BPF_K: - case BPF_JMP|BPF_JGT|BPF_X: - case BPF_JMP|BPF_JSET|BPF_K: - case BPF_JMP|BPF_JSET|BPF_X: + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: /* for conditionals both must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; - - default: - return -EINVAL; + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: +#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ + code = BPF_S_ANC_##CODE; \ + break + switch (ftest->k) { + ANCILLARY(PROTOCOL); + ANCILLARY(PKTTYPE); + ANCILLARY(IFINDEX); + ANCILLARY(NLATTR); + ANCILLARY(NLATTR_NEST); + ANCILLARY(MARK); + ANCILLARY(QUEUE); + ANCILLARY(HATYPE); + ANCILLARY(RXHASH); + ANCILLARY(CPU); + } } + ftest->code = code; } - return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL; + /* last instruction must be a RET code */ + switch (filter[flen - 1].code) { + case BPF_S_RET_K: + case BPF_S_RET_A: + return check_load_and_stores(filter, flen); + } + return -EINVAL; } EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release: Release a socket filter by rcu_head + * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -static void sk_filter_rcu_release(struct rcu_head *rcu) +void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_filter_release(fp); -} - -static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp) -{ - unsigned int size = sk_filter_len(fp); - - atomic_sub(size, &sk->sk_omem_alloc); - call_rcu_bh(&fp->rcu, sk_filter_rcu_release); + kfree(fp); } +EXPORT_SYMBOL(sk_filter_release_rcu); /** * sk_attach_filter - attach a socket filter @@ -514,28 +679,28 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return err; } - rcu_read_lock_bh(); - old_fp = rcu_dereference(sk->sk_filter); + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); - rcu_read_unlock_bh(); if (old_fp) - sk_filter_delayed_uncharge(sk, old_fp); + sk_filter_uncharge(sk, old_fp); return 0; } +EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; - rcu_read_lock_bh(); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); - sk_filter_delayed_uncharge(sk, filter); + sk_filter_uncharge(sk, filter); ret = 0; } - rcu_read_unlock_bh(); return ret; } +EXPORT_SYMBOL_GPL(sk_detach_filter); diff --git a/net/core/flow.c b/net/core/flow.c index 96015871ece..127c8a7ffd6 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -26,120 +26,161 @@ #include <linux/security.h> struct flow_cache_entry { - struct flow_cache_entry *next; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - void *object; - atomic_t *object_ref; + union { + struct hlist_node hlist; + struct list_head gc_list; + } u; + u16 family; + u8 dir; + u32 genid; + struct flowi key; + struct flow_cache_object *object; }; -atomic_t flow_cache_genid = ATOMIC_INIT(0); - -static u32 flow_hash_shift; -#define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; - -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) - -static struct kmem_cache *flow_cachep __read_mostly; - -static int flow_lwm, flow_hwm; - -struct flow_percpu_info { - int hash_rnd_recalc; - u32 hash_rnd; - int count; +struct flow_cache_percpu { + struct hlist_head *hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; }; -static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; -#define flow_hash_rnd_recalc(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) -#define flow_hash_rnd(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd) -#define flow_count(cpu) \ - (per_cpu(flow_hash_info, cpu).count) +struct flow_flush_info { + struct flow_cache *cache; + atomic_t cpuleft; + struct completion completion; +}; -static struct timer_list flow_hash_rnd_timer; +struct flow_cache { + u32 hash_shift; + struct flow_cache_percpu __percpu *percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; +}; -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) +atomic_t flow_cache_genid = ATOMIC_INIT(0); +EXPORT_SYMBOL(flow_cache_genid); +static struct flow_cache flow_cache_global; +static struct kmem_cache *flow_cachep __read_mostly; -struct flow_flush_info { - atomic_t cpuleft; - struct completion completion; -}; -static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; +static DEFINE_SPINLOCK(flow_cache_gc_lock); +static LIST_HEAD(flow_cache_gc_list); -#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) { + struct flow_cache *fc = (void *) arg; int i; for_each_possible_cpu(i) - flow_hash_rnd_recalc(i) = 1; + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); } -static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) +static int flow_entry_valid(struct flow_cache_entry *fle) +{ + if (atomic_read(&flow_cache_genid) != fle->genid) + return 0; + if (fle->object && !fle->object->ops->check(fle->object)) + return 0; + return 1; +} + +static void flow_entry_kill(struct flow_cache_entry *fle) { if (fle->object) - atomic_dec(fle->object_ref); + fle->object->ops->delete(fle->object); kmem_cache_free(flow_cachep, fle); - flow_count(cpu)--; } -static void __flow_cache_shrink(int cpu, int shrink_to) +static void flow_cache_gc_task(struct work_struct *work) { - struct flow_cache_entry *fle, **flp; - int i; + struct list_head gc_list; + struct flow_cache_entry *fce, *n; - for (i = 0; i < flow_hash_size; i++) { - int k = 0; + INIT_LIST_HEAD(&gc_list); + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail_init(&flow_cache_gc_list, &gc_list); + spin_unlock_bh(&flow_cache_gc_lock); - flp = &flow_table(cpu)[i]; - while ((fle = *flp) != NULL && k < shrink_to) { - k++; - flp = &fle->next; - } - while ((fle = *flp) != NULL) { - *flp = fle->next; - flow_entry_kill(cpu, fle); - } + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + flow_entry_kill(fce); +} +static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); + +static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, + int deleted, struct list_head *gc_list) +{ + if (deleted) { + fcp->hash_count -= deleted; + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail(gc_list, &flow_cache_gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + schedule_work(&flow_cache_gc_work); } } -static void flow_cache_shrink(int cpu) +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) { - int shrink_to = flow_lwm / flow_hash_size; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; + + for (i = 0; i < flow_cache_hash_size(fc); i++) { + int saved = 0; + + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { + if (saved < shrink_to && + flow_entry_valid(fle)) { + saved++; + } else { + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); + } + } + } - __flow_cache_shrink(cpu, shrink_to); + flow_cache_queue_garbage(fcp, deleted, &gc_list); } -static void flow_new_hash_rnd(int cpu) +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); - flow_hash_rnd_recalc(cpu) = 0; + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); + + __flow_cache_shrink(fc, fcp, shrink_to); +} - __flow_cache_shrink(cpu, 0); +static void flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); } -static u32 flow_hash_code(struct flowi *key, int cpu) +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flowi *key) { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & - (flow_hash_size - 1)); + return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1); } -#if (BITS_PER_LONG == 64) -typedef u64 flow_compare_t; -#else -typedef u32 flow_compare_t; -#endif +typedef unsigned long flow_compare_t; /* I hear what you're saying, use memcmp. But memcmp cannot make * important assumptions that we can here, such as alignment and @@ -165,114 +206,118 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver) +struct flow_cache_object * +flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver, void *ctx) { - struct flow_cache_entry *fle, **head; + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle, *tfle; + struct hlist_node *entry; + struct flow_cache_object *flo; unsigned int hash; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); + fcp = this_cpu_ptr(fc->percpu); fle = NULL; + flo = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!fcp->hash_table) goto nocache; - if (flow_hash_rnd_recalc(cpu)) - flow_new_hash_rnd(cpu); - hash = flow_hash_code(key, cpu); - - head = &flow_table(cpu)[hash]; - for (fle = *head; fle; fle = fle->next) { - if (fle->family == family && - fle->dir == dir && - flow_key_compare(key, &fle->key) == 0) { - if (fle->genid == atomic_read(&flow_cache_genid)) { - void *ret = fle->object; + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); - if (ret) - atomic_inc(fle->object_ref); - local_bh_enable(); - - return ret; - } + hash = flow_hash_code(fc, fcp, key); + hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + if (tfle->family == family && + tfle->dir == dir && + flow_key_compare(key, &tfle->key) == 0) { + fle = tfle; break; } } - if (!fle) { - if (flow_count(cpu) > flow_hwm) - flow_cache_shrink(cpu); + if (unlikely(!fle)) { + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { - fle->next = *head; - *head = fle; fle->family = family; fle->dir = dir; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; - flow_count(cpu)++; + hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); + fcp->hash_count++; } + } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + flo = fle->object; + if (!flo) + goto ret_object; + flo = flo->ops->get(flo); + if (flo) + goto ret_object; + } else if (fle->object) { + flo = fle->object; + flo->ops->delete(flo); + fle->object = NULL; } nocache: - { - int err; - void *obj; - atomic_t *obj_ref; - - err = resolver(net, key, family, dir, &obj, &obj_ref); - - if (fle && !err) { - fle->genid = atomic_read(&flow_cache_genid); - - if (fle->object) - atomic_dec(fle->object_ref); - - fle->object = obj; - fle->object_ref = obj_ref; - if (obj) - atomic_inc(fle->object_ref); - } - local_bh_enable(); - - if (err) - obj = ERR_PTR(err); - return obj; + flo = NULL; + if (fle) { + flo = fle->object; + fle->object = NULL; + } + flo = resolver(net, key, family, dir, flo, ctx); + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + if (!IS_ERR(flo)) + fle->object = flo; + else + fle->genid--; + } else { + if (flo && !IS_ERR(flo)) + flo->ops->delete(flo); } +ret_object: + local_bh_enable(); + return flo; } +EXPORT_SYMBOL(flow_cache_lookup); static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; - int i; - int cpu; - - cpu = smp_processor_id(); - for (i = 0; i < flow_hash_size; i++) { - struct flow_cache_entry *fle; - - fle = flow_table(cpu)[i]; - for (; fle; fle = fle->next) { - unsigned genid = atomic_read(&flow_cache_genid); - - if (!fle->object || fle->genid == genid) + struct flow_cache *fc = info->cache; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; + + fcp = this_cpu_ptr(fc->percpu); + for (i = 0; i < flow_cache_hash_size(fc); i++) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { + if (flow_entry_valid(fle)) continue; - fle->object = NULL; - atomic_dec(fle->object_ref); + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); } } + flow_cache_queue_garbage(fcp, deleted, &gc_list); + if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); } -static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; @@ -280,8 +325,7 @@ static void flow_cache_flush_per_cpu(void *data) struct tasklet_struct *tasklet; cpu = smp_processor_id(); - - tasklet = flow_flush_tasklet(cpu); + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; tasklet->data = (unsigned long)info; tasklet_schedule(tasklet); } @@ -294,6 +338,7 @@ void flow_cache_flush(void) /* Don't want cpus going down or up during this. */ get_online_cpus(); mutex_lock(&flow_flush_sem); + info.cache = &flow_cache_global; atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -307,62 +352,83 @@ void flow_cache_flush(void) put_online_cpus(); } -static void __init flow_cache_cpu_prepare(int cpu) +static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { - struct tasklet_struct *tasklet; - unsigned long order; - - for (order = 0; - (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_hash_size); - order++) - /* NOTHING */; - - flow_table(cpu) = (struct flow_cache_entry **) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!flow_table(cpu)) - panic("NET: failed to allocate flow cache order %lu\n", order); - - flow_hash_rnd_recalc(cpu) = 1; - flow_count(cpu) = 0; - - tasklet = flow_flush_tasklet(cpu); - tasklet_init(tasklet, flow_cache_flush_tasklet, 0); + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); + + if (!fcp->hash_table) { + fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); + if (!fcp->hash_table) { + pr_err("NET: failed to allocate flow cache sz %zu\n", sz); + return -ENOMEM; + } + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); + } + return 0; } -static int flow_cache_cpu(struct notifier_block *nfb, +static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) { - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - __flow_cache_shrink((unsigned long)hcpu, 0); + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int res, cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + res = flow_cache_cpu_prepare(fc, cpu); + if (res) + return notifier_from_errno(res); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __flow_cache_shrink(fc, fcp, 0); + break; + } return NOTIFY_OK; } -static int __init flow_cache_init(void) +static int __init flow_cache_init(struct flow_cache *fc) { int i; - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, - NULL); - flow_hash_shift = 10; - flow_lwm = 2 * flow_hash_size; - flow_hwm = 4 * flow_hash_size; + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); - setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + fc->percpu = alloc_percpu(struct flow_cache_percpu); + if (!fc->percpu) + return -ENOMEM; - for_each_possible_cpu(i) - flow_cache_cpu_prepare(i); + for_each_online_cpu(i) { + if (flow_cache_cpu_prepare(fc, i)) + return -ENOMEM; + } + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); + + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, + (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); - hotcpu_notifier(flow_cache_cpu, 0); return 0; } -module_init(flow_cache_init); +static int __init flow_cache_init_global(void) +{ + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); -EXPORT_SYMBOL(flow_cache_genid); -EXPORT_SYMBOL(flow_cache_lookup); + return flow_cache_init(&flow_cache_global); +} + +module_init(flow_cache_init_global); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 78e5bfc454a..7c2373321b7 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -32,6 +32,7 @@ #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rbtree.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/gen_stats.h> @@ -81,7 +82,7 @@ struct gen_estimator { struct list_head list; - struct gnet_stats_basic *bstats; + struct gnet_stats_basic_packed *bstats; struct gnet_stats_rate_est *rate_est; spinlock_t *stats_lock; int ewma_log; @@ -106,6 +107,7 @@ static DEFINE_RWLOCK(est_lock); /* Protects against soft lockup during large deletion */ static struct rb_root est_root = RB_ROOT; +static DEFINE_SPINLOCK(est_tree_lock); static void est_timer(unsigned long arg) { @@ -165,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est) } static -struct gen_estimator *gen_find_node(const struct gnet_stats_basic *bstats, +struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, const struct gnet_stats_rate_est *rate_est) { struct rb_node *p = est_root.rb_node; @@ -200,9 +202,8 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic *bstats, * * Returns 0 on success or a negative error code. * - * NOTE: Called under rtnl_mutex */ -int gen_new_estimator(struct gnet_stats_basic *bstats, +int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct nlattr *opt) @@ -231,6 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats, est->last_packets = bstats->packets; est->avpps = rate_est->pps<<10; + spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { INIT_LIST_HEAD(&elist[idx].list); setup_timer(&elist[idx].timer, est_timer, idx); @@ -241,6 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats, list_add_rcu(&est->list, &elist[idx].list); gen_add_node(est); + spin_unlock_bh(&est_tree_lock); return 0; } @@ -260,23 +263,25 @@ static void __gen_kill_estimator(struct rcu_head *head) * * Removes the rate estimator specified by &bstats and &rate_est. * - * NOTE: Called under rtnl_mutex + * Note : Caller should respect an RCU grace period before freeing stats_lock */ -void gen_kill_estimator(struct gnet_stats_basic *bstats, +void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est) { struct gen_estimator *e; + spin_lock_bh(&est_tree_lock); while ((e = gen_find_node(bstats, rate_est))) { rb_erase(&e->node, &est_root); - write_lock_bh(&est_lock); + write_lock(&est_lock); e->bstats = NULL; - write_unlock_bh(&est_lock); + write_unlock(&est_lock); list_del_rcu(&e->list); call_rcu(&e->e_rcu, __gen_kill_estimator); } + spin_unlock_bh(&est_tree_lock); } EXPORT_SYMBOL(gen_kill_estimator); @@ -292,7 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * * Returns 0 on success or a negative error code. */ -int gen_replace_estimator(struct gnet_stats_basic *bstats, +int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { @@ -308,11 +313,17 @@ EXPORT_SYMBOL(gen_replace_estimator); * * Returns true if estimator is active, and false if not. */ -bool gen_estimator_active(const struct gnet_stats_basic *bstats, +bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, const struct gnet_stats_rate_est *rate_est) { + bool res; + ASSERT_RTNL(); - return gen_find_node(bstats, rate_est) != NULL; + spin_lock_bh(&est_tree_lock); + res = gen_find_node(bstats, rate_est) != NULL; + spin_unlock_bh(&est_tree_lock); + + return res; } EXPORT_SYMBOL(gen_estimator_active); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index c3d0ffeac24..0452eb27a27 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -73,6 +73,7 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, return 0; } +EXPORT_SYMBOL(gnet_stats_start_copy_compat); /** * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode @@ -93,6 +94,7 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, { return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); } +EXPORT_SYMBOL(gnet_stats_start_copy); /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV @@ -106,22 +108,29 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b) +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) { if (d->compat_tc_stats) { d->tc_stats.bytes = b->bytes; d->tc_stats.packets = b->packets; } - if (d->tail) - return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b)); + if (d->tail) { + struct gnet_stats_basic sb; + memset(&sb, 0, sizeof(sb)); + sb.bytes = b->bytes; + sb.packets = b->packets; + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + } return 0; } +EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle + * @b: basic statistics * @r: rate estimator statistics * * Appends the rate estimator statistics to the top level TLV created by @@ -131,8 +140,13 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b) * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) +gnet_stats_copy_rate_est(struct gnet_dump *d, + const struct gnet_stats_basic_packed *b, + struct gnet_stats_rate_est *r) { + if (b && !gen_estimator_active(b, r)) + return 0; + if (d->compat_tc_stats) { d->tc_stats.bps = r->bps; d->tc_stats.pps = r->pps; @@ -143,6 +157,7 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) return 0; } +EXPORT_SYMBOL(gnet_stats_copy_rate_est); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV @@ -170,6 +185,7 @@ gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) return 0; } +EXPORT_SYMBOL(gnet_stats_copy_queue); /** * gnet_stats_copy_app - copy application specific statistics into statistics TLV @@ -197,6 +213,7 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) return 0; } +EXPORT_SYMBOL(gnet_stats_copy_app); /** * gnet_stats_finish_copy - finish dumping procedure @@ -230,12 +247,4 @@ gnet_stats_finish_copy(struct gnet_dump *d) spin_unlock_bh(d->lock); return 0; } - - -EXPORT_SYMBOL(gnet_stats_start_copy); -EXPORT_SYMBOL(gnet_stats_start_copy_compat); -EXPORT_SYMBOL(gnet_stats_copy_basic); -EXPORT_SYMBOL(gnet_stats_copy_rate_est); -EXPORT_SYMBOL(gnet_stats_copy_queue); -EXPORT_SYMBOL(gnet_stats_copy_app); EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/iovec.c b/net/core/iovec.c index 16ad45d4882..c40f27e7d20 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -20,7 +20,6 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/slab.h> #include <linux/net.h> #include <linux/in6.h> #include <asm/uaccess.h> @@ -38,11 +37,13 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { - int size, err, ct; + int size, ct, err; if (m->msg_namelen) { if (mode == VERIFY_READ) { - err = move_addr_to_kernel(m->msg_name, m->msg_namelen, + void __user *namep; + namep = (void __user __force *) m->msg_name; + err = move_addr_to_kernel(namep, m->msg_namelen, address); if (err < 0) return err; @@ -53,21 +54,20 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, } size = m->msg_iovlen * sizeof(struct iovec); - if (copy_from_user(iov, m->msg_iov, size)) + if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) return -EFAULT; m->msg_iov = iov; err = 0; for (ct = 0; ct < m->msg_iovlen; ct++) { - err += iov[ct].iov_len; - /* - * Goal is not to verify user data, but to prevent returning - * negative value, which is interpreted as errno. - * Overflow is still possible, but it is harmless. - */ - if (err < 0) - return -EMSGSIZE; + size_t len = iov[ct].iov_len; + + if (len > INT_MAX - err) { + len = INT_MAX - err; + iov[ct].iov_len = len; + } + err += len; } return err; @@ -96,6 +96,7 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) return 0; } +EXPORT_SYMBOL(memcpy_toiovec); /* * Copy kernel to iovec. Returns -EFAULT on error. @@ -121,6 +122,7 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, return 0; } +EXPORT_SYMBOL(memcpy_toiovecend); /* * Copy iovec to kernel. Returns -EFAULT on error. @@ -145,6 +147,7 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) return 0; } +EXPORT_SYMBOL(memcpy_fromiovec); /* * Copy iovec from kernel. Returns -EFAULT on error. @@ -173,6 +176,7 @@ int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, return 0; } +EXPORT_SYMBOL(memcpy_fromiovecend); /* * And now for the all-in-one: copy and checksum from a user iovec @@ -257,9 +261,4 @@ out_fault: err = -EFAULT; goto out; } - EXPORT_SYMBOL(csum_partial_copy_fromiovecend); -EXPORT_SYMBOL(memcpy_fromiovec); -EXPORT_SYMBOL(memcpy_fromiovecend); -EXPORT_SYMBOL(memcpy_toiovec); -EXPORT_SYMBOL(memcpy_toiovecend); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index bf8f7af699d..01a1101b593 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -19,7 +19,6 @@ #include <linux/rtnetlink.h> #include <linux/jiffies.h> #include <linux/spinlock.h> -#include <linux/slab.h> #include <linux/workqueue.h> #include <linux/bitops.h> #include <asm/types.h> @@ -35,7 +34,7 @@ static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); -static struct net_device *lweventlist; +static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) @@ -89,8 +88,10 @@ static void linkwatch_add_event(struct net_device *dev) unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); - dev->link_watch_next = lweventlist; - lweventlist = dev; + if (list_empty(&dev->link_watch_list)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + dev_hold(dev); + } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -133,9 +134,35 @@ static void linkwatch_schedule_work(int urgent) } +static void linkwatch_do_dev(struct net_device *dev) +{ + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. + */ + smp_mb__before_clear_bit(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + dev_put(dev); +} + static void __linkwatch_run_queue(int urgent_only) { - struct net_device *next; + struct net_device *dev; + LIST_HEAD(wrk); /* * Limit the number of linkwatch events to one @@ -153,46 +180,40 @@ static void __linkwatch_run_queue(int urgent_only) clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); - next = lweventlist; - lweventlist = NULL; - spin_unlock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &wrk); - while (next) { - struct net_device *dev = next; + while (!list_empty(&wrk)) { - next = dev->link_watch_next; + dev = list_first_entry(&wrk, struct net_device, link_watch_list); + list_del_init(&dev->link_watch_list); if (urgent_only && !linkwatch_urgent_event(dev)) { - linkwatch_add_event(dev); + list_add_tail(&dev->link_watch_list, &lweventlist); continue; } - - /* - * Make sure the above read is complete since it can be - * rewritten as soon as we clear the bit below. - */ - smp_mb__before_clear_bit(); - - /* We are about to handle this device, - * so new events can be accepted - */ - clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); - - rfc2863_policy(dev); - if (dev->flags & IFF_UP) { - if (netif_carrier_ok(dev)) - dev_activate(dev); - else - dev_deactivate(dev); - - netdev_state_change(dev); - } - - dev_put(dev); + spin_unlock_irq(&lweventlist_lock); + linkwatch_do_dev(dev); + spin_lock_irq(&lweventlist_lock); } - if (lweventlist) + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); + spin_unlock_irq(&lweventlist_lock); +} + +void linkwatch_forget_dev(struct net_device *dev) +{ + unsigned long flags; + int clean = 0; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (!list_empty(&dev->link_watch_list)) { + list_del_init(&dev->link_watch_list); + clean = 1; + } + spin_unlock_irqrestore(&lweventlist_lock, flags); + if (clean) + linkwatch_do_dev(dev); } @@ -216,13 +237,10 @@ void linkwatch_fire_event(struct net_device *dev) bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { - dev_hold(dev); - linkwatch_add_event(dev); } else if (!urgent) return; linkwatch_schedule_work(urgent); } - EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 163b4f5b036..60a90291342 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -15,6 +15,7 @@ * Harald Welte Add neighbour cache statistics like rtstat */ +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> @@ -40,7 +41,6 @@ #define NEIGH_PRINTK(x...) printk(x) #define NEIGH_NOPRINTK(x...) do { ; } while(0) -#define NEIGH_PRINTK0 NEIGH_PRINTK #define NEIGH_PRINTK1 NEIGH_NOPRINTK #define NEIGH_PRINTK2 NEIGH_NOPRINTK @@ -121,7 +121,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return (base ? (net_random() % base) + (base >> 1) : 0); + return base ? (net_random() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -130,15 +130,20 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n; + struct neighbour __rcu **np; - np = &tbl->hash_buckets[i]; - while ((n = *np) != NULL) { + np = &nht->hash_buckets[i]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. * - it is not permanent @@ -146,7 +151,9 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if (atomic_read(&n->refcnt) == 1 && !(n->nud_state & NUD_PERMANENT)) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; shrunk = 1; write_unlock(&n->lock); @@ -198,16 +205,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list) static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) { int i; + struct neigh_hash_table *nht; - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np = &tbl->hash_buckets[i]; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - while ((n = *np) != NULL) { + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { if (dev && n->dev != dev) { np = &n->next; continue; } - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); n->dead = 1; @@ -278,6 +293,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; @@ -296,64 +312,86 @@ out_entries: goto out; } -static struct neighbour **neigh_hash_alloc(unsigned int entries) +static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) { - unsigned long size = entries * sizeof(struct neighbour *); - struct neighbour **ret; + size_t size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour **buckets; - if (size <= PAGE_SIZE) { - ret = kzalloc(size, GFP_ATOMIC); - } else { - ret = (struct neighbour **) - __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; } + rcu_assign_pointer(ret->hash_buckets, buckets); + ret->hash_mask = entries - 1; + get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; } -static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +static void neigh_hash_free_rcu(struct rcu_head *head) { - unsigned long size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); + struct neighbour **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) - kfree(hash); + kfree(buckets); else - free_pages((unsigned long)hash, get_order(size)); + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); } -static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_entries) { - struct neighbour **new_hash, **old_hash; - unsigned int i, new_hash_mask, old_entries; + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); BUG_ON(!is_power_of_2(new_entries)); - new_hash = neigh_hash_alloc(new_entries); - if (!new_hash) - return; + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_entries); + if (!new_nht) + return old_nht; - old_entries = tbl->hash_mask + 1; - new_hash_mask = new_entries - 1; - old_hash = tbl->hash_buckets; - - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - for (i = 0; i < old_entries; i++) { + for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_hash[i]; n; n = next) { - unsigned int hash_val = tbl->hash(n->primary_key, n->dev); - - hash_val &= new_hash_mask; - next = n->next; - - n->next = new_hash[hash_val]; - new_hash[hash_val] = n; + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); + + hash &= new_nht->hash_mask; + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } - tbl->hash_buckets = new_hash; - tbl->hash_mask = new_hash_mask; - neigh_hash_free(old_hash, old_entries); + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, @@ -362,19 +400,26 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, dev); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup); @@ -385,20 +430,27 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, NULL); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup_nodev); @@ -410,6 +462,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); @@ -436,18 +489,24 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (n->parms->base_reachable_time << 1); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) - neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + if (atomic_read(&tbl->entries) > (nht->hash_mask + 1)) + nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1); - hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } - for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -455,10 +514,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = tbl->hash_buckets[hash_val]; - tbl->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); NEIGH_PRINTK2("neigh %p is created.\n", n); rc = n; @@ -615,6 +676,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms) neigh_parms_destroy(parms); } +static void neigh_destroy_rcu(struct rcu_head *head) +{ + struct neighbour *neigh = container_of(head, struct neighbour, rcu); + + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} /* * neighbour must already be out of the table; * @@ -642,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh) write_seqlock_bh(&hh->hh_lock); hh->hh_output = neigh_blackhole; write_sequnlock_bh(&hh->hh_lock); - if (atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + hh_cache_put(hh); } skb_queue_purge(&neigh->arp_queue); @@ -654,7 +720,7 @@ void neigh_destroy(struct neighbour *neigh) NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); atomic_dec(&neigh->tbl->entries); - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); + call_rcu(&neigh->rcu, neigh_destroy_rcu); } EXPORT_SYMBOL(neigh_destroy); @@ -692,86 +758,92 @@ static void neigh_connect(struct neighbour *neigh) hh->hh_output = neigh->ops->hh_output; } -static void neigh_periodic_timer(unsigned long arg) +static void neigh_periodic_work(struct work_struct *work) { - struct neigh_table *tbl = (struct neigh_table *)arg; - struct neighbour *n, **np; - unsigned long expire, now = jiffies; + struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neighbour *n; + struct neighbour __rcu **np; + unsigned int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock(&tbl->lock); + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ - if (time_after(now, tbl->last_rand + 300 * HZ)) { + if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; - tbl->last_rand = now; + tbl->last_rand = jiffies; for (p = &tbl->parms; p; p = p->next) p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); } - np = &tbl->hash_buckets[tbl->hash_chain_gc]; - tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask); + for (i = 0 ; i <= nht->hash_mask; i++) { + np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { - unsigned int state; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + unsigned int state; - write_lock(&n->lock); + write_lock(&n->lock); - state = n->nud_state; - if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { - write_unlock(&n->lock); - goto next_elt; - } + state = n->nud_state; + if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + write_unlock(&n->lock); + goto next_elt; + } - if (time_before(n->used, n->confirmed)) - n->used = n->confirmed; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; - if (atomic_read(&n->refcnt) == 1 && - (state == NUD_FAILED || - time_after(now, n->used + n->parms->gc_staletime))) { - *np = n->next; - n->dead = 1; + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(jiffies, n->used + n->parms->gc_staletime))) { + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } write_unlock(&n->lock); - neigh_cleanup_and_release(n); - continue; - } - write_unlock(&n->lock); next_elt: - np = &n->next; + np = &n->next; + } + /* + * It's fine to release lock here, even if hash table + * grows while we are preempted. + */ + write_unlock_bh(&tbl->lock); + cond_resched(); + write_lock_bh(&tbl->lock); } - /* Cycle through all hash buckets every base_reachable_time/2 ticks. * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 * base_reachable_time. */ - expire = tbl->parms.base_reachable_time >> 1; - expire /= (tbl->hash_mask + 1); - if (!expire) - expire = 1; - - if (expire>HZ) - mod_timer(&tbl->gc_timer, round_jiffies(now + expire)); - else - mod_timer(&tbl->gc_timer, now + expire); - - write_unlock(&tbl->lock); + schedule_delayed_work(&tbl->gc_work, + tbl->parms.base_reachable_time >> 1); + write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE ? + return (n->nud_state & NUD_PROBE) ? p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes); + p->ucast_probes + p->app_probes + p->mcast_probes; } static void neigh_invalidate(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) { struct sk_buff *skb; @@ -932,6 +1004,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } + skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); } rc = 1; @@ -942,11 +1015,14 @@ out_unlock_bh: } EXPORT_SYMBOL(__neigh_event_send); -static void neigh_update_hhs(struct neighbour *neigh) +static void neigh_update_hhs(const struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) - = neigh->dev->header_ops->cache_update; + = NULL; + + if (neigh->dev->header_ops) + update = neigh->dev->header_ops->cache_update; if (update) { for (hh = neigh->hh; hh; hh = hh->hh_next) { @@ -1075,7 +1151,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - @@ -1133,44 +1211,73 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, } EXPORT_SYMBOL(neigh_event_ns); +static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, + __be16 protocol) +{ + struct hh_cache *hh; + + smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */ + for (hh = n->hh; hh; hh = hh->hh_next) { + if (hh->hh_type == protocol) { + atomic_inc(&hh->hh_refcnt); + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); + return true; + } + } + return false; +} + +/* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, __be16 protocol) { struct hh_cache *hh; struct net_device *dev = dst->dev; - for (hh = n->hh; hh; hh = hh->hh_next) - if (hh->hh_type == protocol) - break; + if (likely(neigh_hh_lookup(n, dst, protocol))) + return; - if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { - seqlock_init(&hh->hh_lock); - hh->hh_type = protocol; - atomic_set(&hh->hh_refcnt, 0); - hh->hh_next = NULL; + /* slow path */ + hh = kzalloc(sizeof(*hh), GFP_ATOMIC); + if (!hh) + return; - if (dev->header_ops->cache(n, hh)) { - kfree(hh); - hh = NULL; - } else { - atomic_inc(&hh->hh_refcnt); - hh->hh_next = n->hh; - n->hh = hh; - if (n->nud_state & NUD_CONNECTED) - hh->hh_output = n->ops->hh_output; - else - hh->hh_output = n->ops->output; - } + seqlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 2); + + if (dev->header_ops->cache(n, hh)) { + kfree(hh); + return; } - if (hh) { - atomic_inc(&hh->hh_refcnt); - dst->hh = hh; + + write_lock_bh(&n->lock); + + /* must check if another thread already did the insert */ + if (neigh_hh_lookup(n, dst, protocol)) { + kfree(hh); + goto end; } + + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + + hh->hh_next = n->hh; + smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */ + n->hh = hh; + + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); +end: + write_unlock_bh(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit - worked, f.e. if you want to override normal output path (eql, shaper), - but resolution is not made yet. + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. */ int neigh_compat_output(struct sk_buff *skb) @@ -1204,19 +1311,19 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; - if (dev->header_ops->cache && !dst->hh) { - write_lock_bh(&neigh->lock); - if (!dst->hh) - neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - write_unlock_bh(&neigh->lock); - } else { - read_lock_bh(&neigh->lock); + unsigned int seq; + + if (dev->header_ops->cache && + !dst->hh && + !(dst->flags & DST_NOCACHE)) + neigh_hh_init(neigh, dst, dst->ops->protocol); + + do { + seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); - } + } while (read_seqretry(&neigh->ha_lock, seq)); + if (err >= 0) rc = neigh->ops->queue_xmit(skb); else @@ -1242,13 +1349,16 @@ int neigh_connected_output(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; + unsigned int seq; __skb_pull(skb, skb_network_offset(skb)); - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + if (err >= 0) err = neigh->ops->queue_xmit(skb); else { @@ -1316,7 +1426,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, } EXPORT_SYMBOL(pneigh_enqueue); -static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl, +static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; @@ -1337,7 +1447,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; - ref = lookup_neigh_params(tbl, net, 0); + ref = lookup_neigh_parms(tbl, net, 0); if (!ref) return NULL; @@ -1430,22 +1540,17 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour proc dir entry"); #endif - tbl->hash_mask = 1; - tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + tbl->nht = neigh_hash_alloc(8); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); - if (!tbl->hash_buckets || !tbl->phash_buckets) + if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - rwlock_init(&tbl->lock); - setup_timer(&tbl->gc_timer, neigh_periodic_timer, (unsigned long)tbl); - tbl->gc_timer.expires = now + 1; - add_timer(&tbl->gc_timer); - + INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); + schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -1482,7 +1587,7 @@ int neigh_table_clear(struct neigh_table *tbl) struct neigh_table **tp; /* It is not clean... Fix it to unload IPv6 module safely */ - del_timer_sync(&tbl->gc_timer); + cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); neigh_ifdown(tbl, NULL); @@ -1497,8 +1602,8 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); - tbl->hash_buckets = NULL; + call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); + tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; @@ -1524,6 +1629,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err = -EINVAL; + ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; @@ -1533,7 +1639,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; @@ -1549,34 +1655,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(dst_attr) < tbl->key_len) - goto out_dev_put; + goto out; if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; - goto out_dev_put; + goto out; } err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; -out_dev_put: - if (dev) - dev_put(dev); out: return err; } @@ -1590,6 +1693,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err; + ASSERT_RTNL(); err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) goto out; @@ -1600,14 +1704,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) - goto out_dev_put; + goto out; } read_lock(&neigh_tbl_lock); @@ -1621,7 +1725,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out_dev_put; + goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; @@ -1634,29 +1738,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) pn->flags = ndm->ndm_flags; err = 0; } - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; - goto out_dev_put; + goto out; } neigh = __neigh_lookup_errno(tbl, dst, dev); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); - goto out_dev_put; + goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); - goto out_dev_put; + goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) @@ -1669,15 +1773,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } else err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; - -out_dev_put: - if (dev) - dev_put(dev); out: return err; } @@ -1743,19 +1843,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, unsigned long now = jiffies; unsigned int flush_delta = now - tbl->last_flush; unsigned int rand_delta = now - tbl->last_rand; - + struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_hash_rnd = tbl->hash_rnd, - .ndtc_hash_mask = tbl->hash_mask, - .ndtc_hash_chain_gc = tbl->hash_chain_gc, .ndtc_proxy_qlen = tbl->proxy_queue.qlen, }; + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd; + ndc.ndtc_hash_mask = nht->hash_mask; + rcu_read_unlock_bh(); + NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); } @@ -1906,7 +2009,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); - p = lookup_neigh_params(tbl, net, ifindex); + p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; @@ -2052,10 +2155,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; - if ((neigh->nud_state & NUD_VALID) && - nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) { - read_unlock_bh(&neigh->lock); - goto nla_put_failure; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); @@ -2083,19 +2190,24 @@ static void neigh_update_notify(struct neighbour *neigh) static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { - struct net * net = sock_net(skb->sk); + struct net *net = sock_net(skb->sk); struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (h = 0; h <= tbl->hash_mask; h++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) goto next; @@ -2103,17 +2215,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) <= 0) { - read_unlock_bh(&tbl->lock); rc = -1; goto out; } - next: +next: idx++; } } - read_unlock_bh(&tbl->lock); rc = skb->len; out: + rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; return rc; @@ -2146,15 +2257,22 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (chain = 0; chain <= tbl->hash_mask; chain++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); /* avoid resizes */ + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = tbl->hash_buckets[chain]; n; n = n->next) + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_for_each); @@ -2163,18 +2281,25 @@ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { int chain; + struct neigh_hash_table *nht; - for (chain = 0; chain <= tbl->hash_mask; chain++) { - struct neighbour *n, **np; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain <= nht->hash_mask; chain++) { + struct neighbour *n; + struct neighbour __rcu **np; - np = &tbl->hash_buckets[chain]; - while ((n = *np) != NULL) { + np = &nht->hash_buckets[chain]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { int release; write_lock(&n->lock); release = cb(n); if (release) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; } else np = &n->next; @@ -2192,13 +2317,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; int bucket = state->bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { - n = tbl->hash_buckets[bucket]; + for (bucket = 0; bucket <= nht->hash_mask; bucket++) { + n = rcu_dereference_bh(nht->hash_buckets[bucket]); while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2215,8 +2340,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) break; if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2233,14 +2358,14 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } - n = n->next; + n = rcu_dereference_bh(n->next); while (1) { while (n) { @@ -2257,17 +2382,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) break; - if (++state->bucket > tbl->hash_mask) + if (++state->bucket > nht->hash_mask) break; - n = tbl->hash_buckets[state->bucket]; + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } if (n && pos) @@ -2365,7 +2490,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) - __acquires(tbl->lock) + __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2373,7 +2498,8 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl state->bucket = 0; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - read_lock_bh(&tbl->lock); + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -2407,12 +2533,9 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) - __releases(tbl->lock) + __releases(rcu_bh) { - struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; - - read_unlock_bh(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -2420,8 +2543,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; int cpu; if (*pos == 0) @@ -2438,8 +2560,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -2458,8 +2579,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct proc_dir_entry *pde = seq->private; - struct neigh_table *tbl = pde->data; + struct neigh_table *tbl = seq->private; struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { @@ -2504,7 +2624,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - sf->private = PDE(inode); + sf->private = PDE(inode)->data; } return ret; }; @@ -2562,28 +2682,27 @@ EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL +#define NEIGH_VARS_MAX 19 + static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[__NET_NEIGH_MAX]; + struct ctl_table neigh_vars[NEIGH_VARS_MAX]; char *dev_name; } neigh_sysctl_template __read_mostly = { .neigh_vars = { { - .ctl_name = NET_NEIGH_MCAST_SOLICIT, .procname = "mcast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_UCAST_SOLICIT, .procname = "ucast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_APP_SOLICIT, .procname = "app_solicit", .maxlen = sizeof(int), .mode = 0644, @@ -2596,38 +2715,30 @@ static struct neigh_sysctl_table { .proc_handler = proc_dointvec_userhz_jiffies, }, { - .ctl_name = NET_NEIGH_REACHABLE_TIME, .procname = "base_reachable_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_DELAY_PROBE_TIME, .procname = "delay_first_probe_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_GC_STALE_TIME, .procname = "gc_stale_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_UNRES_QLEN, .procname = "unres_qlen", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_PROXY_QLEN, .procname = "proxy_qlen", .maxlen = sizeof(int), .mode = 0644, @@ -2652,45 +2763,36 @@ static struct neigh_sysctl_table { .proc_handler = proc_dointvec_userhz_jiffies, }, { - .ctl_name = NET_NEIGH_RETRANS_TIME_MS, .procname = "retrans_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies, }, { - .ctl_name = NET_NEIGH_REACHABLE_TIME_MS, .procname = "base_reachable_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies, }, { - .ctl_name = NET_NEIGH_GC_INTERVAL, .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_NEIGH_GC_THRESH1, .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_GC_THRESH2, .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NEIGH_GC_THRESH3, .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, @@ -2701,8 +2803,7 @@ static struct neigh_sysctl_table { }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, - int p_id, int pdev_id, char *p_name, - proc_handler *handler, ctl_handler *strategy) + char *p_name, proc_handler *handler) { struct neigh_sysctl_table *t; const char *dev_name_source = NULL; @@ -2713,10 +2814,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, #define NEIGH_CTL_PATH_DEV 3 struct ctl_path neigh_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "proto", .ctl_name = 0, }, - { .procname = "neigh", .ctl_name = 0, }, - { .procname = "default", .ctl_name = NET_PROTO_CONF_DEFAULT, }, + { .procname = "net", }, + { .procname = "proto", }, + { .procname = "neigh", }, + { .procname = "default", }, { }, }; @@ -2741,7 +2842,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; - neigh_path[NEIGH_CTL_PATH_DEV].ctl_name = dev->ifindex; /* Terminate the table early */ memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); } else { @@ -2753,31 +2853,19 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, } - if (handler || strategy) { + if (handler) { /* RetransTime */ t->neigh_vars[3].proc_handler = handler; - t->neigh_vars[3].strategy = strategy; t->neigh_vars[3].extra1 = dev; - if (!strategy) - t->neigh_vars[3].ctl_name = CTL_UNNUMBERED; /* ReachableTime */ t->neigh_vars[4].proc_handler = handler; - t->neigh_vars[4].strategy = strategy; t->neigh_vars[4].extra1 = dev; - if (!strategy) - t->neigh_vars[4].ctl_name = CTL_UNNUMBERED; /* RetransTime (in milliseconds)*/ t->neigh_vars[12].proc_handler = handler; - t->neigh_vars[12].strategy = strategy; t->neigh_vars[12].extra1 = dev; - if (!strategy) - t->neigh_vars[12].ctl_name = CTL_UNNUMBERED; /* ReachableTime (in milliseconds) */ t->neigh_vars[13].proc_handler = handler; - t->neigh_vars[13].strategy = strategy; t->neigh_vars[13].extra1 = dev; - if (!strategy) - t->neigh_vars[13].ctl_name = CTL_UNNUMBERED; } t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); @@ -2785,9 +2873,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, goto free; neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name; - neigh_path[NEIGH_CTL_PATH_NEIGH].ctl_name = pdev_id; neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name; - neigh_path[NEIGH_CTL_PATH_PROTO].ctl_name = p_id; t->sysctl_header = register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 3994680c08b..e23c01be5a5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -13,10 +13,14 @@ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_arp.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> #include <net/sock.h> +#include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/wireless.h> -#include <net/iw_handler.h> +#include <linux/vmalloc.h> +#include <net/wext.h> #include "net-sysfs.h" @@ -25,6 +29,7 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; +static const char fmt_u64[] = "%llu\n"; static inline int dev_isalive(const struct net_device *dev) { @@ -90,6 +95,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW(dev_id, fmt_hex); +NETDEVICE_SHOW(addr_assign_type, fmt_dec); NETDEVICE_SHOW(addr_len, fmt_dec); NETDEVICE_SHOW(iflink, fmt_dec); NETDEVICE_SHOW(ifindex, fmt_dec); @@ -130,6 +136,48 @@ static ssize_t show_carrier(struct device *dev, return -EINVAL; } +static ssize_t show_speed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && + netdev->ethtool_ops && + netdev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + + if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + } + rtnl_unlock(); + return ret; +} + +static ssize_t show_duplex(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && + netdev->ethtool_ops && + netdev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + + if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half"); + } + rtnl_unlock(); + return ret; +} + static ssize_t show_dormant(struct device *dev, struct device_attribute *attr, char *buf) { @@ -141,7 +189,7 @@ static ssize_t show_dormant(struct device *dev, return -EINVAL; } -static const char *operstates[] = { +static const char *const operstates[] = { "unknown", "notpresent", /* currently unused */ "down", @@ -248,6 +296,7 @@ static ssize_t show_ifalias(struct device *dev, } static struct device_attribute net_class_attributes[] = { + __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), __ATTR(dev_id, S_IRUGO, show_dev_id, NULL), __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), @@ -259,6 +308,8 @@ static struct device_attribute net_class_attributes[] = { __ATTR(address, S_IRUGO, show_address, NULL), __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(speed, S_IRUGO, show_speed, NULL), + __ATTR(duplex, S_IRUGO, show_duplex, NULL), __ATTR(dormant, S_IRUGO, show_dormant, NULL), __ATTR(operstate, S_IRUGO, show_operstate, NULL), __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), @@ -276,14 +327,15 @@ static ssize_t netstat_show(const struct device *d, struct net_device *dev = to_net_dev(d); ssize_t ret = -EINVAL; - WARN_ON(offset > sizeof(struct net_device_stats) || - offset % sizeof(unsigned long) != 0); + WARN_ON(offset > sizeof(struct rtnl_link_stats64) || + offset % sizeof(u64) != 0); read_lock(&dev_base_lock); if (dev_isalive(dev)) { - const struct net_device_stats *stats = dev_get_stats(dev); - ret = sprintf(buf, fmt_ulong, - *(unsigned long *)(((u8 *) stats) + offset)); + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset)); } read_unlock(&dev_base_lock); return ret; @@ -295,7 +347,7 @@ static ssize_t show_##name(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ - offsetof(struct net_device_stats, name)); \ + offsetof(struct rtnl_link_stats64, name)); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) @@ -363,18 +415,17 @@ static ssize_t wireless_show(struct device *d, char *buf, char *)) { struct net_device *dev = to_net_dev(d); - const struct iw_statistics *iw = NULL; + const struct iw_statistics *iw; ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + if (!rtnl_trylock()) + return restart_syscall(); if (dev_isalive(dev)) { - if (dev->wireless_handlers && - dev->wireless_handlers->get_wireless_stats) - iw = dev->wireless_handlers->get_wireless_stats(dev); - if (iw != NULL) + iw = get_wireless_stats(dev); + if (iw) ret = (*format)(iw, buf); } - read_unlock(&dev_base_lock); + rtnl_unlock(); return ret; } @@ -422,18 +473,762 @@ static struct attribute_group wireless_group = { .attrs = wireless_attrs, }; #endif - #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_RPS +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_rx_queue_attr(_attr) container_of(_attr, \ + struct rx_queue_attribute, attr) + +#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) + +static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops rx_queue_sysfs_ops = { + .show = rx_queue_attr_show, + .store = rx_queue_attr_store, +}; + +static ssize_t show_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + struct rps_map *map; + cpumask_var_t mask; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map) + for (i = 0; i < map->len; i++) + cpumask_set_cpu(map->cpus[i], mask); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + rcu_read_unlock(); + free_cpumask_var(mask); + return -EINVAL; + } + rcu_read_unlock(); + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static void rps_map_release(struct rcu_head *rcu) +{ + struct rps_map *map = container_of(rcu, struct rps_map, rcu); + + kfree(map); +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + cpumask_var_t mask; + int err, cpu, i; + static DEFINE_SPINLOCK(rps_map_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + map = kzalloc(max_t(unsigned, + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) { + free_cpumask_var(mask); + return -ENOMEM; + } + + i = 0; + for_each_cpu_and(cpu, mask, cpu_online_mask) + map->cpus[i++] = cpu; + + if (i) + map->len = i; + else { + kfree(map); + map = NULL; + } + + spin_lock(&rps_map_lock); + old_map = rcu_dereference_protected(queue->rps_map, + lockdep_is_held(&rps_map_lock)); + rcu_assign_pointer(queue->rps_map, map); + spin_unlock(&rps_map_lock); + + if (old_map) + call_rcu(&old_map->rcu, rps_map_release); + + free_cpumask_var(mask); + return len; +} + +static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + char *buf) +{ + struct rps_dev_flow_table *flow_table; + unsigned int val = 0; + + rcu_read_lock(); + flow_table = rcu_dereference(queue->rps_flow_table); + if (flow_table) + val = flow_table->mask + 1; + rcu_read_unlock(); + + return sprintf(buf, "%u\n", val); +} + +static void rps_dev_flow_table_release_work(struct work_struct *work) +{ + struct rps_dev_flow_table *table = container_of(work, + struct rps_dev_flow_table, free_work); + + vfree(table); +} + +static void rps_dev_flow_table_release(struct rcu_head *rcu) +{ + struct rps_dev_flow_table *table = container_of(rcu, + struct rps_dev_flow_table, rcu); + + INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); + schedule_work(&table->free_work); +} + +static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + const char *buf, size_t len) +{ + unsigned int count; + char *endp; + struct rps_dev_flow_table *table, *old_table; + static DEFINE_SPINLOCK(rps_dev_flow_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + count = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + if (count) { + int i; + + if (count > 1<<30) { + /* Enforce a limit to prevent overflow */ + return -EINVAL; + } + count = roundup_pow_of_two(count); + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); + if (!table) + return -ENOMEM; + + table->mask = count - 1; + for (i = 0; i < count; i++) + table->flows[i].cpu = RPS_NO_CPU; + } else + table = NULL; + + spin_lock(&rps_dev_flow_lock); + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); + rcu_assign_pointer(queue->rps_flow_table, table); + spin_unlock(&rps_dev_flow_lock); + + if (old_table) + call_rcu(&old_table->rcu, rps_dev_flow_table_release); + + return len; +} + +static struct rx_queue_attribute rps_cpus_attribute = + __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + + +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = + __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); + +static struct attribute *rx_queue_default_attrs[] = { + &rps_cpus_attribute.attr, + &rps_dev_flow_table_cnt_attribute.attr, + NULL +}; + +static void rx_queue_release(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + + + map = rcu_dereference_raw(queue->rps_map); + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); + call_rcu(&map->rcu, rps_map_release); + } + + flow_table = rcu_dereference_raw(queue->rps_flow_table); + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type rx_queue_ktype = { + .sysfs_ops = &rx_queue_sysfs_ops, + .release = rx_queue_release, + .default_attrs = rx_queue_default_attrs, +}; + +static int rx_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_rx_queue *queue = net->_rx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, + "rx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; +} +#endif /* CONFIG_RPS */ + +int +net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ +#ifdef CONFIG_RPS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = rx_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_rx[i].kobj); + + return error; +#else + return 0; +#endif +} + +#ifdef CONFIG_XPS +/* + * netdev_queue sysfs structures and functions. + */ +struct netdev_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_netdev_queue_attr(_attr) container_of(_attr, \ + struct netdev_queue_attribute, attr) + +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) + +static ssize_t netdev_queue_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t netdev_queue_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops netdev_queue_sysfs_ops = { + .show = netdev_queue_attr_show, + .store = netdev_queue_attr_store, +}; + +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + + +static ssize_t show_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + cpumask_var_t mask; + unsigned long index; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + for_each_possible_cpu(i) { + struct xps_map *map = + rcu_dereference(dev_maps->cpu_map[i]); + if (map) { + int j; + for (j = 0; j < map->len; j++) { + if (map->queues[j] == index) { + cpumask_set_cpu(i, mask); + break; + } + } + } + } + } + rcu_read_unlock(); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + free_cpumask_var(mask); + return -EINVAL; + } + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static void xps_map_release(struct rcu_head *rcu) +{ + struct xps_map *map = container_of(rcu, struct xps_map, rcu); + + kfree(map); +} + +static void xps_dev_maps_release(struct rcu_head *rcu) +{ + struct xps_dev_maps *dev_maps = + container_of(rcu, struct xps_dev_maps, rcu); + + kfree(dev_maps); +} + +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static ssize_t store_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + cpumask_var_t mask; + int err, i, cpu, pos, map_len, alloc_len, need_set; + unsigned long index; + struct xps_map *map, *new_map; + struct xps_dev_maps *dev_maps, *new_dev_maps; + int nonempty = 0; + int numa_node = -2; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + new_dev_maps = kzalloc(max_t(unsigned, + XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); + if (!new_dev_maps) { + free_cpumask_var(mask); + return -ENOMEM; + } + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + new_map = map; + if (map) { + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + map_len = map->len; + alloc_len = map->alloc_len; + } else + pos = map_len = alloc_len = 0; + + need_set = cpu_isset(cpu, *mask) && cpu_online(cpu); +#ifdef CONFIG_NUMA + if (need_set) { + if (numa_node == -2) + numa_node = cpu_to_node(cpu); + else if (numa_node != cpu_to_node(cpu)) + numa_node = -1; + } +#endif + if (need_set && pos >= map_len) { + /* Need to add queue to this CPU's map */ + if (map_len >= alloc_len) { + alloc_len = alloc_len ? + 2 * alloc_len : XPS_MIN_MAP_ALLOC; + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + goto error; + new_map->alloc_len = alloc_len; + for (i = 0; i < map_len; i++) + new_map->queues[i] = map->queues[i]; + new_map->len = map_len; + } + new_map->queues[new_map->len++] = index; + } else if (!need_set && pos < map_len) { + /* Need to remove queue from this CPU's map */ + if (map_len > 1) + new_map->queues[pos] = + new_map->queues[--new_map->len]; + else + new_map = NULL; + } + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); + } + + /* Cleanup old maps */ + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) + call_rcu(&map->rcu, xps_map_release); + if (new_dev_maps->cpu_map[cpu]) + nonempty = 1; + } + + if (nonempty) + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + else { + kfree(new_dev_maps); + rcu_assign_pointer(dev->xps_maps, NULL); + } + + if (dev_maps) + call_rcu(&dev_maps->rcu, xps_dev_maps_release); + + netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : + NUMA_NO_NODE); + + mutex_unlock(&xps_map_mutex); + + free_cpumask_var(mask); + return len; + +error: + mutex_unlock(&xps_map_mutex); + + if (new_dev_maps) + for_each_possible_cpu(i) + kfree(rcu_dereference_protected( + new_dev_maps->cpu_map[i], + 1)); + kfree(new_dev_maps); + free_cpumask_var(mask); + return -ENOMEM; +} + +static struct netdev_queue_attribute xps_cpus_attribute = + __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); + +static struct attribute *netdev_queue_default_attrs[] = { + &xps_cpus_attribute.attr, + NULL +}; + +static void netdev_queue_release(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (dev_maps) { + for_each_possible_cpu(i) { + map = xmap_dereference(dev_maps->cpu_map[i]); + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + call_rcu(&map->rcu, xps_map_release); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + call_rcu(&dev_maps->rcu, xps_dev_maps_release); + } + } + + mutex_unlock(&xps_map_mutex); + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type netdev_queue_ktype = { + .sysfs_ops = &netdev_queue_sysfs_ops, + .release = netdev_queue_release, + .default_attrs = netdev_queue_default_attrs, +}; + +static int netdev_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_queue *queue = net->_tx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; +} +#endif /* CONFIG_XPS */ + +int +netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ +#ifdef CONFIG_XPS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = netdev_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_tx[i].kobj); + + return error; +#else + return 0; +#endif +} + +static int register_queue_kobjects(struct net_device *net) +{ + int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; + +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; +#endif + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + error = net_rx_queue_update_kobjects(net, 0, real_rx); + if (error) + goto error; + rxq = real_rx; + + error = netdev_queue_update_kobjects(net, 0, real_tx); + if (error) + goto error; + txq = real_tx; + + return 0; + +error: + netdev_queue_update_kobjects(net, txq, 0); + net_rx_queue_update_kobjects(net, rxq, 0); + return error; +} + +static void remove_queue_kobjects(struct net_device *net) +{ + int real_rx = 0, real_tx = 0; + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + net_rx_queue_update_kobjects(net, real_rx, 0); + netdev_queue_update_kobjects(net, real_tx, 0); +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) + kset_unregister(net->queues_kset); +#endif +} + +static const void *net_current_ns(void) +{ + return current->nsproxy->net_ns; +} + +static const void *net_initial_ns(void) +{ + return &init_net; +} + +static const void *net_netlink_ns(struct sock *sk) +{ + return sock_net(sk); +} + +struct kobj_ns_type_operations net_ns_type_operations = { + .type = KOBJ_NS_TYPE_NET, + .current_ns = net_current_ns, + .netlink_ns = net_netlink_ns, + .initial_ns = net_initial_ns, +}; +EXPORT_SYMBOL_GPL(net_ns_type_operations); + +static void net_kobj_ns_exit(struct net *net) +{ + kobj_ns_exit(KOBJ_NS_TYPE_NET, net); +} + +static struct pernet_operations kobj_net_ops = { + .exit = net_kobj_ns_exit, +}; + + #ifdef CONFIG_HOTPLUG static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) { struct net_device *dev = to_net_dev(d); int retval; - if (!net_eq(dev_net(dev), &init_net)) - return 0; - /* pass interface to uevent. */ retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) @@ -463,6 +1258,13 @@ static void netdev_release(struct device *d) kfree((char *)dev - dev->padded); } +static const void *net_namespace(struct device *d) +{ + struct net_device *dev; + dev = container_of(d, struct net_device, dev); + return dev_net(dev); +} + static struct class net_class = { .name = "net", .dev_release = netdev_release, @@ -472,6 +1274,8 @@ static struct class net_class = { #ifdef CONFIG_HOTPLUG .dev_uevent = netdev_uevent, #endif + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, }; /* Delete sysfs entries but hold kobject reference until after all @@ -483,8 +1287,7 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); - if (dev_net(net) != &init_net) - return; + remove_queue_kobjects(net); device_del(dev); } @@ -493,8 +1296,10 @@ void netdev_unregister_kobject(struct net_device * net) int netdev_register_kobject(struct net_device *net) { struct device *dev = &(net->dev); - struct attribute_group **groups = net->sysfs_groups; + const struct attribute_group **groups = net->sysfs_groups; + int error = 0; + device_initialize(dev); dev->class = &net_class; dev->platform_data = net; dev->groups = groups; @@ -502,40 +1307,49 @@ int netdev_register_kobject(struct net_device *net) dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS - *groups++ = &netstat_group; + /* Allow for a device specific group */ + if (*groups) + groups++; + *groups++ = &netstat_group; #ifdef CONFIG_WIRELESS_EXT_SYSFS - if (net->wireless_handlers && net->wireless_handlers->get_wireless_stats) + if (net->ieee80211_ptr) *groups++ = &wireless_group; +#ifdef CONFIG_WIRELESS_EXT + else if (net->wireless_handlers) + *groups++ = &wireless_group; +#endif #endif #endif /* CONFIG_SYSFS */ - if (dev_net(net) != &init_net) - return 0; + error = device_add(dev); + if (error) + return error; - return device_add(dev); + error = register_queue_kobjects(net); + if (error) { + device_del(dev); + return error; + } + + return error; } int netdev_class_create_file(struct class_attribute *class_attr) { return class_create_file(&net_class, class_attr); } +EXPORT_SYMBOL(netdev_class_create_file); void netdev_class_remove_file(struct class_attribute *class_attr) { class_remove_file(&net_class, class_attr); } - -EXPORT_SYMBOL(netdev_class_create_file); EXPORT_SYMBOL(netdev_class_remove_file); -void netdev_initialize_kobject(struct net_device *net) -{ - struct device *device = &(net->dev); - device_initialize(device); -} - int netdev_kobject_init(void) { + kobj_ns_type_register(&net_ns_type_operations); + register_pernet_subsys(&kobj_net_ops); return class_register(&net_class); } diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 14e7524260b..bd7751ec1c4 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -4,5 +4,8 @@ int netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); -void netdev_initialize_kobject(struct net_device *); +int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +int netdev_queue_update_kobjects(struct net_device *net, + int old_num, int new_num); + #endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c index f1e982c508b..7f1bb2aba03 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -19,12 +19,14 @@ #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> +#include <linux/slab.h> #include <asm/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS #include <trace/events/skb.h> +#include <trace/events/net.h> #include <trace/events/napi.h> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b7292a2719d..3f860261c5e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -6,6 +6,8 @@ #include <linux/delay.h> #include <linux/sched.h> #include <linux/idr.h> +#include <linux/rculist.h> +#include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -25,14 +27,111 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static void net_generic_release(struct rcu_head *rcu) +{ + struct net_generic *ng; + + ng = container_of(rcu, struct net_generic, rcu); + kfree(ng); +} + +static int net_assign_generic(struct net *net, int id, void *data) +{ + struct net_generic *ng, *old_ng; + + BUG_ON(!mutex_is_locked(&net_mutex)); + BUG_ON(id == 0); + + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&net_mutex)); + ng = old_ng; + if (old_ng->len >= id) + goto assign; + + ng = kzalloc(sizeof(struct net_generic) + + id * sizeof(void *), GFP_KERNEL); + if (ng == NULL) + return -ENOMEM; + + /* + * Some synchronisation notes: + * + * The net_generic explores the net->gen array inside rcu + * read section. Besides once set the net->gen->ptr[x] + * pointer never changes (see rules in netns/generic.h). + * + * That said, we simply duplicate this array and schedule + * the old copy for kfree after a grace period. + */ + + ng->len = id; + memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + + rcu_assign_pointer(net->gen, ng); + call_rcu(&old_ng->rcu, net_generic_release); +assign: + ng->ptr[id - 1] = data; + return 0; +} + +static int ops_init(const struct pernet_operations *ops, struct net *net) +{ + int err; + if (ops->id && ops->size) { + void *data = kzalloc(ops->size, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = net_assign_generic(net, *ops->id, data); + if (err) { + kfree(data); + return err; + } + } + if (ops->init) + return ops->init(net); + return 0; +} + +static void ops_free(const struct pernet_operations *ops, struct net *net) +{ + if (ops->id && ops->size) { + int id = *ops->id; + kfree(net_generic(net, id)); + } +} + +static void ops_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->exit(net); + } + if (ops->exit_batch) + ops->exit_batch(net_exit_list); +} + +static void ops_free_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->size && ops->id) { + list_for_each_entry(net, net_exit_list, exit_list) + ops_free(ops, net); + } +} + /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net) { /* Must be called with net_mutex held */ - struct pernet_operations *ops; + const struct pernet_operations *ops, *saved_ops; int error = 0; + LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); @@ -41,11 +140,9 @@ static __net_init int setup_net(struct net *net) #endif list_for_each_entry(ops, &pernet_list, list) { - if (ops->init) { - error = ops->init(net); - if (error < 0) - goto out_undo; - } + error = ops_init(ops, net); + if (error < 0) + goto out_undo; } out: return error; @@ -54,10 +151,14 @@ out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); - } + list_add(&net->exit_list, &net_exit_list); + saved_ops = ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); rcu_barrier(); goto out; @@ -127,7 +228,7 @@ static struct net *net_create(void) rv = setup_net(net); if (rv == 0) { rtnl_lock(); - list_add_tail(&net->list, &net_namespace_list); + list_add_tail_rcu(&net->list, &net_namespace_list); rtnl_unlock(); } mutex_unlock(&net_mutex); @@ -145,25 +246,45 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) return net_create(); } +static DEFINE_SPINLOCK(cleanup_list_lock); +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ + static void cleanup_net(struct work_struct *work) { - struct pernet_operations *ops; - struct net *net; + const struct pernet_operations *ops; + struct net *net, *tmp; + LIST_HEAD(net_kill_list); + LIST_HEAD(net_exit_list); - net = container_of(work, struct net, work); + /* Atomically snapshot the list of namespaces to cleanup */ + spin_lock_irq(&cleanup_list_lock); + list_replace_init(&cleanup_list, &net_kill_list); + spin_unlock_irq(&cleanup_list_lock); mutex_lock(&net_mutex); /* Don't let anyone else find us. */ rtnl_lock(); - list_del(&net->list); + list_for_each_entry(net, &net_kill_list, cleanup_list) { + list_del_rcu(&net->list); + list_add_tail(&net->exit_list, &net_exit_list); + } rtnl_unlock(); + /* + * Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so + * the rcu_barrier() below isn't sufficient alone. + */ + synchronize_rcu(); + /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); - } + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + /* Free the net generic variables */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); mutex_unlock(&net_mutex); @@ -173,14 +294,23 @@ static void cleanup_net(struct work_struct *work) rcu_barrier(); /* Finally it is safe to free my network namespace structure */ - net_free(net); + list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + list_del_init(&net->exit_list); + net_free(net); + } } +static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ - INIT_WORK(&net->work, cleanup_net); - queue_work(netns_wq, &net->work); + unsigned long flags; + + spin_lock_irqsave(&cleanup_list_lock, flags); + list_add(&net->cleanup_list, &cleanup_list); + spin_unlock_irqrestore(&cleanup_list_lock, flags); + + queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); @@ -193,6 +323,26 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) } #endif +struct net *get_net_ns_by_pid(pid_t pid) +{ + struct task_struct *tsk; + struct net *net; + + /* Lookup the network namespace */ + net = ERR_PTR(-ESRCH); + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) { + struct nsproxy *nsproxy; + nsproxy = task_nsproxy(tsk); + if (nsproxy) + net = get_net(nsproxy->net_ns); + } + rcu_read_unlock(); + return net; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_pid); + static int __init net_ns_init(void) { struct net_generic *ng; @@ -219,7 +369,7 @@ static int __init net_ns_init(void) panic("Could not setup the initial network namespace"); rtnl_lock(); - list_add_tail(&init_net.list, &net_namespace_list); + list_add_tail_rcu(&init_net.list, &net_namespace_list); rtnl_unlock(); mutex_unlock(&net_mutex); @@ -230,18 +380,20 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); #ifdef CONFIG_NET_NS -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) { - struct net *net, *undo_net; + struct net *net; int error; + LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); - if (ops->init) { + if (ops->init || (ops->id && ops->size)) { for_each_net(net) { - error = ops->init(net); + error = ops_init(ops, net); if (error) goto out_undo; + list_add_tail(&net->exit_list, &net_exit_list); } } return 0; @@ -249,45 +401,82 @@ static int register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - if (ops->exit) { - for_each_net(undo_net) { - if (undo_net == net) - goto undone; - ops->exit(undo_net); - } - } -undone: + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); return error; } -static void unregister_pernet_operations(struct pernet_operations *ops) +static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; + LIST_HEAD(net_exit_list); list_del(&ops->list); - if (ops->exit) - for_each_net(net) - ops->exit(net); + for_each_net(net) + list_add_tail(&net->exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); } #else +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + int err = 0; + err = ops_init(ops, &init_net); + if (err) + ops_free(ops, &init_net); + return err; + +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); +} + +#endif /* CONFIG_NET_NS */ + +static DEFINE_IDA(net_generic_ids); + static int register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - if (ops->init == NULL) - return 0; - return ops->init(&init_net); + int error; + + if (ops->id) { +again: + error = ida_get_new_above(&net_generic_ids, 1, ops->id); + if (error < 0) { + if (error == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + return error; + } + } + error = __register_pernet_operations(list, ops); + if (error) { + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); + } + + return error; } static void unregister_pernet_operations(struct pernet_operations *ops) { - if (ops->exit) - ops->exit(&init_net); + + __unregister_pernet_operations(ops); + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); } -#endif - -static DEFINE_IDA(net_generic_ids); /** * register_pernet_subsys - register a network namespace subsystem @@ -327,45 +516,13 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); * addition run the exit method for all existing network * namespaces. */ -void unregister_pernet_subsys(struct pernet_operations *module) -{ - mutex_lock(&net_mutex); - unregister_pernet_operations(module); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_subsys); - -int register_pernet_gen_subsys(int *id, struct pernet_operations *ops) -{ - int rv; - - mutex_lock(&net_mutex); -again: - rv = ida_get_new_above(&net_generic_ids, 1, id); - if (rv < 0) { - if (rv == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - rv = register_pernet_operations(first_device, ops); - if (rv < 0) - ida_remove(&net_generic_ids, *id); -out: - mutex_unlock(&net_mutex); - return rv; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_subsys); - -void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops) +void unregister_pernet_subsys(struct pernet_operations *ops) { mutex_lock(&net_mutex); unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); mutex_unlock(&net_mutex); } -EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys); +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); /** * register_pernet_device - register a network namespace device @@ -398,30 +555,6 @@ int register_pernet_device(struct pernet_operations *ops) } EXPORT_SYMBOL_GPL(register_pernet_device); -int register_pernet_gen_device(int *id, struct pernet_operations *ops) -{ - int error; - mutex_lock(&net_mutex); -again: - error = ida_get_new_above(&net_generic_ids, 1, id); - if (error) { - if (error == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - error = register_pernet_operations(&pernet_list, ops); - if (error) - ida_remove(&net_generic_ids, *id); - else if (first_device == &pernet_list) - first_device = &ops->list; -out: - mutex_unlock(&net_mutex); - return error; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_device); - /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate @@ -440,60 +573,3 @@ void unregister_pernet_device(struct pernet_operations *ops) mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(unregister_pernet_device); - -void unregister_pernet_gen_device(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - if (&ops->list == first_device) - first_device = first_device->next; - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_device); - -static void net_generic_release(struct rcu_head *rcu) -{ - struct net_generic *ng; - - ng = container_of(rcu, struct net_generic, rcu); - kfree(ng); -} - -int net_assign_generic(struct net *net, int id, void *data) -{ - struct net_generic *ng, *old_ng; - - BUG_ON(!mutex_is_locked(&net_mutex)); - BUG_ON(id == 0); - - ng = old_ng = net->gen; - if (old_ng->len >= id) - goto assign; - - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); - if (ng == NULL) - return -ENOMEM; - - /* - * Some synchronisation notes: - * - * The net_generic explores the net->gen array inside rcu - * read section. Besides once set the net->gen->ptr[x] - * pointer never changes (see rules in netns/generic.h). - * - * That said, we simply duplicate this array and schedule - * the old copy for kfree after a grace period. - */ - - ng->len = id; - memcpy(&ng->ptr, &old_ng->ptr, old_ng->len); - - rcu_assign_pointer(net->gen, ng); - call_rcu(&old_ng->rcu, net_generic_release); -assign: - ng->ptr[id - 1] = data; - return 0; -} -EXPORT_SYMBOL_GPL(net_assign_generic); diff --git a/net/core/netevent.c b/net/core/netevent.c index 95f81de8750..865f0ceb81f 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -35,6 +35,7 @@ int register_netevent_notifier(struct notifier_block *nb) err = atomic_notifier_chain_register(&netevent_notif_chain, nb); return err; } +EXPORT_SYMBOL_GPL(register_netevent_notifier); /** * netevent_unregister_notifier - unregister a netevent notifier block @@ -50,6 +51,7 @@ int unregister_netevent_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&netevent_notif_chain, nb); } +EXPORT_SYMBOL_GPL(unregister_netevent_notifier); /** * call_netevent_notifiers - call all netevent notifier blocks @@ -64,7 +66,4 @@ int call_netevent_notifiers(unsigned long val, void *v) { return atomic_notifier_call_chain(&netevent_notif_chain, val, v); } - -EXPORT_SYMBOL_GPL(register_netevent_notifier); -EXPORT_SYMBOL_GPL(unregister_netevent_notifier); EXPORT_SYMBOL_GPL(call_netevent_notifiers); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9675f312830..02dc2cbcbe8 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -9,6 +9,7 @@ * Copyright (C) 2002 Red Hat, Inc. */ +#include <linux/moduleparam.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> @@ -21,6 +22,7 @@ #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> +#include <linux/slab.h> #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> @@ -33,7 +35,6 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 -#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) static struct sk_buff_head skb_pool; @@ -50,6 +51,9 @@ static atomic_t trapped; static void zap_completion_queue(void); static void arp_reply(struct sk_buff *skb); +static unsigned int carrier_timeout = 4; +module_param(carrier_timeout, uint, 0644); + static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = @@ -71,8 +75,7 @@ static void queue_process(struct work_struct *work) local_irq_save(flags); __netif_tx_lock(txq, smp_processor_id()); - if (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq) || + if (netif_tx_queue_frozen_or_stopped(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); @@ -174,9 +177,8 @@ static void service_arp_queue(struct netpoll_info *npi) } } -void netpoll_poll(struct netpoll *np) +void netpoll_poll_dev(struct net_device *dev) { - struct net_device *dev = np->dev; const struct net_device_ops *ops; if (!dev || !netif_running(dev)) @@ -195,6 +197,13 @@ void netpoll_poll(struct netpoll *np) zap_completion_queue(); } +EXPORT_SYMBOL(netpoll_poll_dev); + +void netpoll_poll(struct netpoll *np) +{ + netpoll_poll_dev(np->dev); +} +EXPORT_SYMBOL(netpoll_poll); static void refill_skbs(void) { @@ -277,12 +286,13 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } -static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, + struct net_device *dev) { int status = NETDEV_TX_BUSY; unsigned long tries; - struct net_device *dev = np->dev; const struct net_device_ops *ops = dev->netdev_ops; + /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo = np->dev->npinfo; if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { @@ -303,7 +313,9 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) tries > 0; --tries) { if (__netif_tx_trylock(txq)) { if (!netif_tx_queue_stopped(txq)) { + dev->priv_flags |= IFF_IN_NETPOLL; status = ops->ndo_start_xmit(skb, dev); + dev->priv_flags &= ~IFF_IN_NETPOLL; if (status == NETDEV_TX_OK) txq_trans_update(txq); } @@ -319,6 +331,11 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) udelay(USEC_PER_POLL); } + + WARN_ONCE(!irqs_disabled(), + "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + dev->name, ops->ndo_start_xmit); + local_irq_restore(flags); } @@ -327,6 +344,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } +EXPORT_SYMBOL(netpoll_send_skb_on_dev); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { @@ -388,6 +406,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) netpoll_send_skb(np, skb); } +EXPORT_SYMBOL(netpoll_send_udp); static void arp_reply(struct sk_buff *skb) { @@ -398,11 +417,24 @@ static void arp_reply(struct sk_buff *skb) __be32 sip, tip; unsigned char *sha; struct sk_buff *send_skb; - struct netpoll *np = NULL; + struct netpoll *np, *tmp; + unsigned long flags; + int hits = 0; + + if (list_empty(&npinfo->rx_np)) + return; + + /* Before checking the packet, we do some early + inspection whether this is interesting at all */ + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->dev == skb->dev) + hits++; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); - if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) - np = npinfo->rx_np; - if (!np) + /* No netpoll struct is using this dev */ + if (!hits) return; /* No arp on this interface */ @@ -428,77 +460,91 @@ static void arp_reply(struct sk_buff *skb) arp_ptr += skb->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - /* if we actually cared about dst hw addr, it would get copied here */ + /* If we actually cared about dst hw addr, + it would get copied here */ arp_ptr += skb->dev->addr_len; memcpy(&tip, arp_ptr, 4); /* Should we ignore arp? */ - if (tip != np->local_ip || - ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; size = arp_hdr_len(skb->dev); - send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), - LL_RESERVED_SPACE(np->dev)); - if (!send_skb) - return; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip) + continue; - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - return; - } + send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + if (!send_skb) + continue; - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should always equal 1 (Ethernet). - */ + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } - arp_ptr=(unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). + */ - netpoll_send_skb(np, send_skb); + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); } int __netpoll_rx(struct sk_buff *skb) { int proto, len, ulen; + int hits = 0; struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npi = skb->dev->npinfo; - struct netpoll *np = npi->rx_np; + struct netpoll_info *npinfo = skb->dev->npinfo; + struct netpoll *np, *tmp; - if (!np) + if (list_empty(&npinfo->rx_np)) goto out; + if (skb->dev->type != ARPHRD_ETHER) goto out; /* check if netpoll clients need ARP */ if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { - skb_queue_tail(&npi->arp_tx, skb); + skb_queue_tail(&npinfo->arp_tx, skb); return 1; } @@ -542,16 +588,23 @@ int __netpoll_rx(struct sk_buff *skb) goto out; if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; - if (np->local_ip && np->local_ip != iph->daddr) - goto out; - if (np->remote_ip && np->remote_ip != iph->saddr) - goto out; - if (np->local_port && np->local_port != ntohs(uh->dest)) - goto out; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip && np->local_ip != iph->daddr) + continue; + if (np->remote_ip && np->remote_ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + + if (!hits) + goto out; kfree_skb(skb); return 1; @@ -571,7 +624,7 @@ void netpoll_print_options(struct netpoll *np) np->name, np->local_port); printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip); - printk(KERN_INFO "%s: interface %s\n", + printk(KERN_INFO "%s: interface '%s'\n", np->name, np->dev_name); printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); @@ -580,6 +633,7 @@ void netpoll_print_options(struct netpoll *np) printk(KERN_INFO "%s: remote ethernet address %pM\n", np->name, np->remote_mac); } +EXPORT_SYMBOL(netpoll_print_options); int netpoll_parse_options(struct netpoll *np, char *opt) { @@ -618,6 +672,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; + if (*cur == ' ' || *cur == '\t') + printk(KERN_INFO "%s: warning: whitespace" + "is not allowed\n", np->name); np->remote_port = simple_strtol(cur, NULL, 10); cur = delim; } @@ -665,37 +722,37 @@ int netpoll_parse_options(struct netpoll *np, char *opt) return 0; parse_failed: - printk(KERN_INFO "%s: couldn't parse config at %s!\n", + printk(KERN_INFO "%s: couldn't parse config at '%s'!\n", np->name, cur); return -1; } +EXPORT_SYMBOL(netpoll_parse_options); -int netpoll_setup(struct netpoll *np) +int __netpoll_setup(struct netpoll *np) { - struct net_device *ndev = NULL; - struct in_device *in_dev; + struct net_device *ndev = np->dev; struct netpoll_info *npinfo; + const struct net_device_ops *ops; unsigned long flags; int err; - if (np->dev_name) - ndev = dev_get_by_name(&init_net, np->dev_name); - if (!ndev) { - printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || + !ndev->netdev_ops->ndo_poll_controller) { + printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", np->name, np->dev_name); - return -ENODEV; + err = -ENOTSUPP; + goto out; } - np->dev = ndev; if (!ndev->npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; - goto release; + goto out; } npinfo->rx_flags = 0; - npinfo->rx_np = NULL; + INIT_LIST_HEAD(&npinfo->rx_np); spin_lock_init(&npinfo->rx_lock); skb_queue_head_init(&npinfo->arp_tx); @@ -703,16 +760,51 @@ int netpoll_setup(struct netpoll *np) INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); atomic_set(&npinfo->refcnt, 1); + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_setup) { + err = ops->ndo_netpoll_setup(ndev, npinfo); + if (err) + goto free_npinfo; + } } else { npinfo = ndev->npinfo; atomic_inc(&npinfo->refcnt); } - if (!ndev->netdev_ops->ndo_poll_controller) { - printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", + npinfo->netpoll = np; + + if (np->rx_hook) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_flags |= NETPOLL_RX_ENABLED; + list_add_tail(&np->rx, &npinfo->rx_np); + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + + /* last thing to do is link it to the net device structure */ + rcu_assign_pointer(ndev->npinfo, npinfo); + + return 0; + +free_npinfo: + kfree(npinfo); +out: + return err; +} +EXPORT_SYMBOL_GPL(__netpoll_setup); + +int netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = NULL; + struct in_device *in_dev; + int err; + + if (np->dev_name) + ndev = dev_get_by_name(&init_net, np->dev_name); + if (!ndev) { + printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", np->name, np->dev_name); - err = -ENOTSUPP; - goto release; + return -ENODEV; } if (!netif_running(ndev)) { @@ -728,11 +820,11 @@ int netpoll_setup(struct netpoll *np) if (err) { printk(KERN_ERR "%s: failed to open %s\n", np->name, ndev->name); - goto release; + goto put; } atleast = jiffies + HZ/10; - atmost = jiffies + 4*HZ; + atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { printk(KERN_NOTICE @@ -740,7 +832,7 @@ int netpoll_setup(struct netpoll *np) np->name); break; } - cond_resched(); + msleep(1); } /* If carrier appears to come up instantly, we don't @@ -765,7 +857,7 @@ int netpoll_setup(struct netpoll *np) printk(KERN_ERR "%s: no IP address for %s, aborting\n", np->name, np->dev_name); err = -EDESTADDRREQ; - goto release; + goto put; } np->local_ip = in_dev->ifa_list->ifa_local; @@ -773,31 +865,25 @@ int netpoll_setup(struct netpoll *np) printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip); } - if (np->rx_hook) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_flags |= NETPOLL_RX_ENABLED; - npinfo->rx_np = np; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } + np->dev = ndev; /* fill up the skb queue */ refill_skbs(); - /* last thing to do is link it to the net device structure */ - ndev->npinfo = npinfo; + rtnl_lock(); + err = __netpoll_setup(np); + rtnl_unlock(); - /* avoid racing with NAPI reading npinfo */ - synchronize_rcu(); + if (err) + goto put; return 0; - release: - if (!ndev->npinfo) - kfree(npinfo); - np->dev = NULL; +put: dev_put(ndev); return err; } +EXPORT_SYMBOL(netpoll_setup); static int __init netpoll_init(void) { @@ -806,43 +892,65 @@ static int __init netpoll_init(void) } core_initcall(netpoll_init); -void netpoll_cleanup(struct netpoll *np) +void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; unsigned long flags; - if (np->dev) { - npinfo = np->dev->npinfo; - if (npinfo) { - if (npinfo->rx_np == np) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_np = NULL; - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } + npinfo = np->dev->npinfo; + if (!npinfo) + return; + + if (!list_empty(&npinfo->rx_np)) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_del(&np->rx); + if (list_empty(&npinfo->rx_np)) + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } - if (atomic_dec_and_test(&npinfo->refcnt)) { - skb_queue_purge(&npinfo->arp_tx); - skb_queue_purge(&npinfo->txq); - cancel_rearming_delayed_work(&npinfo->tx_work); + if (atomic_dec_and_test(&npinfo->refcnt)) { + const struct net_device_ops *ops; - /* clean after last, unfinished work */ - __skb_queue_purge(&npinfo->txq); - kfree(npinfo); - np->dev->npinfo = NULL; - } - } + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_cleanup) + ops->ndo_netpoll_cleanup(np->dev); + + rcu_assign_pointer(np->dev->npinfo, NULL); - dev_put(np->dev); + /* avoid racing with NAPI reading npinfo */ + synchronize_rcu_bh(); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + cancel_delayed_work_sync(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + kfree(npinfo); } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); +void netpoll_cleanup(struct netpoll *np) +{ + if (!np->dev) + return; + + rtnl_lock(); + __netpoll_cleanup(np); + rtnl_unlock(); + + dev_put(np->dev); np->dev = NULL; } +EXPORT_SYMBOL(netpoll_cleanup); int netpoll_trap(void) { return atomic_read(&trapped); } +EXPORT_SYMBOL(netpoll_trap); void netpoll_set_trap(int trap) { @@ -851,12 +959,4 @@ void netpoll_set_trap(int trap) else atomic_dec(&trapped); } - EXPORT_SYMBOL(netpoll_set_trap); -EXPORT_SYMBOL(netpoll_trap); -EXPORT_SYMBOL(netpoll_print_options); -EXPORT_SYMBOL(netpoll_parse_options); -EXPORT_SYMBOL(netpoll_setup); -EXPORT_SYMBOL(netpoll_cleanup); -EXPORT_SYMBOL(netpoll_send_udp); -EXPORT_SYMBOL(netpoll_poll); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 19b8c20e98a..a9e7fc4c461 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -115,6 +115,9 @@ * command by Adit Ranadive <adit.262@gmail.com> * */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sys.h> #include <linux/types.h> #include <linux/module.h> @@ -131,6 +134,7 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/capability.h> +#include <linux/hrtimer.h> #include <linux/freezer.h> #include <linux/delay.h> #include <linux/timer.h> @@ -162,18 +166,19 @@ #include <asm/byteorder.h> #include <linux/rcupdate.h> #include <linux/bitops.h> -#include <asm/io.h> +#include <linux/io.h> +#include <linux/timex.h> +#include <linux/uaccess.h> #include <asm/dma.h> -#include <asm/uaccess.h> #include <asm/div64.h> /* do_div */ -#include <asm/timex.h> - -#define VERSION "pktgen v2.70: Packet Generator for packet performance testing.\n" +#define VERSION "2.74" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) +#define func_enter() pr_debug("entering %s\n", __func__); + /* Device flag bits */ #define F_IPSRC_RND (1<<0) /* IP-Src Random */ #define F_IPDST_RND (1<<1) /* IP-Dst Random */ @@ -190,13 +195,13 @@ #define F_IPSEC_ON (1<<12) /* ipsec on for flows */ #define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ +#define F_NODE (1<<15) /* Node memory alloc*/ /* Thread control flag bits */ -#define T_TERMINATE (1<<0) -#define T_STOP (1<<1) /* Stop run */ -#define T_RUN (1<<2) /* Start run */ -#define T_REMDEVALL (1<<3) /* Remove all devs */ -#define T_REMDEV (1<<4) /* Remove one dev */ +#define T_STOP (1<<0) /* Stop run */ +#define T_RUN (1<<1) /* Start run */ +#define T_REMDEVALL (1<<2) /* Remove all devs */ +#define T_REMDEV (1<<3) /* Remove one dev */ /* If lock -- can be removed after some work */ #define if_lock(t) spin_lock(&(t->if_lock)); @@ -206,7 +211,7 @@ #define PKTGEN_MAGIC 0xbe9be955 #define PG_PROC_DIR "pktgen" #define PGCTRL "pgctrl" -static struct proc_dir_entry *pg_proc_dir = NULL; +static struct proc_dir_entry *pg_proc_dir; #define MAX_CFLOWS 65536 @@ -231,9 +236,9 @@ struct pktgen_dev { */ struct proc_dir_entry *entry; /* proc file */ struct pktgen_thread *pg_thread;/* the owner */ - struct list_head list; /* Used for chaining in the thread's run-queue */ + struct list_head list; /* chaining in the thread's run-queue */ - int running; /* if this changes to false, the test will stop */ + int running; /* if false, the test will stop */ /* If min != max, then we will either do a linear iteration, or * we will do a random selection from within the range. @@ -246,33 +251,36 @@ struct pktgen_dev { int max_pkt_size; /* = ETH_ZLEN; */ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; - __u32 delay_us; /* Default delay */ - __u32 delay_ns; + u64 delay; /* nano-seconds */ + __u64 count; /* Default No packets to send */ __u64 sofar; /* How many pkts we've sent so far */ __u64 tx_bytes; /* How many bytes we've transmitted */ - __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ + __u64 errors; /* Errors when trying to transmit, */ /* runtime counters relating to clone_skb */ - __u64 next_tx_us; /* timestamp of when to tx next */ - __u32 next_tx_ns; __u64 allocated_skbs; __u32 clone_count; int last_ok; /* Was last skb sent? - * Or a failed transmit of some sort? This will keep - * sequence numbers in order, for example. + * Or a failed transmit of some sort? + * This will keep sequence numbers in order */ - __u64 started_at; /* micro-seconds */ - __u64 stopped_at; /* micro-seconds */ - __u64 idle_acc; /* micro-seconds */ + ktime_t next_tx; + ktime_t started_at; + ktime_t stopped_at; + u64 idle_acc; /* nano-seconds */ + __u32 seq_num; - int clone_skb; /* Use multiple SKBs during packet gen. If this number - * is greater than 1, then that many copies of the same - * packet will be sent before a new packet is allocated. - * For instance, if you want to send 1024 identical packets - * before creating a new packet, set clone_skb to 1024. + int clone_skb; /* + * Use multiple SKBs during packet gen. + * If this number is greater than 1, then + * that many copies of the same packet will be + * sent before a new packet is allocated. + * If you want to send 1024 identical packets + * before creating a new packet, + * set clone_skb to 1024. */ char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ @@ -304,8 +312,10 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + are for dscp codepoint */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + (see RFC 3260, sec. 4) */ /* MPLS */ unsigned nr_labels; /* Depth of stack, 0 = no MPLS */ @@ -330,10 +340,12 @@ struct pktgen_dev { __u32 cur_src_mac_offset; __be32 cur_saddr; __be32 cur_daddr; + __u16 ip_id; __u16 cur_udp_dst; __u16 cur_udp_src; __u16 cur_queue_map; __u32 cur_pkt_size; + __u32 last_pkt_size; __u8 hh[14]; /* = { @@ -346,15 +358,18 @@ struct pktgen_dev { */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ - struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we + struct sk_buff *skb; /* skb we are to transmit next, used for when we * are transmitting the same one multiple times */ - struct net_device *odev; /* The out-going device. Note that the device should - * have it's pg_info pointer pointing back to this - * device. This will be set when the user specifies - * the out-going device name (not when the inject is - * started as it used to do.) - */ + struct net_device *odev; /* The out-going device. + * Note that the device should have it's + * pg_info pointer pointing back to this + * device. + * Set when the user specifies the out-going + * device name (not when the inject is + * started as it used to do.) + */ + char odevname[32]; struct flow_state *flows; unsigned cflows; /* Concurrent flows (config) */ unsigned lflow; /* Flow length (config) */ @@ -363,6 +378,8 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; + __u32 skb_priority; /* skb priority field */ + int node; /* Memory node */ #ifdef CONFIG_XFRM __u8 ipsmode; /* IPSEC mode (config) */ @@ -378,14 +395,17 @@ struct pktgen_hdr { __be32 tv_usec; }; +static bool pktgen_exiting __read_mostly; + struct pktgen_thread { - spinlock_t if_lock; + spinlock_t if_lock; /* for list of devices */ struct list_head if_list; /* All device here */ struct list_head th_list; struct task_struct *tsk; char result[512]; - /* Field for thread to receive "posted" events terminate, stop ifs etc. */ + /* Field for thread to receive "posted" events terminate, + stop ifs etc. */ u32 control; int cpu; @@ -397,34 +417,33 @@ struct pktgen_thread { #define REMOVE 1 #define FIND 0 -/** Convert to micro-seconds */ -static inline __u64 tv_to_us(const struct timeval *tv) +static inline ktime_t ktime_now(void) { - __u64 us = tv->tv_usec; - us += (__u64) tv->tv_sec * (__u64) 1000000; - return us; + struct timespec ts; + ktime_get_ts(&ts); + + return timespec_to_ktime(ts); } -static __u64 getCurUs(void) +/* This works even if 32 bit because of careful byte order choice */ +static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) { - struct timeval tv; - do_gettimeofday(&tv); - return tv_to_us(&tv); + return cmp1.tv64 < cmp2.tv64; } -/* old include end */ - -static char version[] __initdata = VERSION; +static const char version[] = + "Packet Generator for packet performance testing. " + "Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, - const char *ifname); + const char *ifname, bool exact); static int pktgen_device_event(struct notifier_block *, unsigned long, void *); static void pktgen_run_all_threads(void); static void pktgen_reset_all_threads(void); static void pktgen_stop_all_threads_ifs(void); -static int pktgen_stop_device(struct pktgen_dev *pkt_dev); + static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); @@ -432,10 +451,10 @@ static unsigned int scan_ip6(const char *s, char ip[16]); static unsigned int fmt_ip6(char *s, const char ip[16]); /* Module parameters, defaults. */ -static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d; -static int pg_clone_skb_d; -static int debug; +static int pg_count_d __read_mostly = 1000; +static int pg_delay_d __read_mostly; +static int pg_clone_skb_d __read_mostly; +static int debug __read_mostly; static DEFINE_MUTEX(pktgen_thread_lock); static LIST_HEAD(pktgen_threads); @@ -451,12 +470,12 @@ static struct notifier_block pktgen_notifier_block = { static int pgctrl_show(struct seq_file *seq, void *v) { - seq_puts(seq, VERSION); + seq_puts(seq, version); return 0; } -static ssize_t pgctrl_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos) +static ssize_t pgctrl_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { int err = 0; char data[128]; @@ -485,7 +504,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user * buf, pktgen_reset_all_threads(); else - printk(KERN_WARNING "pktgen: Unknown command: %s\n", data); + pr_warning("Unknown command: %s\n", data); err = count; @@ -509,10 +528,9 @@ static const struct file_operations pktgen_fops = { static int pktgen_if_show(struct seq_file *seq, void *v) { - struct pktgen_dev *pkt_dev = seq->private; - __u64 sa; - __u64 stopped; - __u64 now = getCurUs(); + const struct pktgen_dev *pkt_dev = seq->private; + ktime_t stopped; + u64 idle; seq_printf(seq, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", @@ -520,10 +538,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->max_pkt_size); seq_printf(seq, - " frags: %d delay: %u clone_skb: %d ifname: %s\n", - pkt_dev->nfrags, - 1000 * pkt_dev->delay_us + pkt_dev->delay_ns, - pkt_dev->clone_skb, pkt_dev->odev->name); + " frags: %d delay: %llu clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, + pkt_dev->clone_skb, pkt_dev->odevname); seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); @@ -533,6 +550,10 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->queue_map_min, pkt_dev->queue_map_max); + if (pkt_dev->skb_priority) + seq_printf(seq, " skb_priority: %u\n", + pkt_dev->skb_priority); + if (pkt_dev->flags & F_IPV6) { char b1[128], b2[128], b3[128]; fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); @@ -549,11 +570,14 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3); - } else + } else { + seq_printf(seq, + " dst_min: %s dst_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max); seq_printf(seq, - " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n", - pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, - pkt_dev->src_max); + " src_min: %s src_max: %s\n", + pkt_dev->src_min, pkt_dev->src_max); + } seq_puts(seq, " src_mac: "); @@ -565,7 +589,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d" + " udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -581,23 +606,24 @@ static int pktgen_if_show(struct seq_file *seq, void *v) i == pkt_dev->nr_labels-1 ? "\n" : ", "); } - if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->vlan_id != 0xffff) seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n", - pkt_dev->vlan_id, pkt_dev->vlan_p, pkt_dev->vlan_cfi); - } + pkt_dev->vlan_id, pkt_dev->vlan_p, + pkt_dev->vlan_cfi); - if (pkt_dev->svlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n", - pkt_dev->svlan_id, pkt_dev->svlan_p, pkt_dev->svlan_cfi); - } + pkt_dev->svlan_id, pkt_dev->svlan_p, + pkt_dev->svlan_cfi); - if (pkt_dev->tos) { + if (pkt_dev->tos) seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos); - } - if (pkt_dev->traffic_class) { + if (pkt_dev->traffic_class) seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); - } + + if (pkt_dev->node >= 0) + seq_printf(seq, " node: %d\n", pkt_dev->node); seq_printf(seq, " Flags: "); @@ -652,19 +678,26 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_SVID_RND) seq_printf(seq, "SVID_RND "); + if (pkt_dev->flags & F_NODE) + seq_printf(seq, "NODE_ALLOC "); + seq_puts(seq, "\n"); - sa = pkt_dev->started_at; - stopped = pkt_dev->stopped_at; - if (pkt_dev->running) - stopped = now; /* not really stopped, more like last-running-at */ + /* not really stopped, more like last-running-at */ + stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at; + idle = pkt_dev->idle_acc; + do_div(idle, NSEC_PER_USEC); seq_printf(seq, - "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", + "Current:\n pkts-sofar: %llu errors: %llu\n", (unsigned long long)pkt_dev->sofar, - (unsigned long long)pkt_dev->errors, (unsigned long long)sa, - (unsigned long long)stopped, - (unsigned long long)pkt_dev->idle_acc); + (unsigned long long)pkt_dev->errors); + + seq_printf(seq, + " started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) ktime_to_us(pkt_dev->started_at), + (unsigned long long) ktime_to_us(stopped), + (unsigned long long) idle); seq_printf(seq, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", @@ -696,22 +729,21 @@ static int pktgen_if_show(struct seq_file *seq, void *v) } -static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, __u32 *num) +static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, + __u32 *num) { int i = 0; *num = 0; for (; i < maxlen; i++) { + int value; char c; *num <<= 4; if (get_user(c, &user_buffer[i])) return -EFAULT; - if ((c >= '0') && (c <= '9')) - *num |= c - '0'; - else if ((c >= 'a') && (c <= 'f')) - *num |= c - 'a' + 10; - else if ((c >= 'A') && (c <= 'F')) - *num |= c - 'A' + 10; + value = hex_to_bin(c); + if (value >= 0) + *num |= value; else break; } @@ -746,10 +778,10 @@ done: static unsigned long num_arg(const char __user * user_buffer, unsigned long maxlen, unsigned long *num) { - int i = 0; + int i; *num = 0; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -764,9 +796,9 @@ static unsigned long num_arg(const char __user * user_buffer, static int strn_len(const char __user * user_buffer, unsigned int maxlen) { - int i = 0; + int i; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -819,9 +851,9 @@ static ssize_t pktgen_if_write(struct file *file, const char __user * user_buffer, size_t count, loff_t * offset) { - struct seq_file *seq = (struct seq_file *)file->private_data; + struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i = 0, max, len; + int i, max, len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; @@ -831,24 +863,24 @@ static ssize_t pktgen_if_write(struct file *file, pg_result = &(pkt_dev->result[0]); if (count < 1) { - printk(KERN_WARNING "pktgen: wrong command format\n"); + pr_warning("wrong command format\n"); return -EINVAL; } - max = count - i; - tmp = count_trail_chars(&user_buffer[i], max); + max = count; + tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { - printk(KERN_WARNING "pktgen: illegal format\n"); + pr_warning("illegal format\n"); return tmp; } - i += tmp; + i = tmp; /* Read variable name */ len = strn_len(&user_buffer[i], sizeof(name) - 1); - if (len < 0) { + if (len < 0) return len; - } + memset(name, 0, sizeof(name)); if (copy_from_user(name, &user_buffer[i], len)) return -EFAULT; @@ -862,19 +894,20 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - char tb[count + 1]; - if (copy_from_user(tb, user_buffer, count)) + size_t copy = min_t(size_t, count, 1023); + char tb[copy + 1]; + if (copy_from_user(tb, user_buffer, copy)) return -EFAULT; - tb[count] = 0; + tb[copy] = 0; printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, (unsigned long)count, tb); } if (!strcmp(name, "min_pkt_size")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; @@ -889,9 +922,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "max_pkt_size")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; @@ -908,9 +941,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "pkt_size")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; @@ -925,9 +958,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "debug")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; debug = value; sprintf(pg_result, "OK: debug=%u", debug); @@ -936,9 +969,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "frags")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; pkt_dev->nfrags = value; sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); @@ -946,26 +979,54 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "delay")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; - if (value == 0x7FFFFFFF) { - pkt_dev->delay_us = 0x7FFFFFFF; - pkt_dev->delay_ns = 0; - } else { - pkt_dev->delay_us = value / 1000; - pkt_dev->delay_ns = value % 1000; - } - sprintf(pg_result, "OK: delay=%u", - 1000 * pkt_dev->delay_us + pkt_dev->delay_ns); + if (value == 0x7FFFFFFF) + pkt_dev->delay = ULLONG_MAX; + else + pkt_dev->delay = (u64)value; + + sprintf(pg_result, "OK: delay=%llu", + (unsigned long long) pkt_dev->delay); + return count; + } + if (!strcmp(name, "rate")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "ratep")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = NSEC_PER_SEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); return count; } if (!strcmp(name, "udp_src_min")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value != pkt_dev->udp_src_min) { pkt_dev->udp_src_min = value; @@ -976,9 +1037,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "udp_dst_min")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value != pkt_dev->udp_dst_min) { pkt_dev->udp_dst_min = value; @@ -989,9 +1050,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "udp_src_max")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value != pkt_dev->udp_src_max) { pkt_dev->udp_src_max = value; @@ -1002,9 +1063,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "udp_dst_max")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value != pkt_dev->udp_dst_max) { pkt_dev->udp_dst_max = value; @@ -1015,9 +1076,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "clone_skb")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; pkt_dev->clone_skb = value; @@ -1026,9 +1087,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "count")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; pkt_dev->count = value; sprintf(pg_result, "OK: count=%llu", @@ -1037,9 +1098,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "src_mac_count")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (pkt_dev->src_mac_count != value) { pkt_dev->src_mac_count = value; @@ -1051,9 +1112,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "dst_mac_count")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (pkt_dev->dst_mac_count != value) { pkt_dev->dst_mac_count = value; @@ -1063,13 +1124,28 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->dst_mac_count); return count; } + if (!strcmp(name, "node")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + + if (node_possible(value)) { + pkt_dev->node = value; + sprintf(pg_result, "OK: node=%d", pkt_dev->node); + } + else + sprintf(pg_result, "ERROR: node not possible"); + return count; + } if (!strcmp(name, "flag")) { char f[32]; memset(f, 0, 32); len = strn_len(&user_buffer[i], sizeof(f) - 1); - if (len < 0) { + if (len < 0) return len; - } + if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; i += len; @@ -1155,12 +1231,18 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!IPV6") == 0) pkt_dev->flags &= ~F_IPV6; + else if (strcmp(f, "NODE_ALLOC") == 0) + pkt_dev->flags |= F_NODE; + + else if (strcmp(f, "!NODE_ALLOC") == 0) + pkt_dev->flags &= ~F_NODE; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", f, "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " - "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n"); + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); return count; } sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); @@ -1168,9 +1250,8 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); - if (len < 0) { + if (len < 0) return len; - } if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; @@ -1190,9 +1271,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "dst_max")) { len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); - if (len < 0) { + if (len < 0) return len; - } + if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; @@ -1303,9 +1384,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "src_min")) { len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); - if (len < 0) { + if (len < 0) return len; - } + if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; buf[len] = 0; @@ -1324,9 +1405,9 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "src_max")) { len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); - if (len < 0) { + if (len < 0) return len; - } + if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; buf[len] = 0; @@ -1350,27 +1431,21 @@ static ssize_t pktgen_if_write(struct file *file, memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN); len = strn_len(&user_buffer[i], sizeof(valstr) - 1); - if (len < 0) { + if (len < 0) return len; - } + memset(valstr, 0, sizeof(valstr)); if (copy_from_user(valstr, &user_buffer[i], len)) return -EFAULT; i += len; for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) { - if (*v >= '0' && *v <= '9') { - *m *= 16; - *m += *v - '0'; - } - if (*v >= 'A' && *v <= 'F') { - *m *= 16; - *m += *v - 'A' + 10; - } - if (*v >= 'a' && *v <= 'f') { - *m *= 16; - *m += *v - 'a' + 10; - } + int value; + + value = hex_to_bin(*v); + if (value >= 0) + *m = *m * 16 + value; + if (*v == ':') { m++; *m = 0; @@ -1392,27 +1467,21 @@ static ssize_t pktgen_if_write(struct file *file, memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN); len = strn_len(&user_buffer[i], sizeof(valstr) - 1); - if (len < 0) { + if (len < 0) return len; - } + memset(valstr, 0, sizeof(valstr)); if (copy_from_user(valstr, &user_buffer[i], len)) return -EFAULT; i += len; for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) { - if (*v >= '0' && *v <= '9') { - *m *= 16; - *m += *v - '0'; - } - if (*v >= 'A' && *v <= 'F') { - *m *= 16; - *m += *v - 'A' + 10; - } - if (*v >= 'a' && *v <= 'f') { - *m *= 16; - *m += *v - 'a' + 10; - } + int value; + + value = hex_to_bin(*v); + if (value >= 0) + *m = *m * 16 + value; + if (*v == ':') { m++; *m = 0; @@ -1435,9 +1504,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "flows")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value > MAX_CFLOWS) value = MAX_CFLOWS; @@ -1449,9 +1518,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "flowlen")) { len = num_arg(&user_buffer[i], 10, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; pkt_dev->lflow = value; sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); @@ -1460,9 +1529,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "queue_map_min")) { len = num_arg(&user_buffer[i], 5, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; pkt_dev->queue_map_min = value; sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); @@ -1471,9 +1540,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "queue_map_max")) { len = num_arg(&user_buffer[i], 5, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; pkt_dev->queue_map_max = value; sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); @@ -1505,9 +1574,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "vlan_id")) { len = num_arg(&user_buffer[i], 4, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (value <= 4095) { pkt_dev->vlan_id = value; /* turn on VLAN */ @@ -1532,9 +1601,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "vlan_p")) { len = num_arg(&user_buffer[i], 1, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_p = value; @@ -1547,9 +1616,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "vlan_cfi")) { len = num_arg(&user_buffer[i], 1, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_cfi = value; @@ -1562,9 +1631,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "svlan_id")) { len = num_arg(&user_buffer[i], 4, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { pkt_dev->svlan_id = value; /* turn on SVLAN */ @@ -1589,9 +1658,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "svlan_p")) { len = num_arg(&user_buffer[i], 1, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_p = value; @@ -1604,9 +1673,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "svlan_cfi")) { len = num_arg(&user_buffer[i], 1, &value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_cfi = value; @@ -1620,9 +1689,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "tos")) { __u32 tmp_value = 0; len = hex32_arg(&user_buffer[i], 2, &tmp_value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (len == 2) { pkt_dev->tos = tmp_value; @@ -1636,9 +1705,9 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "traffic_class")) { __u32 tmp_value = 0; len = hex32_arg(&user_buffer[i], 2, &tmp_value); - if (len < 0) { + if (len < 0) return len; - } + i += len; if (len == 2) { pkt_dev->traffic_class = tmp_value; @@ -1649,6 +1718,18 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + if (!strcmp(name, "skb_priority")) { + len = num_arg(&user_buffer[i], 9, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->skb_priority = value; + sprintf(pg_result, "OK: skb_priority=%i", + pkt_dev->skb_priority); + return count; + } + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); return -EINVAL; } @@ -1670,7 +1751,7 @@ static const struct file_operations pktgen_if_fops = { static int pktgen_thread_show(struct seq_file *seq, void *v) { struct pktgen_thread *t = seq->private; - struct pktgen_dev *pkt_dev; + const struct pktgen_dev *pkt_dev; BUG_ON(!t); @@ -1679,13 +1760,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) if (pkt_dev->running) - seq_printf(seq, "%s ", pkt_dev->odev->name); + seq_printf(seq, "%s ", pkt_dev->odevname); seq_printf(seq, "\nStopped: "); list_for_each_entry(pkt_dev, &t->if_list, list) if (!pkt_dev->running) - seq_printf(seq, "%s ", pkt_dev->odev->name); + seq_printf(seq, "%s ", pkt_dev->odevname); if (t->result[0]) seq_printf(seq, "\nResult: %s\n", t->result); @@ -1701,9 +1782,9 @@ static ssize_t pktgen_thread_write(struct file *file, const char __user * user_buffer, size_t count, loff_t * offset) { - struct seq_file *seq = (struct seq_file *)file->private_data; + struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i = 0, max, len, ret; + int i, max, len, ret; char name[40]; char *pg_result; @@ -1712,12 +1793,12 @@ static ssize_t pktgen_thread_write(struct file *file, return -EINVAL; } - max = count - i; - len = count_trail_chars(&user_buffer[i], max); + max = count; + len = count_trail_chars(user_buffer, max); if (len < 0) return len; - i += len; + i = len; /* Read variable name */ @@ -1742,7 +1823,7 @@ static ssize_t pktgen_thread_write(struct file *file, name, (unsigned long)count); if (!t) { - printk(KERN_ERR "pktgen: ERROR: No thread\n"); + pr_err("ERROR: No thread\n"); ret = -EINVAL; goto out; } @@ -1808,9 +1889,10 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) { struct pktgen_thread *t; struct pktgen_dev *pkt_dev = NULL; + bool exact = (remove == FIND); list_for_each_entry(t, &pktgen_threads, th_list) { - pkt_dev = pktgen_find_dev(t, ifname); + pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { if_lock(t); @@ -1834,7 +1916,7 @@ static void pktgen_mark_device(const char *ifname) int i = 0; mutex_lock(&pktgen_thread_lock); - pr_debug("pktgen: pktgen_mark_device marking %s for removal\n", ifname); + pr_debug("%s: marking %s for removal\n", __func__, ifname); while (1) { @@ -1843,15 +1925,14 @@ static void pktgen_mark_device(const char *ifname) break; /* success */ mutex_unlock(&pktgen_thread_lock); - pr_debug("pktgen: pktgen_mark_device waiting for %s " - "to disappear....\n", ifname); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); mutex_lock(&pktgen_thread_lock); if (++i >= max_tries) { - printk(KERN_ERR "pktgen_mark_device: timed out after " - "waiting %d msec for device %s to be removed\n", - msec_per_try * i, ifname); + pr_err("%s: timed out after waiting %d msec for device %s to be removed\n", + __func__, msec_per_try * i, ifname); break; } @@ -1873,11 +1954,13 @@ static void pktgen_change_name(struct net_device *dev) remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); - pkt_dev->entry = create_proc_entry(dev->name, 0600, - pg_proc_dir); + pkt_dev->entry = proc_create_data(dev->name, 0600, + pg_proc_dir, + &pktgen_if_fops, + pkt_dev); if (!pkt_dev->entry) - printk(KERN_ERR "pktgen: can't move proc " - " entry for '%s'\n", dev->name); + pr_err("can't move proc entry for '%s'\n", + dev->name); break; } } @@ -1908,13 +1991,14 @@ static int pktgen_device_event(struct notifier_block *unused, return NOTIFY_DONE; } -static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, const char *ifname) +static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, + const char *ifname) { char b[IFNAMSIZ+5]; - int i = 0; + int i; - for(i=0; ifname[i] != '@'; i++) { - if(i == IFNAMSIZ) + for (i = 0; ifname[i] != '@'; i++) { + if (i == IFNAMSIZ) break; b[i] = ifname[i]; @@ -1940,15 +2024,15 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) odev = pktgen_dev_get_by_name(pkt_dev, ifname); if (!odev) { - printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", ifname); + pr_err("no such netdevice: \"%s\"\n", ifname); return -ENODEV; } if (odev->type != ARPHRD_ETHER) { - printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", ifname); + pr_err("not an ethernet device: \"%s\"\n", ifname); err = -EINVAL; } else if (!netif_running(odev)) { - printk(KERN_ERR "pktgen: device is down: \"%s\"\n", ifname); + pr_err("device is down: \"%s\"\n", ifname); err = -ENETDOWN; } else { pkt_dev->odev = odev; @@ -1967,8 +2051,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) int ntxq; if (!pkt_dev->odev) { - printk(KERN_ERR "pktgen: ERROR: pkt_dev->odev == NULL in " - "setup_inject.\n"); + pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n"); sprintf(pkt_dev->result, "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); return; @@ -1978,19 +2061,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ntxq = pkt_dev->odev->real_num_tx_queues; if (ntxq <= pkt_dev->queue_map_min) { - printk(KERN_WARNING "pktgen: WARNING: Requested " - "queue_map_min (zero-based) (%d) exceeds valid range " - "[0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_min, (ntxq ?: 1)- 1, ntxq, - pkt_dev->odev->name); + pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_min = ntxq - 1; } if (pkt_dev->queue_map_max >= ntxq) { - printk(KERN_WARNING "pktgen: WARNING: Requested " - "queue_map_max (zero-based) (%d) exceeds valid range " - "[0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_max, (ntxq ?: 1)- 1, ntxq, - pkt_dev->odev->name); + pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_max = ntxq - 1; } @@ -2030,15 +2109,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) */ rcu_read_lock(); - if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) { + idev = __in6_dev_get(pkt_dev->odev); + if (idev) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { - if (ifp->scope == IFA_LINK - && !(ifp-> - flags & IFA_F_TENTATIVE)) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & IFA_F_TENTATIVE)) { ipv6_addr_copy(&pkt_dev-> cur_in6_saddr, &ifp->addr); @@ -2050,8 +2129,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) } rcu_read_unlock(); if (err) - printk(KERN_ERR "pktgen: ERROR: IPv6 link " - "address not availble.\n"); + pr_err("ERROR: IPv6 link address not available\n"); } #endif } else { @@ -2089,27 +2167,45 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->nflows = 0; } -static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) + +static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) { - __u64 start; - __u64 now; + ktime_t start_time, end_time; + s64 remaining; + struct hrtimer_sleeper t; - start = now = getCurUs(); - while (now < spin_until_us) { - /* TODO: optimize sleeping behavior */ - if (spin_until_us - now > jiffies_to_usecs(1) + 1) - schedule_timeout_interruptible(1); - else if (spin_until_us - now > 100) { - if (!pkt_dev->running) - return; - if (need_resched()) + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, spin_until); + + remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); + if (remaining <= 0) { + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + return; + } + + start_time = ktime_now(); + if (remaining < 100000) + ndelay(remaining); /* really small just spin */ + else { + /* see do_nanosleep */ + hrtimer_init_sleeper(&t, current); + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) schedule(); - } - now = getCurUs(); + hrtimer_cancel(&t.timer); + } while (t.task && pkt_dev->running && !signal_pending(current)); + __set_current_state(TASK_RUNNING); } + end_time = ktime_now(); - pkt_dev->idle_acc += now - start; + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) @@ -2120,13 +2216,9 @@ static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); } -static inline int f_seen(struct pktgen_dev *pkt_dev, int flow) +static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow) { - - if (pkt_dev->flows[flow].flags & F_INIT) - return 1; - else - return 0; + return !!(pkt_dev->flows[flow].flags & F_INIT); } static inline int f_pick(struct pktgen_dev *pkt_dev) @@ -2160,12 +2252,13 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... */ +#define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { struct xfrm_state *x = pkt_dev->flows[flow].x; if (!x) { /*slow path: we dont already have xfrm_state*/ - x = xfrm_stateonly_find(&init_net, + x = xfrm_stateonly_find(&init_net, DUMMY_MARK, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, AF_INET, @@ -2174,7 +2267,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) if (x) { pkt_dev->flows[flow].x = x; set_pkt_overhead(pkt_dev); - pkt_dev->pkt_overhead+=x->props.header_len; + pkt_dev->pkt_overhead += x->props.header_len; } } @@ -2186,7 +2279,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_QUEUE_MAP_CPU) pkt_dev->cur_queue_map = smp_processor_id(); - else if (pkt_dev->queue_map_min < pkt_dev->queue_map_max) { + else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = random32() % @@ -2313,18 +2406,18 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (!(pkt_dev->flags & F_IPV6)) { - if ((imn = ntohl(pkt_dev->saddr_min)) < (imx = - ntohl(pkt_dev-> - saddr_max))) { + imn = ntohl(pkt_dev->saddr_min); + imx = ntohl(pkt_dev->saddr_max); + if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) t = random32() % (imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; - if (t > imx) { + if (t > imx) t = imn; - } + } pkt_dev->cur_saddr = htonl(t); } @@ -2435,19 +2528,19 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (err) goto error; - x->curlft.bytes +=skb->len; + x->curlft.bytes += skb->len; x->curlft.packets++; error: spin_unlock(&x->lock); return err; } -static inline void free_SAs(struct pktgen_dev *pkt_dev) +static void free_SAs(struct pktgen_dev *pkt_dev) { if (pkt_dev->cflows) { /* let go of the SAs if we have them */ - int i = 0; - for (; i < pkt_dev->cflows; i++) { + int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; if (x) { xfrm_state_put(x); @@ -2457,7 +2550,7 @@ static inline void free_SAs(struct pktgen_dev *pkt_dev) } } -static inline int process_ipsec(struct pktgen_dev *pkt_dev, +static int process_ipsec(struct pktgen_dev *pkt_dev, struct sk_buff *skb, __be16 protocol) { if (pkt_dev->flags & F_IPSEC_ON) { @@ -2467,11 +2560,11 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev, int ret; __u8 *eth; nhead = x->props.header_len - skb_headroom(skb); - if (nhead >0) { + if (nhead > 0) { ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); if (ret < 0) { - printk(KERN_ERR "Error expanding " - "ipsec packet %d\n",ret); + pr_err("Error expanding ipsec packet %d\n", + ret); goto err; } } @@ -2480,14 +2573,13 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev, skb_pull(skb, ETH_HLEN); ret = pktgen_output_ipsec(skb, pkt_dev); if (ret) { - printk(KERN_ERR "Error creating ipsec " - "packet %d\n",ret); + pr_err("Error creating ipsec packet %d\n", ret); goto err; } /* restore ll */ eth = (__u8 *) skb_push(skb, ETH_HLEN); memcpy(eth, pkt_dev->hh, 12); - *(u16 *) & eth[12] = protocol; + *(u16 *) ð[12] = protocol; } } return 1; @@ -2500,9 +2592,9 @@ err: static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned i; - for (i = 0; i < pkt_dev->nr_labels; i++) { + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; - } + mpls--; *mpls |= MPLS_STACK_BOTTOM; } @@ -2539,16 +2631,36 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; datalen = (odev->hard_header_len + 16) & ~0xf; - skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + datalen + - pkt_dev->pkt_overhead, GFP_ATOMIC); + + if (pkt_dev->flags & F_NODE) { + int node; + + if (pkt_dev->node >= 0) + node = pkt_dev->node; + else + node = numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = odev; + } + } + else + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); + if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } + prefetchw(skb->data); skb_reserve(skb, datalen); @@ -2579,6 +2691,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->transport_header = skb->network_header + sizeof(struct iphdr); skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + iph = ip_hdr(skb); udph = udp_hdr(skb); @@ -2603,6 +2717,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->protocol = IPPROTO_UDP; /* UDP */ iph->saddr = pkt_dev->cur_saddr; iph->daddr = pkt_dev->cur_daddr; + iph->id = htons(pkt_dev->ip_id); + pkt_dev->ip_id++; iph->frag_off = 0; iplen = 20 + 8 + datalen; iph->tot_len = htons(iplen); @@ -2614,24 +2730,26 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->dev = odev; skb->pkt_type = PACKET_HOST; - if (pkt_dev->nfrags <= 0) + if (pkt_dev->nfrags <= 0) { pgh = (struct pktgen_hdr *)skb_put(skb, datalen); - else { + memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr)); + } else { int frags = pkt_dev->nfrags; - int i; + int i, len; pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8); if (frags > MAX_SKB_FRAGS) frags = MAX_SKB_FRAGS; if (datalen > frags * PAGE_SIZE) { - skb_put(skb, datalen - frags * PAGE_SIZE); + len = datalen - frags * PAGE_SIZE; + memset(skb_put(skb, len), 0, len); datalen = frags * PAGE_SIZE; } i = 0; while (datalen > 0) { - struct page *page = alloc_pages(GFP_KERNEL, 0); + struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); skb_shinfo(skb)->frags[i].page = page; skb_shinfo(skb)->frags[i].page_offset = 0; skb_shinfo(skb)->frags[i].size = @@ -2668,8 +2786,9 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, } } - /* Stamp the time, and sequence number, convert them to network byte order */ - + /* Stamp the time, and sequence number, + * convert them to network byte order + */ if (pgh) { struct timeval timestamp; @@ -2879,15 +2998,17 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; - skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16 + - pkt_dev->pkt_overhead, GFP_ATOMIC); + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } + prefetchw(skb->data); skb_reserve(skb, 16); @@ -2918,11 +3039,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; iph = ipv6_hdr(skb); udph = udp_hdr(skb); memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *) ð[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - @@ -2932,8 +3054,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, if (datalen < sizeof(struct pktgen_hdr)) { datalen = sizeof(struct pktgen_hdr); if (net_ratelimit()) - printk(KERN_INFO "pktgen: increased datalen to %d\n", - datalen); + pr_info("increased datalen to %d\n", datalen); } udph->source = htons(pkt_dev->cur_udp_src); @@ -3016,8 +3137,10 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, } } - /* Stamp the time, and sequence number, convert them to network byte order */ - /* should we update cloned packets too ? */ + /* Stamp the time, and sequence number, + * convert them to network byte order + * should we update cloned packets too ? + */ if (pgh) { struct timeval timestamp; @@ -3033,8 +3156,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, return skb; } -static inline struct sk_buff *fill_packet(struct net_device *odev, - struct pktgen_dev *pkt_dev) +static struct sk_buff *fill_packet(struct net_device *odev, + struct pktgen_dev *pkt_dev) { if (pkt_dev->flags & F_IPV6) return fill_packet_ipv6(odev, pkt_dev); @@ -3058,7 +3181,7 @@ static void pktgen_run(struct pktgen_thread *t) struct pktgen_dev *pkt_dev; int started = 0; - pr_debug("pktgen: entering pktgen_run. %p\n", t); + func_enter(); if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) { @@ -3072,9 +3195,9 @@ static void pktgen_run(struct pktgen_thread *t) pktgen_clear_counters(pkt_dev); pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; - pkt_dev->started_at = getCurUs(); - pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */ - pkt_dev->next_tx_ns = 0; + pkt_dev->started_at = + pkt_dev->next_tx = ktime_now(); + set_pkt_overhead(pkt_dev); strcpy(pkt_dev->result, "Starting"); @@ -3091,7 +3214,7 @@ static void pktgen_stop_all_threads_ifs(void) { struct pktgen_thread *t; - pr_debug("pktgen: entering pktgen_stop_all_threads_ifs.\n"); + func_enter(); mutex_lock(&pktgen_thread_lock); @@ -3101,17 +3224,14 @@ static void pktgen_stop_all_threads_ifs(void) mutex_unlock(&pktgen_thread_lock); } -static int thread_is_running(struct pktgen_thread *t) +static int thread_is_running(const struct pktgen_thread *t) { - struct pktgen_dev *pkt_dev; - int res = 0; + const struct pktgen_dev *pkt_dev; list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) { - res = 1; - break; - } - return res; + if (pkt_dev->running) + return 1; + return 0; } static int pktgen_wait_thread_run(struct pktgen_thread *t) @@ -3159,7 +3279,7 @@ static void pktgen_run_all_threads(void) { struct pktgen_thread *t; - pr_debug("pktgen: entering pktgen_run_all_threads.\n"); + func_enter(); mutex_lock(&pktgen_thread_lock); @@ -3168,7 +3288,8 @@ static void pktgen_run_all_threads(void) mutex_unlock(&pktgen_thread_lock); - schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); pktgen_wait_all_threads_run(); } @@ -3177,7 +3298,7 @@ static void pktgen_reset_all_threads(void) { struct pktgen_thread *t; - pr_debug("pktgen: entering pktgen_reset_all_threads.\n"); + func_enter(); mutex_lock(&pktgen_thread_lock); @@ -3186,35 +3307,29 @@ static void pktgen_reset_all_threads(void) mutex_unlock(&pktgen_thread_lock); - schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); pktgen_wait_all_threads_run(); } static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) { - __u64 total_us, bps, mbps, pps, idle; + __u64 bps, mbps, pps; char *p = pkt_dev->result; - - total_us = pkt_dev->stopped_at - pkt_dev->started_at; - - idle = pkt_dev->idle_acc; - - p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", - (unsigned long long)total_us, - (unsigned long long)(total_us - idle), - (unsigned long long)idle, + ktime_t elapsed = ktime_sub(pkt_dev->stopped_at, + pkt_dev->started_at); + ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); + + p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n", + (unsigned long long)ktime_to_us(elapsed), + (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), + (unsigned long long)ktime_to_us(idle), (unsigned long long)pkt_dev->sofar, pkt_dev->cur_pkt_size, nr_frags); - pps = pkt_dev->sofar * USEC_PER_SEC; - - while ((total_us >> 32) != 0) { - pps >>= 1; - total_us >>= 1; - } - - do_div(pps, total_us); + pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, + ktime_to_ns(elapsed)); bps = pps * 8 * pkt_dev->cur_pkt_size; @@ -3228,18 +3343,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) } /* Set stopped-at timer, remove from running list, do counters & statistics */ - static int pktgen_stop_device(struct pktgen_dev *pkt_dev) { int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; if (!pkt_dev->running) { - printk(KERN_WARNING "pktgen: interface: %s is already " - "stopped\n", pkt_dev->odev->name); + pr_warning("interface: %s is already stopped\n", + pkt_dev->odevname); return -EINVAL; } - pkt_dev->stopped_at = getCurUs(); + kfree_skb(pkt_dev->skb); + pkt_dev->skb = NULL; + pkt_dev->stopped_at = ktime_now(); pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3258,7 +3374,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) continue; if (best == NULL) best = pkt_dev; - else if (pkt_dev->next_tx_us < best->next_tx_us) + else if (ktime_lt(pkt_dev->next_tx, best->next_tx)) best = pkt_dev; } if_unlock(t); @@ -3269,15 +3385,12 @@ static void pktgen_stop(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev; - pr_debug("pktgen: entering pktgen_stop\n"); + func_enter(); if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); - kfree_skb(pkt_dev->skb); - - pkt_dev->skb = NULL; } if_unlock(t); @@ -3292,7 +3405,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) struct list_head *q, *n; struct pktgen_dev *cur; - pr_debug("pktgen: entering pktgen_rem_one_if\n"); + func_enter(); if_lock(t); @@ -3318,9 +3431,10 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) struct list_head *q, *n; struct pktgen_dev *cur; + func_enter(); + /* Remove all devices, free mem */ - pr_debug("pktgen: entering pktgen_rem_all_ifs\n"); if_lock(t); list_for_each_safe(q, n, &t->if_list) { @@ -3341,163 +3455,123 @@ static void pktgen_rem_thread(struct pktgen_thread *t) remove_proc_entry(t->tsk->comm, pg_proc_dir); - mutex_lock(&pktgen_thread_lock); +} - list_del(&t->th_list); +static void pktgen_resched(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_now(); + schedule(); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +} - mutex_unlock(&pktgen_thread_lock); +static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_now(); + + while (atomic_read(&(pkt_dev->skb->users)) != 1) { + if (signal_pending(current)) + break; + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); } -static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) +static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; - int (*xmit)(struct sk_buff *, struct net_device *) + netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; - __u64 idle_start = 0; u16 queue_map; int ret; - if (pkt_dev->delay_us || pkt_dev->delay_ns) { - u64 now; - - now = getCurUs(); - if (now < pkt_dev->next_tx_us) - spin(pkt_dev, pkt_dev->next_tx_us); - - /* This is max DELAY, this has special meaning of - * "never transmit" - */ - if (pkt_dev->delay_us == 0x7FFFFFFF) { - pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us; - pkt_dev->next_tx_ns = pkt_dev->delay_ns; - goto out; - } + /* If device is offline, then don't send */ + if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) { + pktgen_stop_device(pkt_dev); + return; } - if (!pkt_dev->skb) { - set_cur_queue_map(pkt_dev); - queue_map = pkt_dev->cur_queue_map; - } else { - queue_map = skb_get_queue_mapping(pkt_dev->skb); + /* This is max DELAY, this has special meaning of + * "never transmit" + */ + if (unlikely(pkt_dev->delay == ULLONG_MAX)) { + pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); + return; } - txq = netdev_get_tx_queue(odev, queue_map); - if (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq) || - need_resched()) { - idle_start = getCurUs(); - - if (!netif_running(odev)) { - pktgen_stop_device(pkt_dev); - kfree_skb(pkt_dev->skb); - pkt_dev->skb = NULL; - goto out; - } - if (need_resched()) - schedule(); - - pkt_dev->idle_acc += getCurUs() - idle_start; + /* If no skb or clone count exhausted then get new one */ + if (!pkt_dev->skb || (pkt_dev->last_ok && + ++pkt_dev->clone_count >= pkt_dev->clone_skb)) { + /* build a new pkt */ + kfree_skb(pkt_dev->skb); - if (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq)) { - pkt_dev->next_tx_us = getCurUs(); /* TODO */ - pkt_dev->next_tx_ns = 0; - goto out; /* Try the next interface */ + pkt_dev->skb = fill_packet(odev, pkt_dev); + if (pkt_dev->skb == NULL) { + pr_err("ERROR: couldn't allocate skb in fill_packet\n"); + schedule(); + pkt_dev->clone_count--; /* back out increment, OOM */ + return; } + pkt_dev->last_pkt_size = pkt_dev->skb->len; + pkt_dev->allocated_skbs++; + pkt_dev->clone_count = 0; /* reset counter */ } - if (pkt_dev->last_ok || !pkt_dev->skb) { - if ((++pkt_dev->clone_count >= pkt_dev->clone_skb) - || (!pkt_dev->skb)) { - /* build a new pkt */ - kfree_skb(pkt_dev->skb); - - pkt_dev->skb = fill_packet(odev, pkt_dev); - if (pkt_dev->skb == NULL) { - printk(KERN_ERR "pktgen: ERROR: couldn't " - "allocate skb in fill_packet.\n"); - schedule(); - pkt_dev->clone_count--; /* back out increment, OOM */ - goto out; - } - pkt_dev->allocated_skbs++; - pkt_dev->clone_count = 0; /* reset counter */ - } - } + if (pkt_dev->delay && pkt_dev->last_ok) + spin(pkt_dev, pkt_dev->next_tx); - /* fill_packet() might have changed the queue */ queue_map = skb_get_queue_mapping(pkt_dev->skb); txq = netdev_get_tx_queue(odev, queue_map); __netif_tx_lock_bh(txq); - if (!netif_tx_queue_stopped(txq) && - !netif_tx_queue_frozen(txq)) { - - atomic_inc(&(pkt_dev->skb->users)); - retry_now: - ret = (*xmit)(pkt_dev->skb, odev); - if (likely(ret == NETDEV_TX_OK)) { - txq_trans_update(txq); - pkt_dev->last_ok = 1; - pkt_dev->sofar++; - pkt_dev->seq_num++; - pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; - - } else if (ret == NETDEV_TX_LOCKED - && (odev->features & NETIF_F_LLTX)) { - cpu_relax(); - goto retry_now; - } else { /* Retry it next time */ - - atomic_dec(&(pkt_dev->skb->users)); - - if (debug && net_ratelimit()) - printk(KERN_INFO "pktgen: Hard xmit error\n"); - - pkt_dev->errors++; - pkt_dev->last_ok = 0; - } - - pkt_dev->next_tx_us = getCurUs(); - pkt_dev->next_tx_ns = 0; - pkt_dev->next_tx_us += pkt_dev->delay_us; - pkt_dev->next_tx_ns += pkt_dev->delay_ns; - - if (pkt_dev->next_tx_ns > 1000) { - pkt_dev->next_tx_us++; - pkt_dev->next_tx_ns -= 1000; - } - } - - else { /* Retry it next time */ + if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { + ret = NETDEV_TX_BUSY; + pkt_dev->last_ok = 0; + goto unlock; + } + atomic_inc(&(pkt_dev->skb->users)); + ret = (*xmit)(pkt_dev->skb, odev); + + switch (ret) { + case NETDEV_TX_OK: + txq_trans_update(txq); + pkt_dev->last_ok = 1; + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + case NET_XMIT_POLICED: + /* skb has been consumed */ + pkt_dev->errors++; + break; + default: /* Drivers are not supposed to return other values! */ + if (net_ratelimit()) + pr_info("%s xmit error: %d\n", pkt_dev->odevname, ret); + pkt_dev->errors++; + /* fallthru */ + case NETDEV_TX_LOCKED: + case NETDEV_TX_BUSY: + /* Retry it next time */ + atomic_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; - pkt_dev->next_tx_us = getCurUs(); /* TODO */ - pkt_dev->next_tx_ns = 0; } - +unlock: __netif_tx_unlock_bh(txq); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { - if (atomic_read(&(pkt_dev->skb->users)) != 1) { - idle_start = getCurUs(); - while (atomic_read(&(pkt_dev->skb->users)) != 1) { - if (signal_pending(current)) { - break; - } - schedule(); - } - pkt_dev->idle_acc += getCurUs() - idle_start; - } + pktgen_wait_for_skb(pkt_dev); /* Done with this */ pktgen_stop_device(pkt_dev); - kfree_skb(pkt_dev->skb); - pkt_dev->skb = NULL; } -out:; } /* @@ -3516,7 +3590,7 @@ static int pktgen_thread_worker(void *arg) init_waitqueue_head(&t->queue); complete(&t->start_done); - pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); + pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); set_current_state(TASK_INTERRUPTIBLE); @@ -3525,20 +3599,27 @@ static int pktgen_thread_worker(void *arg) while (!kthread_should_stop()) { pkt_dev = next_to_run(t); - if (!pkt_dev && - (t->control & (T_STOP | T_RUN | T_REMDEVALL | T_REMDEV)) - == 0) { - prepare_to_wait(&(t->queue), &wait, - TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 10); - finish_wait(&(t->queue), &wait); + if (unlikely(!pkt_dev && t->control == 0)) { + if (pktgen_exiting) + break; + wait_event_interruptible_timeout(t->queue, + t->control != 0, + HZ/10); + try_to_freeze(); + continue; } __set_current_state(TASK_RUNNING); - if (pkt_dev) + if (likely(pkt_dev)) { pktgen_xmit(pkt_dev); + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + if (t->control & T_STOP) { pktgen_stop(t); t->control &= ~(T_STOP); @@ -3564,32 +3645,44 @@ static int pktgen_thread_worker(void *arg) set_current_state(TASK_INTERRUPTIBLE); } - pr_debug("pktgen: %s stopping all device\n", t->tsk->comm); + pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); - pr_debug("pktgen: %s removing all device\n", t->tsk->comm); + pr_debug("%s removing all device\n", t->tsk->comm); pktgen_rem_all_ifs(t); - pr_debug("pktgen: %s removing thread.\n", t->tsk->comm); + pr_debug("%s removing thread\n", t->tsk->comm); pktgen_rem_thread(t); + /* Wait for kthread_stop */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; } static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, - const char *ifname) + const char *ifname, bool exact) { struct pktgen_dev *p, *pkt_dev = NULL; - if_lock(t); + size_t len = strlen(ifname); + if_lock(t); list_for_each_entry(p, &t->if_list, list) - if (strncmp(p->odev->name, ifname, IFNAMSIZ) == 0) { + if (strncmp(p->odevname, ifname, len) == 0) { + if (p->odevname[len]) { + if (exact || p->odevname[len] != '@') + continue; + } pkt_dev = p; break; } if_unlock(t); - pr_debug("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev); + pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3605,8 +3698,7 @@ static int add_dev_to_thread(struct pktgen_thread *t, if_lock(t); if (pkt_dev->pg_thread) { - printk(KERN_ERR "pktgen: ERROR: already assigned " - "to a thread.\n"); + pr_err("ERROR: already assigned to a thread\n"); rv = -EBUSY; goto out; } @@ -3626,20 +3718,23 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) { struct pktgen_dev *pkt_dev; int err; + int node = cpu_to_node(t->cpu); /* We don't allow a device to be on several threads */ pkt_dev = __pktgen_NN_threads(ifname, FIND); if (pkt_dev) { - printk(KERN_ERR "pktgen: ERROR: interface already used.\n"); + pr_err("ERROR: interface already used\n"); return -EBUSY; } - pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL); + pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node); if (!pkt_dev) return -ENOMEM; - pkt_dev->flows = vmalloc(MAX_CFLOWS * sizeof(struct flow_state)); + strcpy(pkt_dev->odevname, ifname); + pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + node); if (pkt_dev->flows == NULL) { kfree(pkt_dev); return -ENOMEM; @@ -3651,8 +3746,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->max_pkt_size = ETH_ZLEN; pkt_dev->nfrags = 0; pkt_dev->clone_skb = pg_clone_skb_d; - pkt_dev->delay_us = pg_delay_d / 1000; - pkt_dev->delay_ns = pg_delay_d % 1000; + pkt_dev->delay = pg_delay_d; pkt_dev->count = pg_count_d; pkt_dev->sofar = 0; pkt_dev->udp_src_min = 9; /* sink port */ @@ -3666,6 +3760,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; + pkt_dev->node = -1; err = pktgen_setup_dev(pkt_dev, ifname); if (err) @@ -3674,7 +3769,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir, &pktgen_if_fops, pkt_dev); if (!pkt_dev->entry) { - printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n", + pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, ifname); err = -EINVAL; goto out2; @@ -3702,10 +3797,10 @@ static int __init pktgen_create_thread(int cpu) struct proc_dir_entry *pe; struct task_struct *p; - t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL); + t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL, + cpu_to_node(cpu)); if (!t) { - printk(KERN_ERR "pktgen: ERROR: out of memory, can't " - "create new thread.\n"); + pr_err("ERROR: out of memory, can't create new thread\n"); return -ENOMEM; } @@ -3719,8 +3814,7 @@ static int __init pktgen_create_thread(int cpu) p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); if (IS_ERR(p)) { - printk(KERN_ERR "pktgen: kernel_thread() failed " - "for cpu %d\n", t->cpu); + pr_err("kernel_thread() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); @@ -3731,7 +3825,7 @@ static int __init pktgen_create_thread(int cpu) pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, &pktgen_thread_fops, t); if (!pe) { - printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n", + pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, t->tsk->comm); kthread_stop(p); list_del(&t->th_list); @@ -3765,11 +3859,10 @@ static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) { - pr_debug("pktgen: remove_device pkt_dev=%p\n", pkt_dev); + pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { - printk(KERN_WARNING "pktgen: WARNING: trying to remove a " - "running interface, stopping it now.\n"); + pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); pktgen_stop_device(pkt_dev); } @@ -3800,7 +3893,7 @@ static int __init pg_init(void) int cpu; struct proc_dir_entry *pe; - printk(KERN_INFO "%s", version); + pr_info("%s", version); pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); if (!pg_proc_dir) @@ -3808,8 +3901,7 @@ static int __init pg_init(void) pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); if (pe == NULL) { - printk(KERN_ERR "pktgen: ERROR: cannot create %s " - "procfs entry.\n", PGCTRL); + pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); proc_net_remove(&init_net, PG_PROC_DIR); return -EINVAL; } @@ -3822,13 +3914,12 @@ static int __init pg_init(void) err = pktgen_create_thread(cpu); if (err) - printk(KERN_WARNING "pktgen: WARNING: Cannot create " - "thread for cpu %d (%d)\n", cpu, err); + pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", + cpu, err); } if (list_empty(&pktgen_threads)) { - printk(KERN_ERR "pktgen: ERROR: Initialization failed for " - "all threads\n"); + pr_err("ERROR: Initialization failed for all threads\n"); unregister_netdevice_notifier(&pktgen_notifier_block); remove_proc_entry(PGCTRL, pg_proc_dir); proc_net_remove(&init_net, PG_PROC_DIR); @@ -3842,10 +3933,9 @@ static void __exit pg_cleanup(void) { struct pktgen_thread *t; struct list_head *q, *n; - wait_queue_head_t queue; - init_waitqueue_head(&queue); /* Stop all interfaces & threads */ + pktgen_exiting = true; list_for_each_safe(q, n, &pktgen_threads) { t = list_entry(q, struct pktgen_thread, th_list); @@ -3864,10 +3954,15 @@ static void __exit pg_cleanup(void) module_init(pg_init); module_exit(pg_cleanup); -MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se"); +MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>"); MODULE_DESCRIPTION("Packet Generator tool"); MODULE_LICENSE("GPL"); +MODULE_VERSION(VERSION); module_param(pg_count_d, int, 0); +MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject"); module_param(pg_delay_d, int, 0); +MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)"); module_param(pg_clone_skb_d, int, 0); +MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet"); module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Enable debugging of pktgen module"); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 7552495aff7..182236b2510 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -33,6 +33,7 @@ * Note : Dont forget somaxconn that may limit backlog too. */ int sysctl_max_syn_backlog = 256; +EXPORT_SYMBOL(sysctl_max_syn_backlog); int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) @@ -45,9 +46,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = __vmalloc(lopt_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d78030f88bd..a5f7535aab5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -35,11 +35,10 @@ #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> -#include <linux/nsproxy.h> +#include <linux/pci.h> #include <asm/uaccess.h> #include <asm/system.h> -#include <asm/string.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -52,9 +51,9 @@ #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> +#include <net/net_namespace.h> -struct rtnl_link -{ +struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; }; @@ -65,6 +64,7 @@ void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_lock); void __rtnl_unlock(void) { @@ -76,18 +76,29 @@ void rtnl_unlock(void) /* This fellow will unlock it for us. */ netdev_run_todo(); } +EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_is_locked); -static struct rtnl_link *rtnl_msg_handlers[NPROTO]; +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rtnl_is_held(void) +{ + return lockdep_is_held(&rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_is_held); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ + +static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -107,7 +118,11 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) { struct rtnl_link *tab; - tab = rtnl_msg_handlers[protocol]; + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + if (tab == NULL || tab[msgindex].doit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; @@ -118,7 +133,11 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) { struct rtnl_link *tab; - tab = rtnl_msg_handlers[protocol]; + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + if (tab == NULL || tab[msgindex].dumpit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; @@ -148,7 +167,7 @@ int __rtnl_register(int protocol, int msgtype, struct rtnl_link *tab; int msgindex; - BUG_ON(protocol < 0 || protocol >= NPROTO); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); tab = rtnl_msg_handlers[protocol]; @@ -168,7 +187,6 @@ int __rtnl_register(int protocol, int msgtype, return 0; } - EXPORT_SYMBOL_GPL(__rtnl_register); /** @@ -188,7 +206,6 @@ void rtnl_register(int protocol, int msgtype, "protocol = %d, message type = %d\n", protocol, msgtype); } - EXPORT_SYMBOL_GPL(rtnl_register); /** @@ -202,7 +219,7 @@ int rtnl_unregister(int protocol, int msgtype) { int msgindex; - BUG_ON(protocol < 0 || protocol >= NPROTO); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); if (rtnl_msg_handlers[protocol] == NULL) @@ -213,7 +230,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } - EXPORT_SYMBOL_GPL(rtnl_unregister); /** @@ -225,12 +241,11 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - BUG_ON(protocol < 0 || protocol >= NPROTO); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); kfree(rtnl_msg_handlers[protocol]); rtnl_msg_handlers[protocol] = NULL; } - EXPORT_SYMBOL_GPL(rtnl_unregister_all); static LIST_HEAD(link_ops); @@ -248,12 +263,11 @@ static LIST_HEAD(link_ops); int __rtnl_link_register(struct rtnl_link_ops *ops) { if (!ops->dellink) - ops->dellink = unregister_netdevice; + ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); return 0; } - EXPORT_SYMBOL_GPL(__rtnl_link_register); /** @@ -271,29 +285,20 @@ int rtnl_link_register(struct rtnl_link_ops *ops) rtnl_unlock(); return err; } - EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; -restart: + LIST_HEAD(list_kill); + for_each_netdev(net, dev) { - if (dev->rtnl_link_ops == ops) { - ops->dellink(dev); - goto restart; - } + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); } + unregister_netdevice_many(&list_kill); } -void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -{ - rtnl_lock(); - __rtnl_kill_links(net, ops); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(rtnl_kill_links); - /** * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister @@ -309,7 +314,6 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } list_del(&ops->list); } - EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /** @@ -322,7 +326,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) __rtnl_link_unregister(ops); rtnl_unlock(); } - EXPORT_SYMBOL_GPL(rtnl_link_unregister); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) @@ -344,16 +347,106 @@ static size_t rtnl_link_get_size(const struct net_device *dev) if (!ops) return 0; - size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ - nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ - size += nlmsg_total_size(sizeof(struct nlattr)) + + size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) - size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); + + return size; +} + +static LIST_HEAD(rtnl_af_ops); + +static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +{ + const struct rtnl_af_ops *ops; + + list_for_each_entry(ops, &rtnl_af_ops, list) { + if (ops->family == family) + return ops; + } + + return NULL; +} + +/** + * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * The caller must hold the rtnl_mutex. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_af_register(struct rtnl_af_ops *ops) +{ + list_add_tail(&ops->list, &rtnl_af_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_af_register); + +/** + * rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * Returns 0 on success or a negative error code. + */ +int rtnl_af_register(struct rtnl_af_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_af_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_af_register); + +/** + * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_af_unregister); + +/** + * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + */ +void rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + rtnl_lock(); + __rtnl_af_unregister(ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_af_unregister); + +static size_t rtnl_link_get_af_size(const struct net_device *dev) +{ + struct rtnl_af_ops *af_ops; + size_t size; + + /* IFLA_AF_SPEC */ + size = nla_total_size(sizeof(struct nlattr)); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_link_af_size) { + /* AF_* + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + af_ops->get_link_af_size(dev); + } + } return size; } @@ -427,12 +520,13 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data struct rtattr *rta; int size = RTA_LENGTH(attrlen); - rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); rta->rta_type = attrtype; rta->rta_len = size; memcpy(RTA_DATA(rta), data, attrlen); memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); } +EXPORT_SYMBOL(__rta_fill); int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) { @@ -454,6 +548,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } +EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) @@ -466,6 +561,7 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, nlmsg_notify(rtnl, skb, pid, group, report, flags); } +EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { @@ -473,6 +569,7 @@ void rtnl_set_sk_err(struct net *net, u32 group, int error) netlink_set_err(rtnl, 0, group, error); } +EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { @@ -501,6 +598,7 @@ nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } +EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, u32 ts, u32 tsage, long expires, u32 error) @@ -520,14 +618,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } - EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = dev->operstate; - switch(transition) { + switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_UNKNOWN) && @@ -550,8 +647,21 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } } +static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + const struct ifinfomsg *ifm) +{ + unsigned int flags = ifm->ifi_flags; + + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | + (dev->flags & ~ifm->ifi_change); + + return flags; +} + static void copy_rtnl_link_stats(struct rtnl_link_stats *a, - const struct net_device_stats *b) + const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; @@ -580,9 +690,56 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; -}; +} -static inline size_t if_nlmsg_size(const struct net_device *dev) +static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) +{ + memcpy(v, b, sizeof(*b)); +} + +/* All VF info */ +static inline int rtnl_vfinfo_size(const struct net_device *dev) +{ + if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { + + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nla_total_size(sizeof(struct nlattr)); + size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size += num_vfs * + (nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate))); + return size; + } else + return 0; +} + +static size_t rtnl_port_size(const struct net_device *dev) +{ + size_t port_size = nla_total_size(4) /* PORT_VF */ + + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + + nla_total_size(sizeof(struct ifla_port_vsi)) + /* PORT_VSI_TYPE */ + + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + + nla_total_size(1) /* PROT_VDP_REQUEST */ + + nla_total_size(2); /* PORT_VDP_RESPONSE */ + size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); + size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + if (dev_num_vf(dev->dev.parent)) + return port_self_size + vf_ports_size + + vf_port_size * dev_num_vf(dev->dev.parent); + else + return port_self_size; +} + +static noinline size_t if_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ @@ -590,6 +747,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -599,18 +757,98 @@ static inline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ - + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ + + nla_total_size(4) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ +} + +static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *vf_ports; + struct nlattr *vf_port; + int vf; + int err; + + vf_ports = nla_nest_start(skb, IFLA_VF_PORTS); + if (!vf_ports) + return -EMSGSIZE; + + for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { + vf_port = nla_nest_start(skb, IFLA_VF_PORT); + if (!vf_port) + goto nla_put_failure; + NLA_PUT_U32(skb, IFLA_PORT_VF, vf); + err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); + if (err == -EMSGSIZE) + goto nla_put_failure; + if (err) { + nla_nest_cancel(skb, vf_port); + continue; + } + nla_nest_end(skb, vf_port); + } + + nla_nest_end(skb, vf_ports); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, vf_ports); + return -EMSGSIZE; +} + +static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *port_self; + int err; + + port_self = nla_nest_start(skb, IFLA_PORT_SELF); + if (!port_self) + return -EMSGSIZE; + + err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); + if (err) { + nla_nest_cancel(skb, port_self); + return (err == -EMSGSIZE) ? err : 0; + } + + nla_nest_end(skb, port_self); + + return 0; +} + +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + + err = rtnl_port_self_fill(skb, dev); + if (err) + return err; + + if (dev_num_vf(dev->dev.parent)) { + err = rtnl_vf_ports_fill(skb, dev); + if (err) + return err; + } + + return 0; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags) { - struct netdev_queue *txq; struct ifinfomsg *ifm; struct nlmsghdr *nlh; - const struct net_device_stats *stats; - struct nlattr *attr; + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats; + struct nlattr *attr, *af_spec; + struct rtnl_af_ops *af_ops; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) @@ -637,9 +875,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (dev->master) NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); - txq = netdev_get_tx_queue(dev, 0); - if (txq->qdisc_sleeping) - NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id); + if (dev->qdisc) + NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id); if (dev->ifalias) NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias); @@ -666,14 +903,90 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (attr == NULL) goto nla_put_failure; - stats = dev_get_stats(dev); + stats = dev_get_stats(dev, &temp); copy_rtnl_link_stats(nla_data(attr), stats); + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (attr == NULL) + goto nla_put_failure; + copy_rtnl_link_stats64(nla_data(attr), stats); + + if (dev->dev.parent) + NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); + + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { + int i; + + struct nlattr *vfinfo, *vf; + int num_vfs = dev_num_vf(dev->dev.parent); + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + goto nla_put_failure; + for (i = 0; i < num_vfs; i++) { + struct ifla_vf_info ivi; + struct ifla_vf_mac vf_mac; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_tx_rate vf_tx_rate; + if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) + break; + vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.tx_rate; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); + NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); + NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); + nla_nest_end(skb, vf); + } + nla_nest_end(skb, vfinfo); + } + + if (rtnl_port_fill(skb, dev)) + goto nla_put_failure; + if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } + if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_link_af) { + struct nlattr *af; + int err; + + if (!(af = nla_nest_start(skb, af_ops->family))) + goto nla_put_failure; + + err = af_ops->fill_link_af(skb, dev); + + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, af_spec); + return nlmsg_end(skb, nlh); nla_put_failure: @@ -684,22 +997,33 @@ nla_put_failure: static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int idx; - int s_idx = cb->args[0]; + int h, s_h; + int idx = 0, s_idx; struct net_device *dev; - - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0) - break; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI) <= 0) + goto out; cont: - idx++; + idx++; + } } - cb->args[0] = idx; +out: + cb->args[1] = idx; + cb->args[0] = h; return skb->len; } @@ -718,31 +1042,58 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, + [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, + [IFLA_VF_PORTS] = { .type = NLA_NESTED }, + [IFLA_PORT_SELF] = { .type = NLA_NESTED }, + [IFLA_AF_SPEC] = { .type = NLA_NESTED }, }; +EXPORT_SYMBOL(ifla_policy); static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; -static struct net *get_net_ns_by_pid(pid_t pid) +static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { + [IFLA_VF_INFO] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { + [IFLA_VF_MAC] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_VLAN] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_tx_rate) }, +}; + +static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { + [IFLA_PORT_VF] = { .type = NLA_U32 }, + [IFLA_PORT_PROFILE] = { .type = NLA_STRING, + .len = PORT_PROFILE_MAX }, + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi)}, + [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, + .len = PORT_UUID_MAX }, + [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, + .len = PORT_UUID_MAX }, + [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, + [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, +}; + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { - struct task_struct *tsk; struct net *net; - - /* Lookup the network namespace */ - net = ERR_PTR(-ESRCH); - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) { - struct nsproxy *nsproxy; - nsproxy = task_nsproxy(tsk); - if (nsproxy) - net = get_net(nsproxy->net_ns); - } - rcu_read_unlock(); + /* Examine the link attributes and figure out which + * network namespace we are talking about. + */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else + net = get_net(src_net); return net; } +EXPORT_SYMBOL(rtnl_link_get_net); static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { @@ -756,9 +1107,77 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem, err; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + return -EAFNOSUPPORT; + + if (!af_ops->set_link_af) + return -EOPNOTSUPP; + + if (af_ops->validate_link_af) { + err = af_ops->validate_link_af(dev, + tb[IFLA_AF_SPEC]); + if (err < 0) + return err; + } + } + } + return 0; } +static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +{ + int rem, err = -EINVAL; + struct nlattr *vf; + const struct net_device_ops *ops = dev->netdev_ops; + + nla_for_each_nested(vf, attr, rem) { + switch (nla_type(vf)) { + case IFLA_VF_MAC: { + struct ifla_vf_mac *ivm; + ivm = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + break; + } + case IFLA_VF_VLAN: { + struct ifla_vf_vlan *ivv; + ivv = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, + ivv->vlan, + ivv->qos); + break; + } + case IFLA_VF_TX_RATE: { + struct ifla_vf_tx_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_tx_rate) + err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, + ivt->rate); + break; + } + default: + err = -EINVAL; + break; + } + if (err) + break; + } + return err; +} + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { @@ -767,8 +1186,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int err; if (tb[IFLA_NET_NS_PID]) { - struct net *net; - net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + struct net *net = rtnl_link_get_net(dev_net(dev), tb); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; @@ -873,13 +1291,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, } if (ifm->ifi_flags || ifm->ifi_change) { - unsigned int flags = ifm->ifi_flags; - - /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ - if (ifm->ifi_change) - flags = (flags & ifm->ifi_change) | - (dev->flags & ~ifm->ifi_change); - err = dev_change_flags(dev, flags); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); if (err < 0) goto errout; } @@ -896,6 +1308,85 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, write_unlock_bh(&dev_base_lock); } + if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *attr; + int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { + if (nla_type(attr) != IFLA_VF_INFO) { + err = -EINVAL; + goto errout; + } + err = do_setvfinfo(dev, attr); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_VF_PORTS]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + struct nlattr *attr; + int vf; + int rem; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_port) + goto errout; + + nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { + if (nla_type(attr) != IFLA_VF_PORT) + continue; + err = nla_parse_nested(port, IFLA_PORT_MAX, + attr, ifla_port_policy); + if (err < 0) + goto errout; + if (!port[IFLA_PORT_VF]) { + err = -EOPNOTSUPP; + goto errout; + } + vf = nla_get_u32(port[IFLA_PORT_VF]); + err = ops->ndo_set_vf_port(dev, vf, port); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_PORT_SELF]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + + err = nla_parse_nested(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], ifla_port_policy); + if (err < 0) + goto errout; + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_port) + err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + BUG(); + + err = af_ops->set_link_af(dev, af); + if (err < 0) + goto errout; + + modified = 1; + } + } err = 0; errout: @@ -931,9 +1422,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = dev_get_by_name(net, ifname); + dev = __dev_get_by_name(net, ifname); else goto errout; @@ -942,12 +1433,11 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) goto errout; } - if ((err = validate_linkmsg(dev, tb)) < 0) - goto errout_dev; + err = validate_linkmsg(dev, tb); + if (err < 0) + goto errout; err = do_setlink(dev, ifm, tb, ifname, 0); -errout_dev: - dev_put(dev); errout: return err; } @@ -984,30 +1474,60 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (!ops) return -EOPNOTSUPP; - ops->dellink(dev); + ops->dellink(dev, NULL); return 0; } -struct net_device *rtnl_create_link(struct net *net, char *ifname, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +{ + unsigned int old_flags; + int err; + + old_flags = dev->flags; + if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + if (err < 0) + return err; + } + + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + + __dev_notify_flags(dev, old_flags); + return 0; +} +EXPORT_SYMBOL(rtnl_configure_link); + +struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; + unsigned int num_queues = 1; + unsigned int real_num_queues = 1; + if (ops->get_tx_queues) { + err = ops->get_tx_queues(src_net, tb, &num_queues, + &real_num_queues); + if (err) + goto err; + } err = -ENOMEM; - dev = alloc_netdev(ops->priv_size, ifname, ops->setup); + dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); if (!dev) goto err; + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; + dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->real_num_tx_queues = real_num_queues; + if (strchr(dev->name, '%')) { err = dev_alloc_name(dev, dev->name); if (err < 0) goto err_free; } - dev_net_set(dev, net); - dev->rtnl_link_ops = ops; - if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); if (tb[IFLA_ADDRESS]) @@ -1030,6 +1550,7 @@ err_free: err: return ERR_PTR(err); } +EXPORT_SYMBOL(rtnl_create_link); static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -1063,7 +1584,8 @@ replay: else dev = NULL; - if ((err = validate_linkmsg(dev, tb)) < 0) + err = validate_linkmsg(dev, tb); + if (err < 0) return err; if (tb[IFLA_LINKINFO]) { @@ -1084,6 +1606,7 @@ replay: if (1) { struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct net *dest_net; if (ops) { if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { @@ -1126,7 +1649,7 @@ replay: if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -ENODEV; - if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change) + if (ifm->ifi_index) return -EOPNOTSUPP; if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; @@ -1148,17 +1671,26 @@ replay: if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - dev = rtnl_create_link(net, ifname, ops, tb); + dest_net = rtnl_link_get_net(net, tb); + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); if (IS_ERR(dev)) err = PTR_ERR(dev); else if (ops->newlink) - err = ops->newlink(dev, tb, data); + err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); if (err < 0 && !IS_ERR(dev)) free_netdev(dev); + if (err < 0) + goto out; + + err = rtnl_configure_link(dev, ifm); + if (err < 0) + unregister_netdevice(dev); +out: + put_net(dest_net); return err; } } @@ -1167,6 +1699,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -1176,19 +1709,23 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (err < 0) return err; + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) { - dev = dev_get_by_index(net, ifm->ifi_index); - if (dev == NULL) - return -ENODEV; - } else + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else return -EINVAL; + if (dev == NULL) + return -ENODEV; + nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); - if (nskb == NULL) { - err = -ENOBUFS; - goto errout; - } + if (nskb == NULL) + return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, 0); @@ -1196,11 +1733,8 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); - goto errout; - } - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); -errout: - dev_put(dev); + } else + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); return err; } @@ -1212,7 +1746,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (s_idx == 0) s_idx = 1; - for (idx=1; idx<NPROTO; idx++) { + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { int type = cb->nlh->nlmsg_type-RTM_BASE; if (idx < s_idx || idx == PF_PACKET) continue; @@ -1279,17 +1813,14 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) return 0; - family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) - return -EAFNOSUPPORT; - + family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; sz_idx = type>>2; kind = type&3; if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) return -EPERM; - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + if (kind == 2 && (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; @@ -1312,7 +1843,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_len > min_len) { int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); while (RTA_OK(attr, attrlen)) { unsigned flavor = attr->rta_type; @@ -1344,18 +1875,16 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi struct net_device *dev = ptr; switch (event) { - case NETDEV_UNREGISTER: - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); - break; - case NETDEV_REGISTER: - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - break; case NETDEV_UP: case NETDEV_DOWN: - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); - break; + case NETDEV_PRE_UP: + case NETDEV_POST_INIT: + case NETDEV_REGISTER: case NETDEV_CHANGE: + case NETDEV_PRE_TYPE_CHANGE: case NETDEV_GOING_DOWN: + case NETDEV_UNREGISTER: + case NETDEV_UNREGISTER_BATCH: break; default: rtmsg_ifinfo(RTM_NEWLINK, dev, 0); @@ -1369,7 +1898,7 @@ static struct notifier_block rtnetlink_dev_notifier = { }; -static int rtnetlink_net_init(struct net *net) +static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, @@ -1380,7 +1909,7 @@ static int rtnetlink_net_init(struct net *net) return 0; } -static void rtnetlink_net_exit(struct net *net) +static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; @@ -1418,14 +1947,3 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); } -EXPORT_SYMBOL(__rta_fill); -EXPORT_SYMBOL(rtnetlink_put_metrics); -EXPORT_SYMBOL(rtnl_lock); -EXPORT_SYMBOL(rtnl_trylock); -EXPORT_SYMBOL(rtnl_unlock); -EXPORT_SYMBOL(rtnl_is_locked); -EXPORT_SYMBOL(rtnl_unicast); -EXPORT_SYMBOL(rtnl_notify); -EXPORT_SYMBOL(rtnl_set_sk_err); -EXPORT_SYMBOL(rtnl_create_link); -EXPORT_SYMBOL(ifla_policy); diff --git a/net/core/scm.c b/net/core/scm.c index b7ba91b074b..bbe45445080 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -26,6 +26,7 @@ #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> +#include <linux/slab.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -78,10 +79,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->max = SCM_MAX_FD; } fpp = &fpl->fp[fpl->count]; - if (fpl->count + num > SCM_MAX_FD) + if (fpl->count + num > fpl->max) return -EINVAL; /* @@ -129,6 +131,7 @@ void __scm_destroy(struct scm_cookie *scm) } } } +EXPORT_SYMBOL(__scm_destroy); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { @@ -156,6 +159,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: + if (!sock->ops || sock->ops->family != PF_UNIX) + goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) goto error; @@ -167,6 +172,30 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) err = scm_check_creds(&p->creds); if (err) goto error; + + if (pid_vnr(p->pid) != p->creds.pid) { + struct pid *pid; + err = -ESRCH; + pid = find_get_pid(p->creds.pid); + if (!pid) + goto error; + put_pid(p->pid); + p->pid = pid; + } + + if ((p->cred->euid != p->creds.uid) || + (p->cred->egid != p->creds.gid)) { + struct cred *cred; + err = -ENOMEM; + cred = prepare_creds(); + if (!cred) + goto error; + + cred->uid = cred->euid = p->creds.uid; + cred->gid = cred->egid = p->creds.uid; + put_cred(p->cred); + p->cred = cred; + } break; default: goto error; @@ -184,6 +213,7 @@ error: scm_destroy(p); return err; } +EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { @@ -222,6 +252,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) out: return err; } +EXPORT_SYMBOL(put_cmsg); void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { @@ -291,6 +322,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) */ __scm_destroy(scm); } +EXPORT_SYMBOL(scm_detach_fds); struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) { @@ -300,17 +332,13 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (!fpl) return NULL; - new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), + GFP_KERNEL); if (new_fpl) { - for (i=fpl->count-1; i>=0; i--) + for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); - memcpy(new_fpl, fpl, sizeof(*fpl)); + new_fpl->max = new_fpl->count; } return new_fpl; } - -EXPORT_SYMBOL(__scm_destroy); -EXPORT_SYMBOL(__scm_send); -EXPORT_SYMBOL(put_cmsg); -EXPORT_SYMBOL(scm_detach_fds); EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c deleted file mode 100644 index 79687dfd695..00000000000 --- a/net/core/skb_dma_map.c +++ /dev/null @@ -1,65 +0,0 @@ -/* skb_dma_map.c: DMA mapping helpers for socket buffers. - * - * Copyright (C) David S. Miller <davem@davemloft.net> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/dma-mapping.h> -#include <linux/skbuff.h> - -int skb_dma_map(struct device *dev, struct sk_buff *skb, - enum dma_data_direction dir) -{ - struct skb_shared_info *sp = skb_shinfo(skb); - dma_addr_t map; - int i; - - map = dma_map_single(dev, skb->data, - skb_headlen(skb), dir); - if (dma_mapping_error(dev, map)) - goto out_err; - - sp->dma_head = map; - for (i = 0; i < sp->nr_frags; i++) { - skb_frag_t *fp = &sp->frags[i]; - - map = dma_map_page(dev, fp->page, fp->page_offset, - fp->size, dir); - if (dma_mapping_error(dev, map)) - goto unwind; - sp->dma_maps[i] = map; - } - - return 0; - -unwind: - while (--i >= 0) { - skb_frag_t *fp = &sp->frags[i]; - - dma_unmap_page(dev, sp->dma_maps[i], - fp->size, dir); - } - dma_unmap_single(dev, sp->dma_head, - skb_headlen(skb), dir); -out_err: - return -ENOMEM; -} -EXPORT_SYMBOL(skb_dma_map); - -void skb_dma_unmap(struct device *dev, struct sk_buff *skb, - enum dma_data_direction dir) -{ - struct skb_shared_info *sp = skb_shinfo(skb); - int i; - - dma_unmap_single(dev, sp->dma_head, - skb_headlen(skb), dir); - for (i = 0; i < sp->nr_frags; i++) { - skb_frag_t *fp = &sp->frags[i]; - - dma_unmap_page(dev, sp->dma_maps[i], - fp->size, dir); - } -} -EXPORT_SYMBOL(skb_dma_unmap); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9e0597d189b..19d6c21220f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -93,7 +93,7 @@ static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket. */ -static struct pipe_buf_operations sock_pipe_buf_ops = { +static const struct pipe_buf_operations sock_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -117,7 +117,7 @@ static struct pipe_buf_operations sock_pipe_buf_ops = { * * Out of line support code for skb_put(). Not user callable. */ -void skb_over_panic(struct sk_buff *skb, int sz, void *here) +static void skb_over_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " "data:%p tail:%#lx end:%#lx dev:%s\n", @@ -126,7 +126,6 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : "<NULL>"); BUG(); } -EXPORT_SYMBOL(skb_over_panic); /** * skb_under_panic - private function @@ -137,7 +136,7 @@ EXPORT_SYMBOL(skb_over_panic); * Out of line support code for skb_push(). Not user callable. */ -void skb_under_panic(struct sk_buff *skb, int sz, void *here) +static void skb_under_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " "data:%p tail:%#lx end:%#lx dev:%s\n", @@ -146,7 +145,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : "<NULL>"); BUG(); } -EXPORT_SYMBOL(skb_under_panic); /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the @@ -183,12 +181,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); if (!skb) goto out; + prefetchw(skb); size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); if (!data) goto nodata; + prefetchw(data + size); /* * Only clear those fields we need to clear, not those that we will @@ -202,23 +202,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; - kmemcheck_annotate_bitfield(skb, flags1); - kmemcheck_annotate_bitfield(skb, flags2); #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->mac_header = ~0U; #endif /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - shinfo->nr_frags = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->tx_flags.flags = 0; - skb_frag_list_init(skb); - memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -256,10 +247,9 @@ EXPORT_SYMBOL(__alloc_skb); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask) { - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; struct sk_buff *skb; - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; @@ -268,16 +258,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, } EXPORT_SYMBOL(__netdev_alloc_skb); -struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) -{ - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; - struct page *page; - - page = alloc_pages_node(node, gfp_mask, 0); - return page; -} -EXPORT_SYMBOL(__netdev_alloc_page); - void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size) { @@ -347,7 +327,7 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); kfree(skb->head); @@ -473,6 +453,7 @@ void consume_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_consume_skb(skb); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); @@ -489,37 +470,34 @@ EXPORT_SYMBOL(consume_skb); * reference count dropping and cleans up the skbuff as if it * just came from __alloc_skb(). */ -int skb_recycle_check(struct sk_buff *skb, int skb_size) +bool skb_recycle_check(struct sk_buff *skb, int skb_size) { struct skb_shared_info *shinfo; + if (irqs_disabled()) + return false; + if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) - return 0; + return false; skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); if (skb_end_pointer(skb) - skb->head < skb_size) - return 0; + return false; if (skb_shared(skb) || skb_cloned(skb)) - return 0; + return false; skb_release_head_state(skb); + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - shinfo->nr_frags = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->tx_flags.flags = 0; - skb_frag_list_init(skb); - memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); memset(skb, 0, offsetof(struct sk_buff, tail)); skb->data = skb->head + NET_SKB_PAD; skb_reset_tail_pointer(skb); - return 1; + return true; } EXPORT_SYMBOL(skb_recycle_check); @@ -530,7 +508,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; - skb_dst_set(new, dst_clone(skb_dst(old))); + skb_dst_copy(new, old); + new->rxhash = old->rxhash; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -541,12 +520,13 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); new->priority = old->priority; + new->deliver_no_wcard = old->deliver_no_wcard; #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif new->protocol = old->protocol; new->mark = old->mark; - new->iif = old->iif; + new->skb_iif = old->skb_iif; __nf_copy(new, old); #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) @@ -559,9 +539,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #endif new->vlan_tci = old->vlan_tci; -#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) - new->do_not_encrypt = old->do_not_encrypt; -#endif skb_copy_secmark(new, old); } @@ -696,16 +673,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb->data - skb->head; - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end + skb->data_len, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); -#endif + int headerlen = skb_headroom(skb); + unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) return NULL; @@ -737,20 +708,14 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head, gfp_mask); -#endif + unsigned int size = skb_end_pointer(skb) - skb->head; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) goto out; /* Set the data pointer */ - skb_reserve(n, skb->data - skb->head); + skb_reserve(n, skb_headroom(skb)); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ @@ -770,7 +735,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(n)->nr_frags = i; } - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } @@ -802,12 +767,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, { int i; u8 *data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - int size = nhead + skb->end + ntail; -#else - int size = nhead + (skb->end - skb->head) + ntail; -#endif + int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; long off; + bool fastpath; BUG_ON(nhead < 0); @@ -816,31 +778,56 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, size = SKB_DATA_ALIGN(size); + /* Check if we can avoid taking references on fragments if we own + * the last reference on skb->head. (see skb_release_data()) + */ + if (!skb->cloned) + fastpath = true; + else { + int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; + + fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; + } + + if (fastpath && + size + sizeof(struct skb_shared_info) <= ksize(skb->head)) { + memmove(skb->head + size, skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + memmove(skb->head + nhead, skb->head, + skb_tail_pointer(skb) - skb->head); + off = nhead; + goto adjust_others; + } + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); if (!data) goto nodata; /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - memcpy(data + nhead, skb->head, skb->tail); -#else - memcpy(data + nhead, skb->head, skb->tail - skb->head); -#endif - memcpy(data + size, skb_end_pointer(skb), - sizeof(struct skb_shared_info)); + * optimized for the cases when header is void. + */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); - if (skb_has_frags(skb)) - skb_clone_fraglist(skb); + if (fastpath) { + kfree(skb->head); + } else { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); - skb_release_data(skb); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb); + } off = (data + nhead) - skb->head; skb->head = data; +adjust_others: skb->data += off; #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->end = size; @@ -854,7 +841,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->network_header += off; if (skb_mac_header_was_set(skb)) skb->mac_header += off; - skb->csum_start += nhead; + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += nhead; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -941,7 +930,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, copy_skb_header(n, skb); off = newheadroom - oldheadroom; - n->csum_start += off; + if (n->ip_summed == CHECKSUM_PARTIAL) + n->csum_start += off; #ifdef NET_SKBUFF_DATA_USES_OFFSET n->transport_header += off; n->network_header += off; @@ -1051,7 +1041,7 @@ EXPORT_SYMBOL(skb_push); */ unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) { - return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); + return skb_pull_inline(skb, len); } EXPORT_SYMBOL(skb_pull); @@ -1107,7 +1097,7 @@ drop_pages: for (; i < nfrags; i++) put_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } @@ -1202,7 +1192,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ - if (!skb_has_frags(skb)) + if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. */ @@ -1417,12 +1407,13 @@ new_page: /* * Fill page/offset/length into spd, if it can hold more pages. */ -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, +static inline int spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, struct sk_buff *skb, int linear, struct sock *sk) { - if (unlikely(spd->nr_pages == PIPE_BUFFERS)) + if (unlikely(spd->nr_pages == pipe->buffers)) return 1; if (linear) { @@ -1458,7 +1449,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, struct splice_pipe_desc *spd, int linear, - struct sock *sk) + struct sock *sk, + struct pipe_inode_info *pipe) { if (!*len) return 1; @@ -1481,7 +1473,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1496,9 +1488,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff, * Map linear and fragment data from the skb to spd. It reports failure if the * pipe is full or if we already spliced the requested length. */ -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *len, struct splice_pipe_desc *spd, - struct sock *sk) +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) { int seg; @@ -1508,7 +1500,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, 1, sk)) + offset, len, skb, spd, 1, sk, pipe)) return 1; /* @@ -1518,7 +1510,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0, sk)) + offset, len, skb, spd, 0, sk, pipe)) return 1; } @@ -1535,8 +1527,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1546,12 +1538,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, }; struct sk_buff *frag_iter; struct sock *sk = skb->sk; + int ret = 0; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * __skb_splice_bits() only fails if the output has no room left, * so no point in going over the frag_list for the error case. */ - if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) goto done; else if (!tlen) goto done; @@ -1562,14 +1558,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, skb_walk_frags(skb, frag_iter) { if (!tlen) break; - if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) break; } done: if (spd.nr_pages) { - int ret; - /* * Drop the socket lock, otherwise we have reverse * locking dependencies between sk_lock and i_mutex @@ -1582,10 +1576,10 @@ done: release_sock(sk); ret = splice_to_pipe(pipe, &spd); lock_sock(sk); - return ret; } - return 0; + splice_shrink_spd(pipe, &spd); + return ret; } /** @@ -1830,7 +1824,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) long csstart; if (skb->ip_summed == CHECKSUM_PARTIAL) - csstart = skb->csum_start - skb_headroom(skb); + csstart = skb_checksum_start_offset(skb); else csstart = skb_headlen(skb); @@ -2327,7 +2321,7 @@ next_skb: st->frag_data = NULL; } - if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; @@ -2490,7 +2484,6 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) skb_postpull_rcsum(skb, skb->data, len); return skb->data += len; } - EXPORT_SYMBOL_GPL(skb_pull_rcsum); /** @@ -2578,6 +2571,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) __copy_skb_header(nskb, skb); nskb->mac_len = skb->mac_len; + /* nskb and skb might have different headroom */ + if (nskb->ip_summed == CHECKSUM_PARTIAL) + nskb->csum_start += skb_headroom(nskb) - headroom; + skb_reset_mac_header(nskb); skb_set_network_header(nskb, skb->mac_len); nskb->transport_header = (nskb->network_header + @@ -2704,10 +2701,11 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = 1; goto done; - } + } else if (skb_gro_len(p) != pinfo->gso_size) + return -E2BIG; headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); + nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); if (unlikely(!nskb)) return -ENOMEM; @@ -2728,6 +2726,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; + pinfo->gso_size = 0; skb_header_release(p); nskb->prev = p; @@ -2892,7 +2891,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; /* Easy case. Most of packets will go this way. */ - if (!skb_has_frags(skb)) { + if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more @@ -2927,7 +2926,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1) || + skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } @@ -2936,7 +2935,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1)) { + skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ @@ -2970,6 +2969,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +static void sock_rmem_free(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + +/* + * Note: We dont mem charge error packets (no sk_forward_alloc changes) + */ +int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) + return -ENOMEM; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_rmem_free; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + skb_queue_tail(&sk->sk_error_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + return 0; +} +EXPORT_SYMBOL(sock_queue_err_skb); + void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { @@ -2991,7 +3018,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } else { /* * no hardware time stamps available, - * so keep the skb_shared_tx and only + * so keep the shared tx_flags and only * store software time stamp */ skb->tstamp = ktime_get_real(); @@ -3001,7 +3028,9 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + err = sock_queue_err_skb(sk, skb); + if (err) kfree_skb(skb); } diff --git a/net/core/sock.c b/net/core/sock.c index b0ba569bc97..a658aeb6d55 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -110,6 +110,7 @@ #include <linux/tcp.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -123,6 +124,7 @@ #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> +#include <net/cls_cgroup.h> #include <linux/filter.h> @@ -142,7 +144,7 @@ static struct lock_class_key af_family_slock_keys[AF_MAX]; * strings build-time, so that runtime initialization of socket * locks is fast): */ -static const char *af_family_key_strings[AF_MAX+1] = { +static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , @@ -155,10 +157,10 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , - "sk_lock-AF_IEEE802154", + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_MAX" }; -static const char *af_family_slock_key_strings[AF_MAX+1] = { +static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , @@ -171,10 +173,10 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , - "slock-AF_IEEE802154", + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_MAX" }; -static const char *af_family_clock_key_strings[AF_MAX+1] = { +static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , @@ -187,7 +189,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-27" , "clock-28" , "clock-AF_CAN" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , - "clock-AF_IEEE802154", + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_MAX" }; @@ -217,6 +219,11 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +int net_cls_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_cls_subsys_id); +#endif + static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { struct timeval tv; @@ -274,25 +281,27 @@ static void sock_disable_timestamp(struct sock *sk, int flag) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err = 0; + int err; int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK */ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf) { - err = -ENOMEM; - goto out; + atomic_inc(&sk->sk_drops); + return -ENOMEM; } err = sk_filter(sk, skb); if (err) - goto out; + return err; if (!sk_rmem_schedule(sk, skb->truesize)) { - err = -ENOBUFS; - goto out; + atomic_inc(&sk->sk_drops); + return -ENOBUFS; } skb->dev = NULL; @@ -305,12 +314,19 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ skb_len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + + spin_lock_irqsave(&list->lock, flags); + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb_len); -out: - return err; + return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -323,6 +339,10 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; + if (sk_rcvqueues_full(sk, skb)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } if (nested) bh_lock_sock_nested(sk); else @@ -336,8 +356,12 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else - sk_add_backlog(sk, skb); + } else if (sk_add_backlog(sk, skb)) { + bh_unlock_sock(sk); + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + bh_unlock_sock(sk); out: sock_put(sk); @@ -348,12 +372,19 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); +void sk_reset_txq(struct sock *sk) +{ + sk_tx_queue_clear(sk); +} +EXPORT_SYMBOL(sk_reset_txq); + struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { - struct dst_entry *dst = sk->sk_dst_cache; + struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - sk->sk_dst_cache = NULL; + sk_tx_queue_clear(sk); + rcu_assign_pointer(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } @@ -406,17 +437,18 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) if (copy_from_user(devname, optval, optlen)) goto out; - if (devname[0] == '\0') { - index = 0; - } else { - struct net_device *dev = dev_get_by_name(net, devname); + index = 0; + if (devname[0] != '\0') { + struct net_device *dev; + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; - - index = dev->ifindex; - dev_put(dev); } lock_sock(sk); @@ -446,7 +478,7 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) */ int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; int val; @@ -482,6 +514,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_reuse = valbool; break; case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: case SO_ERROR: ret = -ENOPROTOOPT; break; @@ -631,7 +665,7 @@ set_rcvbuf: case SO_TIMESTAMPING: if (val & ~SOF_TIMESTAMPING_MASK) { - ret = EINVAL; + ret = -EINVAL; break; } sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, @@ -700,6 +734,12 @@ set_rcvbuf: /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ + case SO_RXQ_OVFL: + if (valbool) + sock_set_flag(sk, SOCK_RXQ_OVFL); + else + sock_reset_flag(sk, SOCK_RXQ_OVFL); + break; default: ret = -ENOPROTOOPT; break; @@ -710,6 +750,20 @@ set_rcvbuf: EXPORT_SYMBOL(sock_setsockopt); +void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) +{ + ucred->pid = pid_vnr(pid); + ucred->uid = ucred->gid = -1; + if (cred) { + struct user_namespace *current_ns = current_user_ns(); + + ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid); + ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid); + } +} +EXPORT_SYMBOL_GPL(cred_to_ucred); + int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -721,7 +775,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct timeval tm; } v; - unsigned int lv = sizeof(int); + int lv = sizeof(int); int len; if (get_user(len, optlen)) @@ -764,6 +818,14 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_type; break; + case SO_PROTOCOL: + v.val = sk->sk_protocol; + break; + + case SO_DOMAIN: + v.val = sk->sk_family; + break; + case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) @@ -854,11 +916,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PEERCRED: - if (len > sizeof(sk->sk_peercred)) - len = sizeof(sk->sk_peercred); - if (copy_to_user(optval, &sk->sk_peercred, len)) + { + struct ucred peercred; + if (len > sizeof(peercred)) + len = sizeof(peercred); + cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; + } case SO_PEERNAME: { @@ -891,6 +957,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_mark; break; + case SO_RXQ_OVFL: + v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); + break; + default: return -ENOPROTOOPT; } @@ -919,19 +989,57 @@ static inline void sock_lock_init(struct sock *sk) af_family_keys + sk->sk_family); } +/* + * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, + * even temporarly, because of RCU lookups. sk_node should also be left as is. + * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end + */ static void sock_copy(struct sock *nsk, const struct sock *osk) { #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); + + memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); - memcpy(nsk, osk, osk->sk_prot->obj_size); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } +/* + * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * un-modified. Special care is taken when initializing object to zero. + */ +static inline void sk_prot_clear_nulls(struct sock *sk, int size) +{ + if (offsetof(struct sock, sk_node.next) != 0) + memset(sk, 0, offsetof(struct sock, sk_node.next)); + memset(&sk->sk_node.pprev, 0, + size - offsetof(struct sock, sk_node.pprev)); +} + +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) +{ + unsigned long nulls1, nulls2; + + nulls1 = offsetof(struct sock, __sk_common.skc_node.next); + nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); + if (nulls1 > nulls2) + swap(nulls1, nulls2); + + if (nulls1 != 0) + memset((char *)sk, 0, nulls1); + memset((char *)sk + nulls1 + sizeof(void *), 0, + nulls2 - nulls1 - sizeof(void *)); + memset((char *)sk + nulls2 + sizeof(void *), 0, + size - nulls2 - sizeof(void *)); +} +EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); + static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { @@ -939,9 +1047,17 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, struct kmem_cache *slab; slab = prot->slab; - if (slab != NULL) - sk = kmem_cache_alloc(slab, priority); - else + if (slab != NULL) { + sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); + if (!sk) + return sk; + if (priority & __GFP_ZERO) { + if (prot->clear_sk) + prot->clear_sk(sk, prot->obj_size); + else + sk_prot_clear_nulls(sk, prot->obj_size); + } + } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { @@ -952,6 +1068,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; + sk_tx_queue_clear(sk); } return sk; @@ -982,6 +1099,20 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } +#ifdef CONFIG_CGROUPS +void sock_update_classid(struct sock *sk) +{ + u32 classid; + + rcu_read_lock(); /* doing current task, which cannot vanish. */ + classid = task_cls_classid(current); + rcu_read_unlock(); + if (classid && classid != sk->sk_classid) + sk->sk_classid = classid; +} +EXPORT_SYMBOL(sock_update_classid); +#endif + /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1004,6 +1135,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); sock_net_set(sk, get_net(net)); + atomic_set(&sk->sk_wmem_alloc, 1); + + sock_update_classid(sk); } return sk; @@ -1017,7 +1151,8 @@ static void __sk_free(struct sock *sk) if (sk->sk_destruct) sk->sk_destruct(sk); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_check(sk->sk_filter, + atomic_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); rcu_assign_pointer(sk->sk_filter, NULL); @@ -1030,6 +1165,9 @@ static void __sk_free(struct sock *sk) printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", __func__, atomic_read(&sk->sk_omem_alloc)); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } @@ -1082,6 +1220,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* @@ -1095,7 +1234,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_async_wait_queue); #endif - rwlock_init(&newsk->sk_dst_lock); + spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1110,7 +1249,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - filter = newsk->sk_filter; + filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) sk_filter_charge(newsk, filter); @@ -1125,6 +1264,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); atomic_set(&newsk->sk_refcnt, 2); /* @@ -1140,10 +1284,14 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); - newsk->sk_sleep = NULL; + newsk->sk_wq = NULL; if (newsk->sk_prot->sockets_allocated) percpu_counter_inc(newsk->sk_prot->sockets_allocated); + + if (sock_flag(newsk, SOCK_TIMESTAMP) || + sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + net_enable_timestamp(); } out: return newsk; @@ -1156,6 +1304,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + sk->sk_route_caps &= ~sk->sk_route_nocaps; if (sk_can_gso(sk)) { if (dst->header_len) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; @@ -1169,12 +1318,12 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); void __init sk_init(void) { - if (num_physpages <= 4096) { + if (totalram_pages <= 4096) { sysctl_wmem_max = 32767; sysctl_rmem_max = 32767; sysctl_wmem_default = 32767; sysctl_rmem_default = 32767; - } else if (num_physpages >= 131072) { + } else if (totalram_pages >= 131072) { sysctl_wmem_max = 131071; sysctl_rmem_max = 131071; } @@ -1191,17 +1340,22 @@ void __init sk_init(void) void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; - int res; + unsigned int len = skb->truesize; - /* In case it might be waiting for more memory. */ - res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc); - if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + /* + * Keep a reference on sk_wmem_alloc, this will be released + * after sk_write_space() call + */ + atomic_sub(len - 1, &sk->sk_wmem_alloc); sk->sk_write_space(sk); + len = 1; + } /* - * if sk_wmem_alloc reached 0, we are last user and should - * free this sock, as sk_free() call could not do it. + * if sk_wmem_alloc reaches 0, we must finish what sk_free() + * could not do because of in-flight packets */ - if (res == 0) + if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); @@ -1212,9 +1366,10 @@ EXPORT_SYMBOL(sock_wfree); void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; + unsigned int len = skb->truesize; - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); - sk_mem_uncharge(skb->sk, skb->truesize); + atomic_sub(len, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); @@ -1223,9 +1378,9 @@ int sock_i_uid(struct sock *sk) { int uid; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); @@ -1234,9 +1389,9 @@ unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(sock_i_ino); @@ -1319,7 +1474,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) break; if (sk->sk_shutdown & SEND_SHUTDOWN) @@ -1328,7 +1483,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; timeo = schedule_timeout(timeo); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return timeo; } @@ -1429,6 +1584,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, EXPORT_SYMBOL(sock_alloc_send_skb); static void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); @@ -1445,6 +1602,8 @@ static void __lock_sock(struct sock *sk) } static void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { struct sk_buff *skb = sk->sk_backlog.head; @@ -1455,6 +1614,7 @@ static void __release_sock(struct sock *sk) do { struct sk_buff *next = skb->next; + WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); @@ -1471,6 +1631,12 @@ static void __release_sock(struct sock *sk) bh_lock_sock(sk); } while ((skb = sk->sk_backlog.head) != NULL); + + /* + * Doing the zeroing here guarantee we can not loop forever + * while a wild producer attempts to flood us. + */ + sk->sk_backlog.len = 0; } /** @@ -1488,11 +1654,11 @@ int sk_wait_data(struct sock *sk, long *timeo) int rc; DEFINE_WAIT(wait); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); @@ -1511,10 +1677,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); - int allocated; + long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_add_return(amt, prot->memory_allocated); + allocated = atomic_long_add_return(amt, prot->memory_allocated); /* Under limit. */ if (allocated <= prot->sysctl_mem[0]) { @@ -1572,7 +1738,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_sub(amt, prot->memory_allocated); + atomic_long_sub(amt, prot->memory_allocated); return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1585,12 +1751,12 @@ void __sk_mem_reclaim(struct sock *sk) { struct proto *prot = sk->sk_prot; - atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, prot->memory_allocated); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (prot->memory_pressure && *prot->memory_pressure && - (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) + (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) *prot->memory_pressure = 0; } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -1660,7 +1826,7 @@ int sock_no_shutdown(struct socket *sock, int how) EXPORT_SYMBOL(sock_no_shutdown); int sock_no_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { return -EOPNOTSUPP; } @@ -1714,41 +1880,53 @@ EXPORT_SYMBOL(sock_no_sendpage); static void sock_def_wakeup(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_all(sk->sk_sleep); - read_unlock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { - read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_poll(sk->sk_sleep, POLLERR); + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_readable(struct sock *sk, int len) { - read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); /* Should agree with poll, otherwise some programs break */ @@ -1756,7 +1934,7 @@ static void sock_def_write_space(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } static void sock_def_destruct(struct sock *sk) @@ -1810,12 +1988,12 @@ void sock_init_data(struct socket *sock, struct sock *sk) if (sock) { sk->sk_type = sock->type; - sk->sk_sleep = &sock->wait; + sk->sk_wq = sock->wq; sock->sk = sk; } else - sk->sk_sleep = NULL; + sk->sk_wq = NULL; - rwlock_init(&sk->sk_dst_lock); + spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, @@ -1830,9 +2008,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - sk->sk_peercred.pid = 0; - sk->sk_peercred.uid = -1; - sk->sk_peercred.gid = -1; + sk->sk_peer_pid = NULL; + sk->sk_peer_cred = NULL; sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; @@ -1840,8 +2017,12 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); atomic_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_wmem_alloc, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data); @@ -1879,6 +2060,39 @@ void release_sock(struct sock *sk) } EXPORT_SYMBOL(release_sock); +/** + * lock_sock_fast - fast version of lock_sock + * @sk: socket + * + * This version should be used for very small section, where process wont block + * return false if fast path is taken + * sk_lock.slock locked, owned = 0, BH disabled + * return true if slow path is taken + * sk_lock.slock unlocked, owned = 1, BH enabled + */ +bool lock_sock_fast(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + + if (!sk->sk_lock.owned) + /* + * Note : We must disable BH + */ + return false; + + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + local_bh_enable(); + return true; +} +EXPORT_SYMBOL(lock_sock_fast); + int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) { struct timeval tv; @@ -1977,7 +2191,7 @@ EXPORT_SYMBOL(sock_common_recvmsg); * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -1987,7 +2201,7 @@ EXPORT_SYMBOL(sock_common_setsockopt); #ifdef CONFIG_COMPAT int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -2049,8 +2263,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); #ifdef CONFIG_NET_NS void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - int cpu = smp_processor_id(); - per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val; + __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -2066,13 +2279,13 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); -static int sock_inuse_init_net(struct net *net) +static int __net_init sock_inuse_init_net(struct net *net) { net->core.inuse = alloc_percpu(struct prot_inuse); return net->core.inuse ? 0 : -ENOMEM; } -static void sock_inuse_exit_net(struct net *net) +static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.inuse); } @@ -2096,7 +2309,7 @@ static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val; + __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -2154,13 +2367,10 @@ int proto_register(struct proto *prot, int alloc_slab) } if (prot->rsk_prot != NULL) { - static const char mask[] = "request_sock_%s"; - - prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab; - sprintf(prot->rsk_prot->slab_name, mask, prot->name); prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -2173,14 +2383,11 @@ int proto_register(struct proto *prot, int alloc_slab) } if (prot->twsk_prot != NULL) { - static const char mask[] = "tw_sock_%s"; - - prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (prot->twsk_prot->twsk_slab_name == NULL) goto out_free_request_sock_slab; - sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name); prot->twsk_prot->twsk_slab = kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, @@ -2207,7 +2414,8 @@ out_free_request_sock_slab: prot->rsk_prot->slab = NULL; } out_free_request_sock_slab_name: - kfree(prot->rsk_prot->slab_name); + if (prot->rsk_prot) + kfree(prot->rsk_prot->slab_name); out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; @@ -2268,12 +2476,12 @@ static char proto_method_implemented(const void *method) static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", proto->max_header, proto->slab == NULL ? "no" : "yes", diff --git a/net/core/stream.c b/net/core/stream.c index a37debfeb1b..f5df85dcd20 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -28,18 +28,21 @@ void sk_stream_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; + struct socket_wq *wq; if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); - if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); } } - EXPORT_SYMBOL(sk_stream_write_space); /** @@ -66,18 +69,17 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); return 0; } - EXPORT_SYMBOL(sk_stream_wait_connect); /** @@ -96,16 +98,15 @@ void sk_stream_wait_close(struct sock *sk, long timeout) DEFINE_WAIT(wait); do { - prepare_to_wait(sk->sk_sleep, &wait, + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) break; } while (!signal_pending(current) && timeout); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); } } - EXPORT_SYMBOL(sk_stream_wait_close); /** @@ -126,7 +127,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) while (1) { set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; @@ -140,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, !sk->sk_err && - !(sk->sk_shutdown & SEND_SHUTDOWN) && - sk_stream_memory_free(sk) && - vm_wait); + sk_wait_event(sk, ¤t_timeo, sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && + !vm_wait)); sk->sk_write_pending--; if (vm_wait) { @@ -157,7 +158,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return err; do_error: @@ -170,7 +171,6 @@ do_interrupted: err = sock_intr_errno(*timeo_p); goto out; } - EXPORT_SYMBOL(sk_stream_wait_memory); int sk_stream_error(struct sock *sk, int flags, int err) @@ -181,7 +181,6 @@ int sk_stream_error(struct sock *sk, int flags, int err) send_sig(SIGPIPE, current, 0); return err; } - EXPORT_SYMBOL(sk_stream_error); void sk_stream_kill_queues(struct sock *sk) @@ -206,5 +205,4 @@ void sk_stream_kill_queues(struct sock *sk) * have gone away, only the net layer knows can touch it. */ } - EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7db1de0497c..385b6095fdc 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -10,14 +10,77 @@ #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> +#include <linux/ratelimit.h> +#include <linux/vmalloc.h> #include <linux/init.h> +#include <linux/slab.h> + #include <net/ip.h> #include <net/sock.h> +#ifdef CONFIG_RPS +static int rps_sock_flow_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int orig_size, size; + int ret, i; + ctl_table tmp = { + .data = &size, + .maxlen = sizeof(size), + .mode = table->mode + }; + struct rps_sock_flow_table *orig_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + + mutex_lock(&sock_flow_mutex); + + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); + size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + if (write) { + if (size) { + if (size > 1<<30) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = + vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); + if (!sock_table) { + mutex_unlock(&sock_flow_mutex); + return -ENOMEM; + } + + sock_table->mask = size - 1; + } else + sock_table = orig_sock_table; + + for (i = 0; i < size; i++) + sock_table->ents[i] = RPS_NO_CPU; + } else + sock_table = NULL; + + if (sock_table != orig_sock_table) { + rcu_assign_pointer(rps_sock_flow_table, sock_table); + synchronize_rcu(); + vfree(orig_sock_table); + } + } + + mutex_unlock(&sock_flow_mutex); + + return ret; +} +#endif /* CONFIG_RPS */ + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { - .ctl_name = NET_CORE_WMEM_MAX, .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), @@ -25,7 +88,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_RMEM_MAX, .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), @@ -33,7 +95,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_WMEM_DEFAULT, .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), @@ -41,7 +102,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_RMEM_DEFAULT, .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), @@ -49,7 +109,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_DEV_WEIGHT, .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), @@ -57,7 +116,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_MAX_BACKLOG, .procname = "netdev_max_backlog", .data = &netdev_max_backlog, .maxlen = sizeof(int), @@ -65,16 +123,20 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_MSG_COST, + .procname = "netdev_tstamp_prequeue", + .data = &netdev_tstamp_prequeue, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_CORE_MSG_BURST, .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), @@ -82,16 +144,22 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CORE_OPTMEM_MAX, .procname = "optmem_max", .data = &sysctl_optmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_RPS + { + .procname = "rps_sock_flow_entries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rps_sock_flow_sysctl + }, +#endif #endif /* CONFIG_NET */ { - .ctl_name = NET_CORE_BUDGET, .procname = "netdev_budget", .data = &netdev_budget, .maxlen = sizeof(int), @@ -99,31 +167,29 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_CORE_WARNINGS, .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; static struct ctl_table netns_core_table[] = { { - .ctl_name = NET_CORE_SOMAXCONN, .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; __net_initdata struct ctl_path net_core_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "core", .ctl_name = NET_CORE, }, + { .procname = "net", }, + { .procname = "core", }, { }, }; @@ -134,7 +200,7 @@ static __net_init int sysctl_core_net_init(struct net *net) net->core.sysctl_somaxconn = SOMAXCONN; tbl = netns_core_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; diff --git a/net/core/timestamping.c b/net/core/timestamping.c new file mode 100644 index 00000000000..7e7ca375d43 --- /dev/null +++ b/net/core/timestamping.c @@ -0,0 +1,128 @@ +/* + * PTP 1588 clock support - support for timestamping in PHY devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/errqueue.h> +#include <linux/phy.h> +#include <linux/ptp_classify.h> +#include <linux/skbuff.h> + +static struct sock_filter ptp_filter[] = { + PTP_FILTER +}; + +static unsigned int classify(const struct sk_buff *skb) +{ + if (likely(skb->dev && + skb->dev->phydev && + skb->dev->phydev->drv)) + return sk_run_filter(skb, ptp_filter); + else + return PTP_CLASS_NONE; +} + +void skb_clone_tx_timestamp(struct sk_buff *skb) +{ + struct phy_device *phydev; + struct sk_buff *clone; + struct sock *sk = skb->sk; + unsigned int type; + + if (!sk) + return; + + type = classify(skb); + + switch (type) { + case PTP_CLASS_V1_IPV4: + case PTP_CLASS_V1_IPV6: + case PTP_CLASS_V2_IPV4: + case PTP_CLASS_V2_IPV6: + case PTP_CLASS_V2_L2: + case PTP_CLASS_V2_VLAN: + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + clone->sk = sk; + phydev->drv->txtstamp(phydev, clone, type); + } + break; + default: + break; + } +} + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; + int err; + + if (!hwtstamps) + return; + + *skb_hwtstamps(skb) = *hwtstamps; + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + skb->sk = NULL; + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +bool skb_defer_rx_timestamp(struct sk_buff *skb) +{ + struct phy_device *phydev; + unsigned int type; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + __skb_push(skb, ETH_HLEN); + + type = classify(skb); + + __skb_pull(skb, ETH_HLEN); + + switch (type) { + case PTP_CLASS_V1_IPV4: + case PTP_CLASS_V1_IPV6: + case PTP_CLASS_V2_IPV4: + case PTP_CLASS_V2_IPV6: + case PTP_CLASS_V2_L2: + case PTP_CLASS_V2_VLAN: + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); + break; + default: + break; + } + + return false; +} + +void __init skb_timestamping_init(void) +{ + BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); +} diff --git a/net/core/utils.c b/net/core/utils.c index 83221aee708..5fea0ab2190 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -24,6 +24,8 @@ #include <linux/types.h> #include <linux/percpu.h> #include <linux/init.h> +#include <linux/ratelimit.h> + #include <net/sock.h> #include <asm/byteorder.h> @@ -73,9 +75,8 @@ __be32 in_aton(const char *str) str++; } } - return(htonl(l)); + return htonl(l); } - EXPORT_SYMBOL(in_aton); #define IN6PTON_XDIGIT 0x00010000 @@ -91,18 +92,19 @@ EXPORT_SYMBOL(in_aton); static inline int xdigit2bin(char c, int delim) { + int val; + if (c == delim || c == '\0') return IN6PTON_DELIM; if (c == ':') return IN6PTON_COLON_MASK; if (c == '.') return IN6PTON_DOT; - if (c >= '0' && c <= '9') - return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); - if (c >= 'a' && c <= 'f') - return (IN6PTON_XDIGIT | (c - 'a' + 10)); - if (c >= 'A' && c <= 'F') - return (IN6PTON_XDIGIT | (c - 'A' + 10)); + + val = hex_to_bin(c); + if (val >= 0) + return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); + if (delim == -1) return IN6PTON_DELIM; return IN6PTON_UNKNOWN; @@ -160,7 +162,6 @@ out: *end = s; return ret; } - EXPORT_SYMBOL(in4_pton); int in6_pton(const char *src, int srclen, @@ -278,7 +279,6 @@ out: *end = s; return ret; } - EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, |
