Diffstat (limited to 'net/core/dev.c')
-rw-r--r--  net/core/dev.c | 7177
1 file changed, 5538 insertions(+), 1639 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c index 8d154159527..367a586d0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -73,13 +73,15 @@ */ #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/bitops.h> -#include <linux/config.h> +#include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/slab.h> #include <linux/sched.h> +#include <linux/mutex.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> @@ -89,72 +91,75 @@ #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/ethtool.h> #include <linux/notifier.h> #include <linux/skbuff.h> +#include <net/net_namespace.h> #include <net/sock.h> #include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <linux/stat.h> -#include <linux/if_bridge.h> -#include <linux/divert.h> #include <net/dst.h> #include <net/pkt_sched.h> #include <net/checksum.h> +#include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> -#include <linux/kmod.h> #include <linux/module.h> -#include <linux/kallsyms.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> -#ifdef CONFIG_NET_RADIO -#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ #include <net/iw_handler.h> -#endif /* CONFIG_NET_RADIO */ #include <asm/current.h> - -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversly affected. - * --BLG - * - * 0800 IP - * 8100 802.1Q VLAN - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ +#include <linux/audit.h> +#include <linux/dmaengine.h> +#include <linux/err.h> +#include <linux/ctype.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <trace/events/napi.h> +#include <trace/events/net.h> +#include <trace/events/skb.h> +#include <linux/pci.h> +#include <linux/inetdevice.h> +#include <linux/cpu_rmap.h> +#include <linux/static_key.h> +#include <linux/hashtable.h> +#include <linux/vmalloc.h> +#include <linux/if_macvlan.h> + +#include "net-sysfs.h" + +/* Instead of increasing this, you should create a hash table. */ +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. 
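The removed header comment claims RARP, SNAP and X.25 were the only low-nibble collisions in the old 16-way table. A standalone sketch (plain user-space C, not part of the diff) reproduces that arithmetic; the new PTYPE_HASH_MASK scheme keeps the same low-bits idea, just with named constants:

#include <stdio.h>

/* Illustrative only: the old table hashed on the low nibble of the
 * ethertype, so RARP/SNAP/X.25 all land in bucket 5, as the removed
 * comment says. */
int main(void)
{
	const unsigned short types[] = {
		0x8035,	/* RARP */
		0x0005,	/* SNAP */
		0x0805,	/* X.25 */
		0x0800,	/* IP   */
	};
	for (int i = 0; i < 4; i++)
		printf("0x%04x -> bucket %d\n",
		       (unsigned)types[i], types[i] & 15);
	return 0;
}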
*/ +#define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[16]; /* 16 way hashed list */ -static struct list_head ptype_all; /* Taps */ +static DEFINE_SPINLOCK(offload_lock); +struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +struct list_head ptype_all __read_mostly; /* Taps */ +static struct list_head offload_base __read_mostly; + +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); /* - * The @dev_base list is protected by @dev_base_lock and the rtln + * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * - * Pure readers hold dev_base_lock for reading. + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the - * dev_base list, and hold dev_base_lock for writing when they do the + * dev_base_head list, and hold dev_base_lock for writing when they do the * actual updates. This allows pure readers to access the list even * while a writer is preparing to update it. * @@ -166,51 +171,178 @@ static struct list_head ptype_all; /* Taps */ * unregister_netdevice(), which must be called with the rtnl * semaphore held. */ -struct net_device *dev_base; -static struct net_device **dev_tail = &dev_base; DEFINE_RWLOCK(dev_base_lock); - -EXPORT_SYMBOL(dev_base); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; -static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); -static inline struct hlist_head *dev_name_hash(const char *name) +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + +static seqcount_t devnet_rename_seq; + +static inline void dev_base_seq_inc(struct net *net) { - unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; + while (++net->dev_base_seq == 0); } -static inline struct hlist_head *dev_index_hash(int ifindex) +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { - return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; + unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; +} + +static inline void rps_lock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_lock(&sd->input_pkt_queue.lock); +#endif +} + +static inline void rps_unlock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_unlock(&sd->input_pkt_queue.lock); +#endif +} + +/* Device list insertion */ +static void list_netdevice(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + ASSERT_RTNL(); + + write_lock_bh(&dev_base_lock); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + + dev_base_seq_inc(net); +} + +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ +static void unlist_netdevice(struct net_device *dev) +{ + ASSERT_RTNL(); + + /* Unlink dev 
from the device chain */ + write_lock_bh(&dev_base_lock); + list_del_rcu(&dev->dev_list); + hlist_del_rcu(&dev->name_hlist); + hlist_del_rcu(&dev->index_hlist); + write_unlock_bh(&dev_base_lock); + + dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ -static struct notifier_block *netdev_chain; +static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL }; -#ifdef CONFIG_SYSFS -extern int netdev_sysfs_init(void); -extern int netdev_register_sysfs(struct net_device *); -extern void netdev_unregister_sysfs(struct net_device *); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +EXPORT_PER_CPU_SYMBOL(softnet_data); + +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = + {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = + {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} + +static inline void 
netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} #else -#define netdev_sysfs_init() (0) -#define netdev_register_sysfs(dev) (0) -#define netdev_unregister_sysfs(dev) do { } while(0) +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} #endif - /******************************************************************************* Protocol management and registration routines @@ -218,12 +350,6 @@ extern void netdev_unregister_sysfs(struct net_device *); *******************************************************************************/ /* - * For efficiency - */ - -int netdev_nit; - -/* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. @@ -239,6 +365,14 @@ int netdev_nit; * --ANK (980803) */ +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return &ptype_all; + else + return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + /** * dev_add_pack - add packet handler * @pt: packet type declaration @@ -247,25 +381,20 @@ int netdev_nit; * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * - * This call does not sleep therefore it can not + * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { - int hash; + struct list_head *head = ptype_head(pt); - spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) { - netdev_nit++; - list_add_rcu(&pt->list, &ptype_all); - } else { - hash = ntohs(pt->type) & 15; - list_add_rcu(&pt->list, &ptype_base[hash]); - } - spin_unlock_bh(&ptype_lock); + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); } +EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler @@ -274,7 +403,7 @@ void dev_add_pack(struct packet_type *pt) * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function - * returns. + * returns. 
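A minimal usage sketch of the handler registration this hunk reworks (hypothetical module code; the handler name and its drop-everything behaviour are invented):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	/* We own the clone we were handed; consume it. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type tap_ptype __read_mostly = {
	.type = htons(ETH_P_ALL),	/* lands on ptype_all, not the hash */
	.func = tap_rcv,
};

/* dev_add_pack(&tap_ptype) at init; dev_remove_pack(&tap_ptype) at exit,
 * which also waits for in-flight receivers via synchronize_net(). */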
* * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone @@ -282,16 +411,10 @@ void dev_add_pack(struct packet_type *pt) */ void __dev_remove_pack(struct packet_type *pt) { - struct list_head *head; + struct list_head *head = ptype_head(pt); struct packet_type *pt1; - spin_lock_bh(&ptype_lock); - - if (pt->type == htons(ETH_P_ALL)) { - netdev_nit--; - head = &ptype_all; - } else - head = &ptype_base[ntohs(pt->type) & 15]; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { @@ -300,10 +423,12 @@ void __dev_remove_pack(struct packet_type *pt) } } - printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); + pr_warn("dev_remove_pack: %p not found\n", pt); out: - spin_unlock_bh(&ptype_lock); + spin_unlock(&ptype_lock); } +EXPORT_SYMBOL(__dev_remove_pack); + /** * dev_remove_pack - remove packet handler * @pt: packet type declaration @@ -319,9 +444,85 @@ out: void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); - + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_pack); + + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + + spin_lock(&offload_lock); + list_add_rcu(&po->list, head); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. 
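For comparison, a hedged sketch of what an offload registrant looks like, modelled on the IPv4 offload in net/ipv4/af_inet.c (the callback names here are placeholders, not real symbols):

#include <linux/netdevice.h>
#include <linux/if_ether.h>

static struct packet_offload my_ip_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.callbacks = {
		.gso_segment  = my_gso_segment,		/* placeholder */
		.gro_receive  = my_gro_receive,		/* placeholder */
		.gro_complete = my_gro_complete,	/* placeholder */
	},
};

/* dev_add_offload(&my_ip_offload); ... dev_remove_offload(&my_ip_offload); */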
+ */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + synchronize_net(); } +EXPORT_SYMBOL(dev_remove_offload); /****************************************************************************** @@ -350,7 +551,7 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map) for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { memset(s[i].name, 0, sizeof(s[i].name)); - strcpy(s[i].name, name); + strlcpy(s[i].name, name, IFNAMSIZ); memcpy(&s[i].map, map, sizeof(s[i].map)); break; } @@ -375,7 +576,7 @@ int netdev_boot_setup_check(struct net_device *dev) for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strncmp(dev->name, s[i].name, strlen(s[i].name))) { + !strcmp(dev->name, s[i].name)) { dev->irq = s[i].map.irq; dev->base_addr = s[i].map.base_addr; dev->mem_start = s[i].map.mem_start; @@ -385,6 +586,7 @@ int netdev_boot_setup_check(struct net_device *dev) } return 0; } +EXPORT_SYMBOL(netdev_boot_setup_check); /** @@ -409,7 +611,7 @@ unsigned long netdev_boot_base(const char *prefix, int unit) * If device already registered then return base of 1 * to indicate not to probe for this interface */ - if (__dev_get_by_name(name)) + if (__dev_get_by_name(&init_net, name)) return 1; for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) @@ -455,6 +657,7 @@ __setup("netdev=", netdev_boot_setup); /** * __dev_get_by_name - find a device by its name + * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore @@ -464,21 +667,47 @@ __setup("netdev=", netdev_boot_setup); * careful with locks. */ -struct net_device *__dev_get_by_name(const char *name) +struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each(p, dev_name_hash(name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); + hlist_for_each_entry(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; - } + return NULL; } +EXPORT_SYMBOL(__dev_get_by_name); + +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); /** * dev_get_by_name - find a device by its name + * @net: the applicable net namespace * @name: name to find * * Find an interface by name. This can be called from any @@ -488,20 +717,22 @@ struct net_device *__dev_get_by_name(const char *name) * matching device is found. 
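A hedged usage sketch of the two lookup flavours introduced here: the _rcu variant is only valid inside the read-side section, while the plain variant takes a reference for you (interface name and function name are invented):

#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static int example_ifindex(struct net *net)
{
	struct net_device *dev;
	int ifindex = -ENODEV;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		ifindex = dev->ifindex;	/* pointer dies at rcu_read_unlock() */
	rcu_read_unlock();

	return ifindex;
}

/* If the pointer must outlive the read section, use dev_get_by_name(),
 * which does dev_hold() internally, and balance it with dev_put(). */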
*/ -struct net_device *dev_get_by_name(const char *name) +struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(name); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } +EXPORT_SYMBOL(dev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device @@ -511,22 +742,47 @@ struct net_device *dev_get_by_name(const char *name) * or @dev_base_lock. */ -struct net_device *__dev_get_by_index(int ifindex) +struct net_device *__dev_get_by_index(struct net *net, int ifindex) { - struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each(p, dev_index_hash(ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); + hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; - } + return NULL; } +EXPORT_SYMBOL(__dev_get_by_index); + +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); /** * dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns NULL if the device @@ -535,124 +791,184 @@ struct net_device *__dev_get_by_index(int ifindex) * dev_put to indicate they have finished with it. */ -struct net_device *dev_get_by_index(int ifindex) +struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } +EXPORT_SYMBOL(dev_get_by_index); /** - * dev_getbyhwaddr - find a device by its hardware address + * netdev_get_name - get a netdevice name, knowing its ifindex. + * @net: network namespace + * @name: a pointer to the buffer where the name will be stored. + * @ifindex: the ifindex of the interface to get the name from. + * + * The use of raw_seqcount_begin() and cond_resched() before + * retrying is required as we want to give the writers a chance + * to complete when CONFIG_PREEMPT is not set. 
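The retry idiom netdev_get_name() relies on, as a self-contained sketch (names invented; in dev.c the writer side is dev_change_name() under RTNL, so writers already exclude each other):

#include <linux/seqlock.h>
#include <linux/string.h>
#include <linux/netdevice.h>

static seqcount_t example_seq;
static char example_name[IFNAMSIZ];

static void example_rename(const char *new)	/* writer, externally serialized */
{
	write_seqcount_begin(&example_seq);
	strlcpy(example_name, new, IFNAMSIZ);
	write_seqcount_end(&example_seq);
}

static void example_read(char *out)		/* reader, lock-free */
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&example_seq);
		strlcpy(out, example_name, IFNAMSIZ);
	} while (read_seqcount_retry(&example_seq, seq));
}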
+ */ +int netdev_get_name(struct net *net, char *name, int ifindex) +{ + struct net_device *dev; + unsigned int seq; + +retry: + seq = raw_seqcount_begin(&devnet_rename_seq); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + + strcpy(name, dev->name); + rcu_read_unlock(); + if (read_seqcount_retry(&devnet_rename_seq, seq)) { + cond_resched(); + goto retry; + } + + return 0; +} + +/** + * dev_getbyhwaddr_rcu - find a device by its hardware address + * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. The returned device has not had its ref count increased + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. + * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; - ASSERT_RTNL(); - - for (dev = dev_base; dev; dev = dev->next) + for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) - break; - return dev; -} + return dev; -EXPORT_SYMBOL(dev_getbyhwaddr); + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *dev_getfirstbyhwtype(unsigned short type) +struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev; - rtnl_lock(); - for (dev = dev_base; dev; dev = dev->next) { + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev->type == type) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_getfirstbyhwtype); + +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev, *ret = NULL; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); + ret = dev; break; } - } - rtnl_unlock(); - return dev; + rcu_read_unlock(); + return ret; } - EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags - find any device with given flags + * dev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * is not found or a pointer to the device. Must be called inside + * rcu_read_lock(), and result refcount is unchanged. 
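A hedged caller-side sketch for the flags lookup below: find any running device, keeping it alive past the read section (function name invented):

#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static struct net_device *first_up_dev(struct net *net)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_flags_rcu(net, IFF_UP, IFF_UP);
	if (dev)
		dev_hold(dev);	/* keep it alive past rcu_read_unlock() */
	rcu_read_unlock();

	return dev;		/* caller must dev_put() when done */
}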
*/ -struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) +struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, + unsigned short mask) { - struct net_device *dev; + struct net_device *dev, *ret; - read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + ret = NULL; + for_each_netdev_rcu(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { - dev_hold(dev); + ret = dev; break; } } - read_unlock(&dev_base_lock); - return dev; + return ret; } +EXPORT_SYMBOL(dev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work + * to allow sysfs to work. We also disallow any kind of + * whitespace. */ -static int dev_valid_name(const char *name) +bool dev_valid_name(const char *name) { - return !(*name == '\0' - || !strcmp(name, ".") - || !strcmp(name, "..") - || strchr(name, '/')); + if (*name == '\0') + return false; + if (strlen(name) >= IFNAMSIZ) + return false; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return false; + + while (*name) { + if (*name == '/' || isspace(*name)) + return false; + name++; + } + return true; } +EXPORT_SYMBOL(dev_valid_name); /** - * dev_alloc_name - allocate a name for a device - * @dev: device + * __dev_alloc_name - allocate a name for a device + * @net: network namespace to allocate the device name in * @name: name format string + * @buf: scratch buffer and result name string * * Passed a format string - eg "lt%d" it will try and find a suitable - * id. Not efficient for many devices, not called a lot. The caller - * must hold the dev_base or rtnl lock while allocating the name and - * adding the device in order to avoid duplicates. Returns the number - * of the unit assigned or a negative errno code. + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. 
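A hedged miniature of the slot scan __dev_alloc_name() performs: mark every unit already taken in a one-page bitmap, then take the first hole (the helper and its argument list are invented for illustration):

#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int pick_unit(const int *taken, int n, int max_units)
{
	unsigned long *inuse;
	int i, unit;

	inuse = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
	if (!inuse)
		return -ENOMEM;

	for (i = 0; i < n; i++)
		if (taken[i] >= 0 && taken[i] < max_units)
			set_bit(taken[i], inuse);

	unit = find_first_zero_bit(inuse, max_units);
	free_page((unsigned long)inuse);

	return unit < max_units ? unit : -ENFILE;
}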
*/ -int dev_alloc_name(struct net_device *dev, const char *name) +static int __dev_alloc_name(struct net *net, const char *name, char *buf) { int i = 0; - char buf[IFNAMSIZ]; const char *p; const int max_netdevices = 8*PAGE_SIZE; - long *inuse; + unsigned long *inuse; struct net_device *d; p = strnchr(name, IFNAMSIZ-1, '%'); @@ -666,18 +982,18 @@ int dev_alloc_name(struct net_device *dev, const char *name) return -EINVAL; /* Use one page as a bit array of possible slots */ - inuse = (long *) get_zeroed_page(GFP_ATOMIC); + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); if (!inuse) return -ENOMEM; - for (d = dev_base; d; d = d->next) { + for_each_netdev(net, d) { if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ - snprintf(buf, sizeof(buf), name, i); + snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) set_bit(i, inuse); } @@ -686,11 +1002,10 @@ int dev_alloc_name(struct net_device *dev, const char *name) free_page((unsigned long) inuse); } - snprintf(buf, sizeof(buf), name, i); - if (!__dev_get_by_name(buf)) { - strlcpy(dev->name, buf, IFNAMSIZ); + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); + if (!__dev_get_by_name(net, buf)) return i; - } /* It is possible to run out of possible slots * when the name is long and there isn't enough space left @@ -699,6 +1014,66 @@ int dev_alloc_name(struct net_device *dev, const char *name) return -ENFILE; } +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + char buf[IFNAMSIZ]; + struct net *net; + int ret; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} +EXPORT_SYMBOL(dev_alloc_name); + +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) +{ + char buf[IFNAMSIZ]; + int ret; + + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + +static int dev_get_valid_name(struct net *net, + struct net_device *dev, + const char *name) +{ + BUG_ON(!net); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (strchr(name, '%')) + return dev_alloc_name_ns(net, dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; +} /** * dev_change_name - change name of a device @@ -708,48 +1083,119 @@ int dev_alloc_name(struct net_device *dev, const char *name) * Change name of a device, can pass format strings "eth%d". * for wildcarding. 
*/ -int dev_change_name(struct net_device *dev, char *newname) +int dev_change_name(struct net_device *dev, const char *newname) { + char oldname[IFNAMSIZ]; int err = 0; + int ret; + struct net *net; ASSERT_RTNL(); + BUG_ON(!dev_net(dev)); + net = dev_net(dev); if (dev->flags & IFF_UP) return -EBUSY; - if (!dev_valid_name(newname)) - return -EINVAL; + write_seqcount_begin(&devnet_rename_seq); - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - strcpy(newname, dev->name); + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { + write_seqcount_end(&devnet_rename_seq); + return 0; } - else if (__dev_get_by_name(newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); - err = class_device_rename(&dev->class_dev, dev->name); - if (!err) { - hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); - notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + memcpy(oldname, dev->name, IFNAMSIZ); + + err = dev_get_valid_name(net, dev, newname); + if (err < 0) { + write_seqcount_end(&devnet_rename_seq); + return err; + } + +rollback: + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + write_seqcount_end(&devnet_rename_seq); + return ret; + } + + write_seqcount_end(&devnet_rename_seq); + + netdev_adjacent_rename_links(dev, oldname); + + write_lock_bh(&dev_base_lock); + hlist_del_rcu(&dev->name_hlist); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); + ret = notifier_to_errno(ret); + + if (ret) { + /* err >= 0 after dev_alloc_name() or stores the first errno */ + if (err >= 0) { + err = ret; + write_seqcount_begin(&devnet_rename_seq); + memcpy(dev->name, oldname, IFNAMSIZ); + memcpy(oldname, newname, IFNAMSIZ); + goto rollback; + } else { + pr_err("%s: name change rollback failed: %d\n", + dev->name, ret); + } } return err; } /** - * netdev_features_change - device changes fatures + * dev_set_alias - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device, + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + char *new_ifalias; + + ASSERT_RTNL(); + + if (len >= IFALIASZ) + return -EINVAL; + + if (!len) { + kfree(dev->ifalias); + dev->ifalias = NULL; + return 0; + } + + new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); + if (!new_ifalias) + return -ENOMEM; + dev->ifalias = new_ifalias; + + strlcpy(dev->ifalias, alias, len+1); + return len; +} + + +/** + * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. 
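A hedged driver-side sketch of when netdev_features_change() below gets used (function name and flag choice illustrative): when hardware state invalidates an offload, strip the feature and fan out NETDEV_FEAT_CHANGE so interested parties re-evaluate.

#include <linux/netdevice.h>

static void mydrv_lost_csum(struct net_device *dev)
{
	dev->features &= ~NETIF_F_IP_CSUM;	/* hardware csum is gone */
	netdev_features_change(dev);		/* notify the stack */
}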
*/ void netdev_features_change(struct net_device *dev) { - notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); @@ -764,40 +1210,77 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } +EXPORT_SYMBOL(netdev_state_change); /** - * dev_load - load a network module - * @name: name of interface + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device * - * If a network interface is not present and the process has suitable - * privileges this function loads the module. If module loading is not - * available in this kernel then it becomes a nop. + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. */ +void netdev_notify_peers(struct net_device *dev) +{ + rtnl_lock(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(netdev_notify_peers); -void dev_load(const char *name) +static int __dev_open(struct net_device *dev) { - struct net_device *dev; + const struct net_device_ops *ops = dev->netdev_ops; + int ret; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(name); - read_unlock(&dev_base_lock); + ASSERT_RTNL(); - if (!dev && capable(CAP_SYS_MODULE)) - request_module("%s", name); -} + if (!netif_device_present(dev)) + return -ENODEV; -static int default_rebuild_header(struct sk_buff *skb) -{ - printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", - skb->dev ? skb->dev->name : "NULL!!!"); - kfree_skb(skb); - return 1; -} + /* Block netpoll from trying to do any rx path servicing. + * If we don't do this there is a chance ndo_poll_controller + * or ndo_poll may be running while we open the device + */ + netpoll_poll_disable(dev); + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = notifier_to_errno(ret); + if (ret) + return ret; + + set_bit(__LINK_STATE_START, &dev->state); + + if (ops->ndo_validate_addr) + ret = ops->ndo_validate_addr(dev); + + if (!ret && ops->ndo_open) + ret = ops->ndo_open(dev); + + netpoll_poll_enable(dev); + + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + else { + dev->flags |= IFF_UP; + net_dmaengine_get(); + dev_set_rx_mode(dev); + dev_activate(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); + } + + return ret; +} /** * dev_open - prepare an interface for use. @@ -813,57 +1296,99 @@ static int default_rebuild_header(struct sk_buff *skb) */ int dev_open(struct net_device *dev) { - int ret = 0; - - /* - * Is it already up? - */ + int ret; if (dev->flags & IFF_UP) return 0; - /* - * Is it even present? 
- */ - if (!netif_device_present(dev)) - return -ENODEV; + ret = __dev_open(dev); + if (ret < 0) + return ret; - /* - * Call device private open method - */ - set_bit(__LINK_STATE_START, &dev->state); - if (dev->open) { - ret = dev->open(dev); - if (ret) - clear_bit(__LINK_STATE_START, &dev->state); - } + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + call_netdevice_notifiers(NETDEV_UP, dev); - /* - * If it went open OK then: - */ + return ret; +} +EXPORT_SYMBOL(dev_open); - if (!ret) { - /* - * Set the flags. - */ - dev->flags |= IFF_UP; +static int __dev_close_many(struct list_head *head) +{ + struct net_device *dev; - /* - * Initialize multicasting status - */ - dev_mc_upload(dev); + ASSERT_RTNL(); + might_sleep(); - /* - * Wakeup transmit queue engine + list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. */ - dev_activate(dev); + smp_mb__after_atomic(); /* Commit netif_running(). */ + } + + dev_deactivate_many(head); + + list_for_each_entry(dev, head, close_list) { + const struct net_device_ops *ops = dev->netdev_ops; /* - * ... and announce new interface. + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. */ - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + if (ops->ndo_stop) + ops->ndo_stop(dev); + + dev->flags &= ~IFF_UP; + net_dmaengine_put(); + netpoll_poll_enable(dev); } - return ret; + + return 0; +} + +static int __dev_close(struct net_device *dev) +{ + int retval; + LIST_HEAD(single); + + list_add(&dev->close_list, &single); + retval = __dev_close_many(&single); + list_del(&single); + + return retval; +} + +static int dev_close_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) + if (!(dev->flags & IFF_UP)) + list_del_init(&dev->close_list); + + __dev_close_many(head); + + list_for_each_entry_safe(dev, tmp, head, close_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + call_netdevice_notifiers(NETDEV_DOWN, dev); + list_del_init(&dev->close_list); + } + + return 0; } /** @@ -877,60 +1402,57 @@ int dev_open(struct net_device *dev) */ int dev_close(struct net_device *dev) { - if (!(dev->flags & IFF_UP)) - return 0; - - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ - notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); - - dev_deactivate(dev); - - clear_bit(__LINK_STATE_START, &dev->state); - - /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(), - * and wait when poll really will happen. Actually, the best place - * for this is inside dev->stop() after device stopped its irq - * engine, but this requires more changes in devices. */ + if (dev->flags & IFF_UP) { + LIST_HEAD(single); - smp_mb__after_clear_bit(); /* Commit netif_running(). */ - while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { - /* No hurry. 
*/ - msleep(1); + list_add(&dev->close_list, &single); + dev_close_many(&single); + list_del(&single); } + return 0; +} +EXPORT_SYMBOL(dev_close); - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - * - * We allow it to be called even after a DETACH hot-plug - * event. - */ - if (dev->stop) - dev->stop(dev); +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ /* - * Device is now down. + * If we're trying to disable lro on a vlan device + * use the underlying physical device instead */ + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); - dev->flags &= ~IFF_UP; + /* the same for macvlan devices */ + if (netif_is_macvlan(dev)) + dev = macvlan_dev_real_dev(dev); - /* - * Tell people we are down - */ - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); - return 0; + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); } +EXPORT_SYMBOL(dev_disable_lro); + +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info; + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ +static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block @@ -942,28 +1464,62 @@ int dev_close(struct net_device *dev) * is returned on a failure. * * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free + * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net_device *dev; + struct net_device *last; + struct net *net; int err; rtnl_lock(); - err = notifier_chain_register(&netdev_chain, nb); - if (!err) { - for (dev = dev_base; dev; dev = dev->next) { - nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = raw_notifier_chain_register(&netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + for_each_net(net) { + for_each_netdev(net, dev) { + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + goto rollback; - if (dev->flags & IFF_UP) - nb->notifier_call(nb, NETDEV_UP, dev); + if (!(dev->flags & IFF_UP)) + continue; + + call_netdevice_notifier(nb, NETDEV_UP, dev); } } + +unlock: rtnl_unlock(); return err; + +rollback: + last = dev; + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev == last) + goto outroll; + + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); + } + } + +outroll: + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; } +EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block @@ -973,57 +1529,213 @@ int register_netdevice_notifier(struct notifier_block *nb) * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. 
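A hedged sketch of the notifier mechanics above: a callback can veto events such as NETDEV_PRE_UP by returning an errno wrapped with notifier_from_errno(), which __dev_open() unwraps with notifier_to_errno() (the device-name check is invented):

#include <linux/netdevice.h>
#include <linux/notifier.h>

static int veto_open(struct notifier_block *nb, unsigned long event,
		     void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_PRE_UP && !strcmp(dev->name, "dummy0"))
		return notifier_from_errno(-EBUSY);	/* abort dev_open() */
	return NOTIFY_DONE;
}

static struct notifier_block veto_nb = {
	.notifier_call = veto_open,
};

/* register_netdevice_notifier(&veto_nb) also replays REGISTER/UP for
 * devices that already exist, as the registration code above shows. */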
A negative errno code * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { - return notifier_chain_unregister(&netdev_chain, nb); + struct net_device *dev; + struct net *net; + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&netdev_chain, nb); + if (err) + goto unlock; + + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); + } + } +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier); + +/** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function - * @v: pointer passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value - * are as for notifier_call_chain(). + * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers(unsigned long val, void *v) +int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - return notifier_call_chain(&netdev_chain, val, v); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } +EXPORT_SYMBOL(call_netdevice_notifiers); -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); +static struct static_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call static_key_slow_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * static_key_slow_dec() calls. 
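A hedged sketch of the static_key idiom netstamp_needed uses below: the branch is patched to a no-op until the first consumer enables it (key and function names invented):

#include <linux/static_key.h>
#include <linux/skbuff.h>

static struct static_key my_key __read_mostly;

static void hot_path(struct sk_buff *skb)
{
	if (static_key_false(&my_key))	/* nearly free while disabled */
		__net_timestamp(skb);
}

/* Control path: static_key_slow_inc(&my_key) to enable,
 * static_key_slow_dec(&my_key) to disable. Both may sleep, hence the
 * deferred-decrement dance dev.c does for irq context. */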
+ */ +static atomic_t netstamp_needed_deferred; +#endif void net_enable_timestamp(void) { - atomic_inc(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + + if (deferred) { + while (--deferred) + static_key_slow_dec(&netstamp_needed); + return; + } +#endif + static_key_slow_inc(&netstamp_needed); } +EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { - atomic_dec(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL + if (in_interrupt()) { + atomic_inc(&netstamp_needed_deferred); + return; + } +#endif + static_key_slow_dec(&netstamp_needed); } +EXPORT_SYMBOL(net_disable_timestamp); -void __net_timestamp(struct sk_buff *skb) +static inline void net_timestamp_set(struct sk_buff *skb) { - struct timeval tv; + skb->tstamp.tv64 = 0; + if (static_key_false(&netstamp_needed)) + __net_timestamp(skb); +} + +#define net_timestamp_check(COND, SKB) \ + if (static_key_false(&netstamp_needed)) { \ + if ((COND) && !(SKB)->tstamp.tv64) \ + __net_timestamp(SKB); \ + } \ + +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; - do_gettimeofday(&tv); - skb_set_timestamp(skb, &tv); + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; } -EXPORT_SYMBOL(__net_timestamp); +EXPORT_SYMBOL_GPL(is_skb_forwardable); -static inline void net_timestamp(struct sk_buff *skb) +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (atomic_read(&netstamp_needed)) - __net_timestamp(skb); - else { - skb->tstamp.off_sec = 0; - skb->tstamp.off_usec = 0; + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; } + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); + +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped, but freed) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. 
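A hedged sketch of the pattern the dev_forward_skb() comment describes: a veth-like pair device whose start_xmit injects into its peer's receive path (my_priv() and the peer field are invented; the xmit path already runs inside an RCU read-side section):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static netdev_tx_t pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = rcu_dereference(my_priv(dev)->peer);

	if (!peer) {
		kfree_skb(skb);
		dev->stats.tx_dropped++;
		return NETDEV_TX_OK;
	}
	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;	/* skb already freed */
	return NETDEV_TX_OK;
}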
+ */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + return -ENOMEM; + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (!ptype->af_packet_priv || !skb->sk) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; } /* @@ -1031,11 +1743,11 @@ static inline void net_timestamp(struct sk_buff *skb) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - - net_timestamp(skb); + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1043,169 +1755,1086 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { - struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); + (!skb_loop_sk(ptype, skb))) { + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) break; + net_timestamp_set(skb2); + /* skb->nh should be correctly set by sender, so that the second statement is just protection against buggy protocols. */ - skb2->mac.raw = skb2->data; - - if (skb2->nh.raw < skb2->data || - skb2->nh.raw > skb2->tail) { - if (net_ratelimit()) - printk(KERN_CRIT "protocol %04x is " - "buggy, dev %s\n", - skb2->protocol, dev->name); - skb2->nh.raw = skb2->data; + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); } - skb2->h.raw = skb2->nh.raw; + skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype, skb->dev); + pt_prev = ptype; } } + if (pt_prev) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +/** + * netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warn("Number of in use tx queues changed invalidating tc mappings. 
Priority traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + +#ifdef CONFIG_XPS +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, + int cpu, u16 index) +{ + struct xps_map *map = NULL; + int pos; + + if (dev_maps) + map = xmap_dereference(dev_maps->cpu_map[cpu]); + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] == index) { + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; + } else { + RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); + kfree_rcu(map, rcu); + map = NULL; + } + break; + } + } + + return map; +} + +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + struct xps_dev_maps *dev_maps; + int cpu, i; + bool active = false; + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (!dev_maps) + goto out_no_maps; + + for_each_possible_cpu(cpu) { + for (i = index; i < dev->num_tx_queues; i++) { + if (!remove_xps_queue(dev_maps, cpu, i)) + break; + } + if (i == dev->num_tx_queues) + active = true; + } + + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + + for (i = index; i < dev->num_tx_queues; i++) + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + +out_no_maps: + mutex_unlock(&xps_map_mutex); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, + int cpu, u16 index) +{ + struct xps_map *new_map; + int alloc_len = XPS_MIN_MAP_ALLOC; + int i, pos; + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] != index) + continue; + return map; + } + + /* Need to add queue to this CPU's existing map */ + if (map) { + if (pos < map->alloc_len) + return map; + + alloc_len = map->alloc_len * 2; + } + + /* Need to allocate new map to store queue on this CPU's map */ + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + return NULL; + + for (i = 0; i < pos; i++) + new_map->queues[i] = map->queues[i]; + new_map->alloc_len = alloc_len; + new_map->len = pos; + + return new_map; +} + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_map *map, *new_map; + int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); + int cpu, numa_node_id = -2; + bool active = false; + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + /* allocate memory for queue storage */ + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + + if (!new_dev_maps) + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + + map = expand_xps_map(map, cpu, index); + if (!map) + goto error; + + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + if (!new_dev_maps) + goto out_no_new_maps; + + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { + /* add queue to CPU maps */ + int pos = 0; + + map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + while ((pos < map->len) && (map->queues[pos] != index)) + pos++; + + if (pos == map->len) + map->queues[map->len++] = index; +#ifdef CONFIG_NUMA + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; +#endif + } else if (dev_maps) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[cpu]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + } + + } + + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + + /* Cleanup old maps */ + if (dev_maps) { + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (map && map != new_map) + kfree_rcu(map, rcu); + } + + kfree_rcu(dev_maps, rcu); + } + + dev_maps = new_dev_maps; + active = true; + +out_no_new_maps: + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? numa_node_id : + NUMA_NO_NODE); + + if (!dev_maps) + goto out_no_maps; + + /* removes queue from unused CPUs */ + for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) + continue; + + if (remove_xps_queue(dev_maps, cpu, index)) + active = true; + } + + /* free map if not active */ + if (!active) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + +out_no_maps: + mutex_unlock(&xps_map_mutex); + + return 0; +error: + /* remove any maps that we added */ + for_each_possible_cpu(cpu) { + new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } + + mutex_unlock(&xps_map_mutex); + + kfree(new_dev_maps); + return -ENOMEM; +} +EXPORT_SYMBOL(netif_set_xps_queue); + +#endif +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + */ +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ + int rc; + + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (dev->num_tc) + netif_setup_tc(dev, txq); + + if (txq < dev->real_num_tx_queues) { + qdisc_reset_all_tx_gt(dev, txq); +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, txq); +#endif + } + } + + dev->real_num_tx_queues = txq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); + +#ifdef CONFIG_SYSFS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it always + * succeeds. 
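A hedged probe-time sketch for the queue-count setters here: size both directions before register_netdev(), where these calls cannot fail for valid counts (driver function name invented):

#include <linux/netdevice.h>

static int mydrv_set_queues(struct net_device *dev, unsigned int n)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, n);
}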
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+	int rc;
+
+	if (rxq < 1 || rxq > dev->num_rx_queues)
+		return -EINVAL;
+
+	if (dev->reg_state == NETREG_REGISTERED) {
+		ASSERT_RTNL();
+
+		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+						  rxq);
+		if (rc)
+			return rc;
+	}
+
+	dev->real_num_rx_queues = rxq;
+	return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * This routine should set an upper limit on the number of RSS queues
+ * used by default by multiqueue devices.
+ */
+int netif_get_num_default_rss_queues(void)
+{
+	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
+static inline void __netif_reschedule(struct Qdisc *q)
+{
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sd = &__get_cpu_var(softnet_data);
+	q->next_sched = NULL;
+	*sd->output_queue_tailp = q;
+	sd->output_queue_tailp = &q->next_sched;
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+		__netif_reschedule(q);
+}
+EXPORT_SYMBOL(__netif_schedule);
+
+struct dev_kfree_skb_cb {
+	enum skb_free_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+	return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+{
+	unsigned long flags;
+
+	if (likely(atomic_read(&skb->users) == 1)) {
+		smp_rmb();
+		atomic_set(&skb->users, 0);
+	} else if (likely(!atomic_dec_and_test(&skb->users))) {
+		return;
+	}
+	get_kfree_skb_cb(skb)->reason = reason;
+	local_irq_save(flags);
+	skb->next = __this_cpu_read(softnet_data.completion_queue);
+	__this_cpu_write(softnet_data.completion_queue, skb);
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_irq);
+
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+{
+	if (in_irq() || irqs_disabled())
+		__dev_kfree_skb_irq(skb, reason);
+	else
+		dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_any);
+
+
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from system and therefore no longer available.
+ */
+void netif_device_detach(struct net_device *dev)
+{
+	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
+	    netif_running(dev)) {
+		netif_tx_stop_all_queues(dev);
+	}
+}
+EXPORT_SYMBOL(netif_device_detach);
+
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached to system and restart if needed.
+ */
+void netif_device_attach(struct net_device *dev)
+{
+	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
+	    netif_running(dev)) {
+		netif_tx_wake_all_queues(dev);
+		__netdev_watchdog_up(dev);
+	}
+}
+EXPORT_SYMBOL(netif_device_attach);
+
+static void skb_warn_bad_offload(const struct sk_buff *skb)
+{
+	static const netdev_features_t null_features = 0;
+	struct net_device *dev = skb->dev;
+	const char *driver = "";
+
+	if (!net_ratelimit())
+		return;
+
+	if (dev && dev->dev.parent)
+		driver = dev_driver_string(dev->dev.parent);
+
+	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
+	     "gso_type=%d ip_summed=%d\n",
+	     driver, dev ? &dev->features : &null_features,
+	     skb->sk ?
&skb->sk->sk_route_caps : &null_features,
+	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
+	     skb_shinfo(skb)->gso_type, skb->ip_summed);
+}
+
 /*
  * Invalidate hardware checksum when packet is to be mangled, and
  * complete checksum manually on outgoing path.
  */
-int skb_checksum_help(struct sk_buff *skb, int inward)
+int skb_checksum_help(struct sk_buff *skb)
 {
-	unsigned int csum;
-	int ret = 0, offset = skb->h.raw - skb->data;
+	__wsum csum;
+	int ret = 0, offset;
 
-	if (inward) {
-		skb->ip_summed = CHECKSUM_NONE;
-		goto out;
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		goto out_set_summed;
+
+	if (unlikely(skb_shinfo(skb)->gso_size)) {
+		skb_warn_bad_offload(skb);
+		return -EINVAL;
 	}
 
-	if (skb_cloned(skb)) {
-		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+	/* Before computing a checksum, we should make sure no frag could
+	 * be modified by an external entity: checksum could be wrong.
+	 */
+	if (skb_has_shared_frag(skb)) {
+		ret = __skb_linearize(skb);
 		if (ret)
 			goto out;
 	}
 
-	if (offset > (int)skb->len)
-		BUG();
-	csum = skb_checksum(skb, offset, skb->len-offset, 0);
+	offset = skb_checksum_start_offset(skb);
+	BUG_ON(offset >= skb_headlen(skb));
+	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 
-	offset = skb->tail - skb->h.raw;
-	if (offset <= 0)
-		BUG();
-	if (skb->csum + 2 > offset)
-		BUG();
+	offset += skb->csum_offset;
+	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+
+	if (skb_cloned(skb) &&
+	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
+		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+		if (ret)
+			goto out;
+	}
 
-	*(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
+	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+out_set_summed:
 	skb->ip_summed = CHECKSUM_NONE;
-out:
+out:
 	return ret;
 }
+EXPORT_SYMBOL(skb_checksum_help);
+
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
+{
+	unsigned int vlan_depth = skb->mac_len;
+	__be16 type = skb->protocol;
+
+	/* Tunnel gso handlers can set protocol to ethernet. */
+	if (type == htons(ETH_P_TEB)) {
+		struct ethhdr *eth;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+			return 0;
+
+		eth = (struct ethhdr *)skb_mac_header(skb);
+		type = eth->h_proto;
+	}
+
+	/* if skb->protocol is 802.1Q/AD then the header should already be
+	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+	 * ETH_HLEN otherwise
+	 */
+	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+		if (vlan_depth) {
+			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
+				return 0;
+			vlan_depth -= VLAN_HLEN;
+		} else {
+			vlan_depth = ETH_HLEN;
+		}
+		do {
+			struct vlan_hdr *vh;
+
+			if (unlikely(!pskb_may_pull(skb,
+						    vlan_depth + VLAN_HLEN)))
+				return 0;
+
+			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+			type = vh->h_vlan_encapsulated_proto;
+			vlan_depth += VLAN_HLEN;
+		} while (type == htons(ETH_P_8021Q) ||
+			 type == htons(ETH_P_8021AD));
+	}
+
+	*depth = vlan_depth;
+
+	return type;
+}
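[Editor's illustration, not part of the patch: a hedged sketch of the software-checksum fallback that drivers use when their hardware cannot checksum a given protocol. example_hw_can_csum() and example_queue_tx() are assumed helpers; only the skb_checksum_help() contract is real.]

	/* Hypothetical ndo_start_xmit fragment. skb_checksum_help() fills
	 * in the checksum at csum_start/csum_offset and downgrades the skb
	 * to CHECKSUM_NONE, exactly as implemented above.
	 */
	static netdev_tx_t example_start_xmit(struct sk_buff *skb,
					      struct net_device *dev)
	{
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    !example_hw_can_csum(skb)) {
			if (skb_checksum_help(skb))
				goto drop;
		}
		return example_queue_tx(skb, dev);
	drop:
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.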
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+				    netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+	struct packet_offload *ptype;
+	int vlan_depth = skb->mac_len;
+	__be16 type = skb_network_protocol(skb, &vlan_depth);
+
+	if (unlikely(!type))
+		return ERR_PTR(-EINVAL);
+
+	__skb_pull(skb, vlan_depth);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &offload_base, list) {
+		if (ptype->type == type && ptype->callbacks.gso_segment) {
+			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+				int err;
+
+				err = ptype->callbacks.gso_send_check(skb);
+				segs = ERR_PTR(err);
+				if (err || skb_gso_ok(skb, features))
+					break;
+				__skb_push(skb, (skb->data -
+						 skb_network_header(skb)));
+			}
+			segs = ptype->callbacks.gso_segment(skb, features);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	__skb_push(skb, skb->data - skb_mac_header(skb));
+
+	return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+	if (tx_path)
+		return skb->ip_summed != CHECKSUM_PARTIAL;
+	else
+		return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+				  netdev_features_t features, bool tx_path)
+{
+	if (unlikely(skb_needs_check(skb, tx_path))) {
+		int err;
+
+		skb_warn_bad_offload(skb);
+
+		if (skb_header_cloned(skb) &&
+		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+			return ERR_PTR(err);
+	}
+
+	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+	SKB_GSO_CB(skb)->encap_level = 0;
+
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+
+	return skb_mac_gso_segment(skb, features);
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+	if (net_ratelimit()) {
+		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+		dump_stack();
+	}
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
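[Editor's illustration, not part of the patch: a hedged sketch of a caller that software-segments a GSO skb on the TX path and transmits the pieces one by one. example_xmit_one() is an assumed helper; note the NULL return, which per the comment above means the skb needed no segmentation.]

	/* Hypothetical consumer of __skb_gso_segment(). */
	static void example_xmit_gso(struct sk_buff *skb,
				     netdev_features_t features)
	{
		struct sk_buff *segs, *next;

		segs = __skb_gso_segment(skb, features, true);	/* tx_path */
		if (IS_ERR(segs)) {
			kfree_skb(skb);
			return;
		}
		if (!segs) {
			/* header verification only; send the original */
			example_xmit_one(skb);
			return;
		}
		consume_skb(skb);	/* fully represented by segs now */

		while (segs) {
			next = segs->next;
			segs->next = NULL;
			example_xmit_one(segs);
			segs = next;
		}
	}

-#ifdef CONFIG_HIGHMEM
 /* Actually, we should eliminate this check as soon as we know, that:
  * 1. IOMMU is present and allows to map all the memory.
  * 2. No high memory really exists on this machine.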
*/ -static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { +#ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (PageHighMem(skb_frag_page(frag))) + return 1; + } + } + + if (PCI_DMA_BUS_IS_PHYS) { + struct device *pdev = dev->dev.parent; + + if (!pdev) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + dma_addr_t addr = page_to_phys(skb_frag_page(frag)); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + return 1; + } + } +#endif + return 0; +} + +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + kfree_skb_list(skb->next); + skb->next = NULL; + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * @features: device features as applicable to this skb + * + * This function segments the given skb and stores the list of segments + * in skb->next. + */ +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) +{ + struct sk_buff *segs; - if (dev->features & NETIF_F_HIGHDMA) + segs = skb_gso_segment(skb, features); + + /* Verifying header integrity only. */ + if (!segs) return 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) - return 1; + if (IS_ERR(segs)) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; return 0; } + +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. 
+ */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + features &= skb->dev->mpls_features; + + return features; +} #else -#define illegal_highdma(dev, skb) (0) +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} #endif -/* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) +static netdev_features_t harmonize_features(struct sk_buff *skb, + netdev_features_t features) { - unsigned int size; - u8 *data; - long offset; - struct skb_shared_info *ninfo; - int headerlen = skb->data - skb->head; - int expand = (skb->tail + skb->data_len) - skb->end; + int tmp; + __be16 type; - if (skb_shared(skb)) - BUG(); + type = skb_network_protocol(skb, &tmp); + features = net_mpls_features(skb, features, type); - if (expand <= 0) - expand = 0; + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, type)) { + features &= ~NETIF_F_ALL_CSUM; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } - size = skb->end - skb->head + expand; - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - return -ENOMEM; + return features; +} - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) - BUG(); +netdev_features_t netif_skb_features(struct sk_buff *skb) +{ + __be16 protocol = skb->protocol; + netdev_features_t features = skb->dev->features; - /* Set up shinfo */ - ninfo = (struct skb_shared_info*)(data + size); - atomic_set(&ninfo->dataref, 1); - ninfo->tso_size = skb_shinfo(skb)->tso_size; - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + features &= ~NETIF_F_GSO_MASK; - /* Offset between the two in bytes */ - offset = data - skb->head; + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, features); + } - /* Free old data. */ - skb_release_data(skb); + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); - skb->head = data; - skb->end = data + size; + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; + return harmonize_features(skb, features); +} +EXPORT_SYMBOL(netif_skb_features); - /* We are no longer a clone, even if we were. 
 */
-	skb->cloned = 0;
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+			struct netdev_queue *txq)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int rc = NETDEV_TX_OK;
+	unsigned int skb_len;
 
-	skb->tail += skb->data_len;
-	skb->data_len = 0;
-	return 0;
+	if (likely(!skb->next)) {
+		netdev_features_t features;
+
+		/*
+		 * If device doesn't need skb->dst, release it right now while
+		 * it's hot in this cpu cache
+		 */
+		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+			skb_dst_drop(skb);
+
+		features = netif_skb_features(skb);
+
+		if (vlan_tx_tag_present(skb) &&
+		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+			skb = __vlan_put_tag(skb, skb->vlan_proto,
+					     vlan_tx_tag_get(skb));
+			if (unlikely(!skb))
+				goto out;
+
+			skb->vlan_tci = 0;
+		}
+
+		/* If encapsulation offload request, verify we are testing
+		 * hardware encapsulation features instead of standard
+		 * features for the netdev
+		 */
+		if (skb->encapsulation)
+			features &= dev->hw_enc_features;
+
+		if (netif_needs_gso(skb, features)) {
+			if (unlikely(dev_gso_segment(skb, features)))
+				goto out_kfree_skb;
+			if (skb->next)
+				goto gso;
+		} else {
+			if (skb_needs_linearize(skb, features) &&
+			    __skb_linearize(skb))
+				goto out_kfree_skb;
+
+			/* If packet is not checksummed and device does not
+			 * support checksumming for this protocol, complete
+			 * checksumming here.
+			 */
+			if (skb->ip_summed == CHECKSUM_PARTIAL) {
+				if (skb->encapsulation)
+					skb_set_inner_transport_header(skb,
+						skb_checksum_start_offset(skb));
+				else
+					skb_set_transport_header(skb,
+						skb_checksum_start_offset(skb));
+				if (!(features & NETIF_F_ALL_CSUM) &&
+				    skb_checksum_help(skb))
+					goto out_kfree_skb;
+			}
+		}
+
+		if (!list_empty(&ptype_all))
+			dev_queue_xmit_nit(skb, dev);
+
+		skb_len = skb->len;
+		trace_net_dev_start_xmit(skb, dev);
+		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc, dev, skb_len);
+		if (rc == NETDEV_TX_OK)
+			txq_trans_update(txq);
+		return rc;
+	}
+
+gso:
+	do {
+		struct sk_buff *nskb = skb->next;
+
+		skb->next = nskb->next;
+		nskb->next = NULL;
+
+		if (!list_empty(&ptype_all))
+			dev_queue_xmit_nit(nskb, dev);
+
+		skb_len = nskb->len;
+		trace_net_dev_start_xmit(nskb, dev);
+		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc, dev, skb_len);
+		if (unlikely(rc != NETDEV_TX_OK)) {
+			if (rc & ~NETDEV_TX_MASK)
+				goto out_kfree_gso_skb;
+			nskb->next = skb->next;
+			skb->next = nskb;
+			return rc;
+		}
+		txq_trans_update(txq);
+		if (unlikely(netif_xmit_stopped(txq) && skb->next))
+			return NETDEV_TX_BUSY;
+	} while (skb->next);
+
+out_kfree_gso_skb:
+	if (likely(skb->next == NULL)) {
+		skb->destructor = DEV_GSO_CB(skb)->destructor;
+		consume_skb(skb);
+		return rc;
+	}
+out_kfree_skb:
+	kfree_skb(skb);
+out:
+	return rc;
 }
+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
 
-#define HARD_TX_LOCK(dev, cpu) {			\
-	if ((dev->features & NETIF_F_LLTX) == 0) {	\
-		spin_lock(&dev->xmit_lock);		\
-		dev->xmit_lock_owner = cpu;		\
-	}						\
+static void qdisc_pkt_len_init(struct sk_buff *skb)
+{
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	qdisc_skb_cb(skb)->pkt_len = skb->len;
+
+	/* To get more precise estimation of bytes sent on wire,
+	 * we add to pkt_len the header size of all segments
+	 */
+	if (shinfo->gso_size) {
+		unsigned int hdr_len;
+		u16 gso_segs = shinfo->gso_segs;
+
+		/* mac layer + network layer */
+		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+		/* + transport layer */
+		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+			hdr_len += tcp_hdrlen(skb);
+		else
+
hdr_len += sizeof(struct udphdr); + + if (shinfo->gso_type & SKB_GSO_DODGY) + gso_segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); + + qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; + } } -#define HARD_TX_UNLOCK(dev) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ - dev->xmit_lock_owner = -1; \ - spin_unlock(&dev->xmit_lock); \ - } \ +static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, + struct netdev_queue *txq) +{ + spinlock_t *root_lock = qdisc_lock(q); + bool contended; + int rc; + + qdisc_pkt_len_init(skb); + qdisc_calculate_pkt_len(skb, q); + /* + * Heuristic to force contended enqueues to serialize on a + * separate lock before trying to get qdisc main lock. + * This permits __QDISC_STATE_RUNNING owner to get the lock more often + * and dequeue packets faster. + */ + contended = qdisc_is_running(q); + if (unlikely(contended)) + spin_lock(&q->busylock); + + spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + kfree_skb(skb); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + qdisc_run_begin(q)) { + /* + * This is a work-conserving queue; there are no old skbs + * waiting to be sent out; and the qdisc is not running - + * xmit the skb directly. + */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); + + qdisc_bstats_update(q, skb); + + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } else + qdisc_run_end(q); + + rc = NET_XMIT_SUCCESS; + } else { + skb_dst_force(skb); + rc = q->enqueue(skb, q) & NET_XMIT_MASK; + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } + } + spin_unlock(root_lock); + if (unlikely(contended)) + spin_unlock(&q->busylock); + return rc; } +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) +static void skb_update_prio(struct sk_buff *skb) +{ + struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + + if (!skb->priority && skb->sk && map) { + unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; + } +} +#else +#define skb_update_prio(skb) +#endif + +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 10 + /** - * dev_queue_xmit - transmit a buffer + * dev_loopback_xmit - loop back @skb * @skb: buffer to transmit + */ +int dev_loopback_xmit(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->pkt_type = PACKET_LOOPBACK; + skb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(skb)); + skb_dst_force(skb); + netif_rx_ni(skb); + return 0; +} +EXPORT_SYMBOL(dev_loopback_xmit); + +/** + * __dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * @accel_priv: private data used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -1228,78 +2857,39 @@ int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) * the BH enable code must have IRQs enabled so that it will not deadlock. 
* --BLG */ - -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; + struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; - if (skb_shinfo(skb)->frag_list && - !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb, GFP_ATOMIC)) - goto out_kfree_skb; + skb_reset_mac_header(skb); - /* Fragmented skb is linearized if device does not support SG, - * or if at least one of fragments is in highmem and device - * does not support DMA from it. + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. */ - if (skb_shinfo(skb)->nr_frags && - (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb, GFP_ATOMIC)) - goto out_kfree_skb; + rcu_read_lock_bh(); - /* If packet is not checksummed and device does not support - * checksumming for this protocol, complete checksumming here. - */ - if (skb->ip_summed == CHECKSUM_HW && - (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && - (!(dev->features & NETIF_F_IP_CSUM) || - skb->protocol != htons(ETH_P_IP)))) - if (skb_checksum_help(skb, 0)) - goto out_kfree_skb; + skb_update_prio(skb); - spin_lock_prefetch(&dev->queue_lock); + txq = netdev_pick_tx(dev, skb, accel_priv); + q = rcu_dereference_bh(txq->qdisc); - /* Disable soft irqs for various locks below. Also - * stops preemption for RCU. - */ - local_bh_disable(); - - /* Updates of qdisc are serialized by queue_lock. - * The struct Qdisc which is pointed to by qdisc is now a - * rcu structure - it may be accessed without acquiring - * a lock (but the structure may be stale.) The freeing of the - * qdisc will be deferred until it's known that there are no - * more references to it. - * - * If the qdisc has an enqueue function, we still need to - * hold the queue_lock before calling it, since queue_lock - * also serializes access to the device queue. - */ - - q = rcu_dereference(dev->qdisc); #ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); #endif + trace_net_dev_queue(skb); if (q->enqueue) { - /* Grab device queue */ - spin_lock(&dev->queue_lock); - - rc = q->enqueue(skb, q); - - qdisc_run(dev); - - spin_unlock(&dev->queue_lock); - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... - Really, it is unlikely that xmit_lock protection is necessary here. - (f.e. loopback and IP tunnels are clean ignoring statistics + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics counters.) However, it is possible, that they rely on protection made by us here. 
@@ -1310,139 +2900,459 @@ int dev_queue_xmit(struct sk_buff *skb) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (dev->xmit_lock_owner != cpu) { + if (txq->xmit_lock_owner != cpu) { - HARD_TX_LOCK(dev, cpu); + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; - if (!netif_queue_stopped(dev)) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); + HARD_TX_LOCK(dev, txq, cpu); - rc = 0; - if (!dev->hard_start_xmit(skb, dev)) { - HARD_TX_UNLOCK(dev); + if (!netif_xmit_stopped(txq)) { + __this_cpu_inc(xmit_recursion); + rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); + if (dev_xmit_complete(rc)) { + HARD_TX_UNLOCK(dev, txq); goto out; } } - HARD_TX_UNLOCK(dev); - if (net_ratelimit()) - printk(KERN_CRIT "Virtual device %s asks to " - "queue packet!\n", dev->name); + HARD_TX_UNLOCK(dev, txq); + net_crit_ratelimited("Virtual device %s asks to queue packet!\n", + dev->name); } else { /* Recursion is detected! It is possible, - * unfortunately */ - if (net_ratelimit()) - printk(KERN_CRIT "Dead loop on virtual device " - "%s, fix it urgently!\n", dev->name); + * unfortunately + */ +recursion_alert: + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); } } rc = -ENETDOWN; - local_bh_enable(); + rcu_read_unlock_bh(); -out_kfree_skb: + atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return rc; out: - local_bh_enable(); + rcu_read_unlock_bh(); return rc; } +int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} +EXPORT_SYMBOL(dev_queue_xmit); + +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ + return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); + /*======================================================================= Receiver routines =======================================================================*/ -int netdev_max_backlog = 1000; -int netdev_budget = 300; -int weight_p = 64; /* old backlog weight */ +int netdev_max_backlog __read_mostly = 1000; +EXPORT_SYMBOL(netdev_max_backlog); -DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; +int netdev_tstamp_prequeue __read_mostly = 1; +int netdev_budget __read_mostly = 300; +int weight_p __read_mostly = 64; /* old backlog weight */ +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +struct static_key rps_needed __read_mostly; + +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + if (next_cpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? 
*/ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb_get_hash(skb) & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, next_cpu).input_queue_head; + } + + rflow->cpu = next_cpu; + return rflow; +} + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + u32 hash; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1 && + !rcu_access_pointer(rxqueue->rps_flow_table)) { + tcpu = map->cpus[0]; + if (cpu_online(tcpu)) + cpu = tcpu; + goto done; + } + } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + goto done; + } + + skb_reset_network_header(skb); + hash = skb_get_hash(skb); + if (!hash) + goto done; + + flow_table = rcu_dereference(rxqueue->rps_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + u16 next_cpu; + struct rps_dev_flow *rflow; + + rflow = &flow_table->flows[hash & flow_table->mask]; + tcpu = rflow->cpu; + + next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; + + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. 
+ */ + if (unlikely(tcpu != next_cpu) && + (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) { + tcpu = next_cpu; + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + } + + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + + if (map) { + tcpu = map->cpus[((u64) hash * map->len) >> 32]; + + if (cpu_online(tcpu)) { + cpu = tcpu; + goto done; + } + } + +done: + return cpu; +} + +#ifdef CONFIG_RFS_ACCEL /** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. - * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_CN_LOW (low congestion) - * NET_RX_CN_MOD (moderate congestion) - * NET_RX_CN_HIGH (high congestion) - * NET_RX_DROP (packet was dropped) + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + int cpu; -int netif_rx(struct sk_buff *skb) + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = ACCESS_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) { - struct softnet_data *queue; - unsigned long flags; + struct softnet_data *sd = data; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; + ____napi_schedule(sd, &sd->backlog); + sd->received_rps++; +} - if (!skb->tstamp.off_sec) - net_timestamp(skb); +#endif /* CONFIG_RPS */ + +/* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int rps_ipi_queued(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *mysd = &__get_cpu_var(softnet_data); + + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + return 0; +} + +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = 
&__get_cpu_var(softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) +{ + struct softnet_data *sd; + unsigned long flags; + unsigned int qlen; + + sd = &per_cpu(softnet_data, cpu); - /* - * The code is rearranged so that the path is the most - * short when CPU is congested, but is still operating. - */ local_irq_save(flags); - queue = &__get_cpu_var(softnet_data); - __get_cpu_var(netdev_rx_stat).total++; - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { + rps_lock(sd); + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { + if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: - dev_hold(skb->dev); - __skb_queue_tail(&queue->input_pkt_queue, skb); + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + rps_unlock(sd); local_irq_restore(flags); return NET_RX_SUCCESS; } - netif_rx_schedule(&queue->backlog_dev); + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { + if (!rps_ipi_queued(sd)) + ____napi_schedule(sd, &sd->backlog); + } goto enqueue; } - __get_cpu_var(netdev_rx_stat).dropped++; + sd->dropped++; + rps_unlock(sd); + local_irq_restore(flags); + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } +static int netif_rx_internal(struct sk_buff *skb) +{ + int ret; + + net_timestamp_check(netdev_tstamp_prequeue, skb); + + trace_netif_rx(skb); +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + + preempt_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu < 0) + cpu = smp_processor_id(); + + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); + preempt_enable(); + } else +#endif + { + unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); + put_cpu(); + } + return ret; +} + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. 
+ * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + trace_netif_rx_entry(skb); + + return netif_rx_internal(skb); +} +EXPORT_SYMBOL(netif_rx); + int netif_rx_ni(struct sk_buff *skb) { int err; + trace_netif_rx_ni_entry(skb); + preempt_disable(); - err = netif_rx(skb); + err = netif_rx_internal(skb); if (local_softirq_pending()) do_softirq(); preempt_enable(); return err; } - EXPORT_SYMBOL(netif_rx_ni); -static inline struct net_device *skb_bond(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - - if (dev->master) - skb->dev = dev->master; - - return dev; -} - static void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = &__get_cpu_var(softnet_data); @@ -1459,70 +3369,57 @@ static void net_tx_action(struct softirq_action *h) struct sk_buff *skb = clist; clist = clist->next; - BUG_TRAP(!atomic_read(&skb->users)); + WARN_ON(atomic_read(&skb->users)); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, net_tx_action); __kfree_skb(skb); } } if (sd->output_queue) { - struct net_device *head; + struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); while (head) { - struct net_device *dev = head; - head = head->next_sched; + struct Qdisc *q = head; + spinlock_t *root_lock; - smp_mb__before_clear_bit(); - clear_bit(__LINK_STATE_SCHED, &dev->state); + head = head->next_sched; - if (spin_trylock(&dev->queue_lock)) { - qdisc_run(dev); - spin_unlock(&dev->queue_lock); + root_lock = qdisc_lock(q); + if (spin_trylock(root_lock)) { + smp_mb__before_atomic(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + qdisc_run(q); + spin_unlock(root_lock); } else { - netif_schedule(dev); + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &q->state)) { + __netif_reschedule(q); + } else { + smp_mb__before_atomic(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + } } } } } -static __inline__ int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) -{ - atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} - -#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) -int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); -struct net_bridge; -struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, - unsigned char *addr); -void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); - -static __inline__ int handle_bridge(struct sk_buff **pskb, - struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) -{ - struct net_bridge_port *port; - - if ((*pskb)->pkt_type == PACKET_LOOPBACK || - (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) - return 0; - - if (*pt_prev) { - *ret = deliver_skb(*pskb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - return br_handle_frame_hook(port, pskb); -} -#else -#define handle_bridge(skb, pt_prev, ret, orig_dev) (0) +#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ + (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +/* This hook is defined here for ATM LANE */ +int (*br_fdb_test_addr_hook)(struct net_device *dev, + unsigned char *addr) __read_mostly; +EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif #ifdef CONFIG_NET_CLS_ACT @@ -1530,67 +3427,172 @@ static __inline__ int handle_bridge(struct sk_buff **pskb, * when CONFIG_NET_CLS_ACT is? 
otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesnt stop any functionality; if you dont have
- * the ingress scheduler, you just cant add policies on ingress.
+ * NOTE: This doesn't stop any functionality; if you don't have
+ * the ingress scheduler, you just can't add policies on ingress.
 *
 */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
-	struct Qdisc *q;
 	struct net_device *dev = skb->dev;
+	u32 ttl = G_TC_RTTL(skb->tc_verd);
 	int result = TC_ACT_OK;
-
-	if (dev->qdisc_ingress) {
-		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
-		if (MAX_RED_LOOP < ttl++) {
-			printk("Redir loop detected Dropping packet (%s->%s)\n",
-				skb->input_dev->name, skb->dev->name);
-			return TC_ACT_SHOT;
-		}
-
-		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
+	struct Qdisc *q;
 
-		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
+	if (unlikely(MAX_RED_LOOP < ttl++)) {
+		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
+				     skb->skb_iif, dev->ifindex);
+		return TC_ACT_SHOT;
+	}
 
-		spin_lock(&dev->ingress_lock);
-		if ((q = dev->qdisc_ingress) != NULL)
-			result = q->enqueue(skb, q);
-		spin_unlock(&dev->ingress_lock);
+	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
+	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+	q = rxq->qdisc;
+	if (q != &noop_qdisc) {
+		spin_lock(qdisc_lock(q));
+		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
+			result = qdisc_enqueue_root(skb, q);
+		spin_unlock(qdisc_lock(q));
 	}
 
 	return result;
 }
+
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+					 struct packet_type **pt_prev,
+					 int *ret, struct net_device *orig_dev)
+{
+	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
+		goto out;
+
+	if (*pt_prev) {
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+
+	switch (ing_filter(skb, rxq)) {
+	case TC_ACT_SHOT:
+	case TC_ACT_STOLEN:
+		kfree_skb(skb);
+		return NULL;
+	}
+
+out:
+	skb->tc_verd = 0;
+	return skb;
+}
 #endif
 
-int netif_receive_skb(struct sk_buff *skb)
+/**
+ * netdev_rx_handler_register - register receive handler
+ * @dev: device to register a handler for
+ * @rx_handler: receive handler to register
+ * @rx_handler_data: data pointer that is used by rx handler
+ *
+ * Register a receive handler for a device. This handler will then be
+ * called from __netif_receive_skb. A negative errno code is returned
+ * on a failure.
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+			       rx_handler_func_t *rx_handler,
+			       void *rx_handler_data)
+{
+	ASSERT_RTNL();
+
+	if (dev->rx_handler)
+		return -EBUSY;
+
+	/* Note: rx_handler_data must be set before rx_handler */
+	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+	rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
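[Editor's illustration, not part of the patch: a hedged sketch of a bonding/bridge-style driver attaching a handler to a slave device. example_handle_frame() and its return policy are invented for illustration; the registration contract (rtnl held, one handler per device, rx_handler_data published first) is the real one documented above.]

	/* Hypothetical rx_handler: inspect the frame, then let the normal
	 * stack continue. A real handler could instead return
	 * RX_HANDLER_CONSUMED, RX_HANDLER_ANOTHER or RX_HANDLER_EXACT.
	 */
	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
	{
		struct sk_buff *skb = *pskb;
		void *priv = rcu_dereference(skb->dev->rx_handler_data);

		(void)priv;	/* a real handler would consult its state */
		return RX_HANDLER_PASS;
	}

	static int example_attach(struct net_device *slave, void *priv)
	{
		ASSERT_RTNL();	/* caller must hold rtnl, as noted above */
		return netdev_rx_handler_register(slave,
						  example_handle_frame, priv);
	}

+
+/**
+ * netdev_rx_handler_unregister - unregister receive handler
+ * @dev: device to unregister a handler from
+ *
+ * Unregister a receive handler from a device.
+ *
+ * The caller must hold the rtnl_mutex.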
+ */ +void netdev_rx_handler_unregister(struct net_device *dev) +{ + + ASSERT_RTNL(); + RCU_INIT_POINTER(dev->rx_handler, NULL); + /* a reader seeing a non NULL rx_handler in a rcu_read_lock() + * section has a guarantee to see a non NULL rx_handler_data + * as well. + */ + synchronize_net(); + RCU_INIT_POINTER(dev->rx_handler_data, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + +/* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs. + */ +static bool skb_pfmemalloc_protocol(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} + +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; + rx_handler_func_t *rx_handler; struct net_device *orig_dev; + struct net_device *null_or_dev; + bool deliver_exact = false; int ret = NET_RX_DROP; - unsigned short type; - - /* if we've gotten here through NAPI, check netpoll */ - if (skb->dev->poll && netpoll_rx(skb)) - return NET_RX_DROP; - - if (!skb->tstamp.off_sec) - net_timestamp(skb); + __be16 type; - if (!skb->input_dev) - skb->input_dev = skb->dev; + net_timestamp_check(!netdev_tstamp_prequeue, skb); - orig_dev = skb_bond(skb); + trace_netif_receive_skb(skb); - __get_cpu_var(netdev_rx_stat).total++; + orig_dev = skb->dev; - skb->h.raw = skb->nh.raw = skb->data; - skb->mac_len = skb->nh.raw - skb->mac.raw; + skb_reset_network_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); pt_prev = NULL; rcu_read_lock(); +another_round: + skb->skb_iif = skb->dev->ifindex; + + __this_cpu_inc(softnet_data.processed); + + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || + skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto unlock; + } + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); @@ -1598,51 +3600,93 @@ int netif_receive_skb(struct sk_buff *skb) } #endif + if (pfmemalloc) + goto skip_taps; + list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) + if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } +skip_taps: #ifdef CONFIG_NET_CLS_ACT - if (pt_prev) { - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = NULL; /* noone else should process this after*/ - } else { - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - } + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto unlock; +ncls: +#endif - ret = ing_filter(skb); + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + goto drop; - if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { - kfree_skb(skb); - goto out; + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb)) + goto another_round; + else if (unlikely(!skb)) + goto unlock; } - skb->tc_verd = 0; -ncls: -#endif + rx_handler = rcu_dereference(skb->dev->rx_handler); + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: + ret = NET_RX_SUCCESS; + goto unlock; + case RX_HANDLER_ANOTHER: + goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + case RX_HANDLER_PASS: + break; + default: 
+			BUG();
+		}
+	}
 
-	handle_diverter(skb);
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		if (vlan_tx_tag_get_id(skb))
+			skb->pkt_type = PACKET_OTHERHOST;
+		/* Note: we might in the future use prio bits
+		 * and set skb->priority like in vlan_do_receive()
+		 * For the time being, just ignore Priority Code Point
+		 */
+		skb->vlan_tci = 0;
+	}
 
-	if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
-		goto out;
+	/* deliver only exact match when indicated */
+	null_or_dev = deliver_exact ? skb->dev : NULL;
 
 	type = skb->protocol;
-	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
+	list_for_each_entry_rcu(ptype,
+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
 		if (ptype->type == type &&
-		    (!ptype->dev || ptype->dev == skb->dev)) {
-			if (pt_prev)
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
+		     ptype->dev == orig_dev)) {
+			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
 		}
 	}
 
 	if (pt_prev) {
-		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+			goto drop;
+		else
+			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+drop:
+		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -1650,439 +3694,1568 @@ ncls:
 		ret = NET_RX_DROP;
 	}
 
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+	int ret;
+
+	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+		unsigned long pflags = current->flags;
+
+		/*
+		 * PFMEMALLOC skbs are special, they should
+		 * - be delivered to SOCK_MEMALLOC sockets only
+		 * - stay away from userspace
+		 * - have bounded memory usage
+		 *
+		 * Use PF_MEMALLOC as this saves us from propagating the allocation
+		 * context down to all allocation sites.
+		 */
+		current->flags |= PF_MEMALLOC;
+		ret = __netif_receive_skb_core(skb, true);
+		tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	} else
+		ret = __netif_receive_skb_core(skb, false);
+
+	return ret;
+}
+
+static int netif_receive_skb_internal(struct sk_buff *skb)
+{
+	net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+	if (skb_defer_rx_timestamp(skb))
+		return NET_RX_SUCCESS;
+
+#ifdef CONFIG_RPS
+	if (static_key_false(&rps_needed)) {
+		struct rps_dev_flow voidflow, *rflow = &voidflow;
+		int cpu, ret;
+
+		rcu_read_lock();
+
+		cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+		if (cpu >= 0) {
+			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+			rcu_read_unlock();
+			return ret;
+		}
+		rcu_read_unlock();
+	}
+#endif
+	return __netif_receive_skb(skb);
+}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+	trace_netif_receive_skb_entry(skb);
+
+	return netif_receive_skb_internal(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb);
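[Editor's illustration, not part of the patch: a hedged sketch of a minimal NAPI poll method feeding completed receive buffers into this entry point. example_rx_one() is an assumed helper that pops one finished descriptor; napi_gro_receive(), defined later in this file, falls back to plain netif_receive_skb() delivery for packets GRO cannot merge.]

	/* Hypothetical NAPI poll callback of a driver. Runs in softirq
	 * context with interrupts enabled, matching the constraints
	 * documented above.
	 */
	static int example_poll(struct napi_struct *napi, int budget)
	{
		struct sk_buff *skb;
		int work = 0;

		while (work < budget && (skb = example_rx_one(napi->dev))) {
			skb->protocol = eth_type_trans(skb, napi->dev);
			napi_gro_receive(napi, skb);
			work++;
		}

		if (work < budget) {
			napi_complete(napi);
			/* re-enable the device's RX interrupt here */
		}
		return work;
	}

+
+/* Network device is going away, flush any packets still pending
+ * Called with irqs disabled.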
+ */ +static void flush_backlog(void *arg) +{ + struct net_device *dev = arg; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct sk_buff *skb, *tmp; + + rps_lock(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->input_pkt_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } + rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->process_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } +} + +static int napi_gro_complete(struct sk_buff *skb) +{ + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct list_head *head = &offload_base; + int err = -ENOENT; + + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + + err = ptype->callbacks.gro_complete(skb, 0); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return NET_RX_SUCCESS; + } + out: + return netif_receive_skb_internal(skb); +} + +/* napi->gro_list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + struct sk_buff *skb, *prev = NULL; + + /* scan list and build reverse chain */ + for (skb = napi->gro_list; skb != NULL; skb = skb->next) { + skb->prev = prev; + prev = skb; + } + + for (skb = prev; skb; skb = prev) { + skb->next = NULL; + + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + + prev = skb->prev; + napi_gro_complete(skb); + napi->gro_count--; + } + + napi->gro_list = NULL; +} +EXPORT_SYMBOL(napi_gro_flush); + +static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); + + for (p = napi->gro_list; p; p = p->next) { + unsigned long diffs; + + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + NAPI_GRO_CB(p)->same_flow = !diffs; + } +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb_mac_header(skb) == skb_tail_pointer(skb) && + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + pinfo->frags[0].page_offset += grow; + skb_frag_size_sub(&pinfo->frags[0], grow); + + if 
(unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct list_head *head = &offload_base; + int same_flow; + enum gro_result ret; + int grow; + + if (!(skb->dev->features & NETIF_F_GRO)) + goto normal; + + if (skb_is_gso(skb) || skb_has_frag_list(skb)) + goto normal; + + gro_list_prepare(napi, skb); + NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->udp_mark = 0; + + pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); + break; + } rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + struct sk_buff *nskb = *pp; + + *pp = nskb->next; + nskb->next = NULL; + napi_gro_complete(nskb); + napi->gro_count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush) + goto normal; + + if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { + struct sk_buff *nskb = napi->gro_list; + + /* locate the end of the list to select the 'oldest' flow */ + while (nskb->next) { + pp = &nskb->next; + nskb = *pp; + } + *pp = NULL; + nskb->next = NULL; + napi_gro_complete(nskb); + } else { + napi->gro_count++; + } + NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + skb->next = napi->gro_list; + napi->gro_list = skb; + ret = GRO_HELD; + +pull: + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); +ok: return ret; + +normal: + ret = GRO_NORMAL; + goto pull; } -static int process_backlog(struct net_device *backlog_dev, int *budget) +struct packet_offload *gro_find_receive_by_type(__be16 type) { - int work = 0; - int quota = min(backlog_dev->quota, *budget); - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; - backlog_dev->weight = weight_p; - for (;;) { - struct sk_buff *skb; - struct net_device *dev; + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); + +static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +{ + switch (ret) { + case GRO_NORMAL: + if (netif_receive_skb_internal(skb)) + ret = GRO_DROP; + break; + + case GRO_DROP: + kfree_skb(skb); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == 
NAPI_GRO_FREE_STOLEN_HEAD) + kmem_cache_free(skbuff_head_cache, skb); + else + __kfree_skb(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + break; + } + + return ret; +} + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + trace_napi_gro_receive_entry(skb); + + skb_gro_reset_offset(skb); + + return napi_skb_finish(dev_gro_receive(napi, skb), skb); +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; + skb->dev = napi->dev; + skb->skb_iif = 0; + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + napi->skb = skb; + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) + ret = GRO_DROP; + break; + + case GRO_DROP: + case GRO_MERGED_FREE: + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + break; + } + + return ret; +} + +/* Upper GRO stack assumes network header starts at gro_offset=0. + * Drivers could call both napi_gro_frags() and napi_gro_receive(). + * We copy the ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header_fast(skb, 0); + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; + } + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; + + return skb; +} + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi_frags_skb(napi); + + if (!skb) + return GRO_DROP; + + trace_napi_gro_frags_entry(skb); + + return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); +} +EXPORT_SYMBOL(napi_gro_frags); + +/* + * net_rps_action_and_irq_enable sends any pending IPIs for RPS. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; - local_irq_disable(); - skb = __skb_dequeue(&queue->input_pkt_queue); - if (!skb) - goto job_done; local_irq_enable(); - dev = skb->dev; + /* Send pending IPIs to kick RPS processing on remote CPUs. 
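+ * Each softnet_data embeds a call_single_data (sd->csd) that the
+ * per-cpu init code is expected to have pointed at the RPS IPI
+ * handler beforehand, roughly (a sketch of wiring that lives in
+ * net_dev_init(), not in this function):
+ *
+ *	sd->csd.func = rps_trigger_softirq;	// raises NET_RX_SOFTIRQ remotely
+ *	sd->csd.info = sd;
+ *	sd->cpu = i;				// cpu owning this softnet_data
+ *
+ * so the loop below only has to fire the already-prepared IPI.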
*/ + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; - netif_receive_skb(skb); + if (cpu_online(remsd->cpu)) + smp_call_function_single_async(remsd->cpu, + &remsd->csd); + remsd = next; + } + } else +#endif + local_irq_enable(); +} - dev_put(dev); +static int process_backlog(struct napi_struct *napi, int quota) +{ + int work = 0; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); - work++; +#ifdef CONFIG_RPS + /* Check if we have pending IPIs; it's better to send them now + * rather than waiting for net_rx_action() to end. + */ + if (sd->rps_ipi_list) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } +#endif + napi->weight = weight_p; + local_irq_disable(); + while (1) { + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_irq_enable(); + __netif_receive_skb(skb); + local_irq_disable(); + input_queue_head_incr(sd); + if (++work >= quota) { + local_irq_enable(); + return work; + } + } + + rps_lock(sd); + if (skb_queue_empty(&sd->input_pkt_queue)) { + /* + * Inline a custom version of __napi_complete(). + * Only the current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), + * and we don't need an smp_mb() memory barrier. + */ + list_del(&napi->poll_list); + napi->state = 0; + rps_unlock(sd); - if (work >= quota || jiffies - start_time > 1) break; + } + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + rps_unlock(sd); } + local_irq_enable(); - backlog_dev->quota -= work; - *budget -= work; - return -1; + return work; +} -job_done: - backlog_dev->quota -= work; - *budget -= work; +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run + */ +void __napi_schedule(struct napi_struct *n) +{ + unsigned long flags; - list_del(&backlog_dev->poll_list); - smp_mb__before_clear_bit(); - netif_poll_enable(backlog_dev); + local_irq_save(flags); + ____napi_schedule(&__get_cpu_var(softnet_data), n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); - local_irq_enable(); - return 0; +void __napi_complete(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + BUG_ON(n->gro_list); + + list_del(&n->poll_list); + smp_mb__before_atomic(); + clear_bit(NAPI_STATE_SCHED, &n->state); +} +EXPORT_SYMBOL(__napi_complete); + +void napi_complete(struct napi_struct *n) +{ + unsigned long flags; + + /* + * don't let napi dequeue from the cpu poll list + * just in case it's running on a different cpu + */ + if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + return; + + napi_gro_flush(n, false); + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(napi_complete); + +/* must be called under rcu_read_lock(), as we don't take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id; we also skip an id that is already taken. + * We expect both events to be extremely rare. + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = 
++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning: the caller is responsible for making sure an RCU grace period + * is respected before freeing the memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + INIT_LIST_HEAD(&napi->poll_list); + napi->gro_count = 0; + napi->gro_list = NULL; + napi->skb = NULL; + napi->poll = poll; + if (weight > NAPI_POLL_WEIGHT) + pr_err_once("netif_napi_add() called with weight %d on device %s\n", + weight, dev->name); + napi->weight = weight; + list_add(&napi->dev_list, &dev->napi_list); + napi->dev = dev; +#ifdef CONFIG_NETPOLL + spin_lock_init(&napi->poll_lock); + napi->poll_owner = -1; +#endif + set_bit(NAPI_STATE_SCHED, &napi->state); } +EXPORT_SYMBOL(netif_napi_add); + +void netif_napi_del(struct napi_struct *napi) +{ + list_del_init(&napi->dev_list); + napi_free_frags(napi); + + kfree_skb_list(napi->gro_list); + napi->gro_list = NULL; + napi->gro_count = 0; +} +EXPORT_SYMBOL(netif_napi_del); static void net_rx_action(struct softirq_action *h) { - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; local_irq_disable(); - while (!list_empty(&queue->poll_list)) { - struct net_device *dev; + while (!list_empty(&sd->poll_list)) { + struct napi_struct *n; + int work, weight; - if (budget <= 0 || jiffies - start_time > 1) + /* If the softirq window is exhausted then punt. + * Allow this to run for 2 jiffies, which allows + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; local_irq_enable(); - dev = list_entry(queue->poll_list.next, - struct net_device, poll_list); - have = netpoll_poll_lock(dev); + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); - if (dev->quota <= 0 || dev->poll(dev, &budget)) { - netpoll_poll_unlock(have); - local_irq_disable(); - list_del(&dev->poll_list); - list_add_tail(&dev->poll_list, &queue->poll_list); - if (dev->quota < 0) - dev->quota += dev->weight; - else - dev->quota = dev->weight; - } else { - netpoll_poll_unlock(have); - dev_put(dev); - local_irq_disable(); + have = netpoll_poll_lock(n); + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. 
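+ *
+ * For reference, the driver-side half of this contract is a poll
+ * callback shaped like the following (a minimal sketch, not taken
+ * from any real driver; foo_clean_rx() is hypothetical):
+ *
+ *	static int foo_poll(struct napi_struct *napi, int budget)
+ *	{
+ *		int done = foo_clean_rx(napi, budget);	// process up to budget skbs
+ *
+ *		if (done < budget)
+ *			napi_complete(napi);	// then re-enable device irqs
+ *		return done;
+ *	}
+ *
+ * Returning the full budget means "more work pending" and keeps the
+ * instance on sd->poll_list, which is what the code below handles.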
+ */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); } + + WARN_ON_ONCE(work > weight); + + budget -= work; + + local_irq_disable(); + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(work == weight)) { + if (unlikely(napi_disable_pending(n))) { + local_irq_enable(); + napi_complete(n); + local_irq_disable(); + } else { + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + local_irq_enable(); + napi_gro_flush(n, HZ >= 1000); + local_irq_disable(); + } + list_move_tail(&n->poll_list, &sd->poll_list); + } + } + + netpoll_poll_unlock(have); } out: - local_irq_enable(); + net_rps_action_and_irq_enable(sd); + +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + dma_issue_pending_all(); +#endif + return; softnet_break: - __get_cpu_var(netdev_rx_stat).time_squeeze++; + sd->time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } -static gifconf_func_t * gifconf_list [NPROTO]; +struct netdev_adjacent { + struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ + bool master; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + + /* private field for the users */ + void *private; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; + } + return NULL; +} /** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler + * netdev_has_upper_dev - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks only immediate upper device, + * not through a complete stack of devices. The caller must hold the RTNL lock. */ -int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +bool netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) { - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} + ASSERT_RTNL(); + return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); +} +EXPORT_SYMBOL(netdev_has_upper_dev); -/* - * Map an interface index to its name (SIOCGIFNAME) +/** + * netdev_has_any_upper_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock. */ +static bool netdev_has_any_upper_dev(struct net_device *dev) +{ + ASSERT_RTNL(); -/* - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. Without - * it, we would have to search all the interfaces to find a - * match. 
--pb + return !list_empty(&dev->all_adj_list.upper); +} + +/** + * netdev_master_upper_dev_get - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RTNL lock. + */ +struct net_device *netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master)) + return upper->dev; + return NULL; +} +EXPORT_SYMBOL(netdev_master_upper_dev_get); -static int dev_ifname(struct ifreq __user *arg) +void *netdev_adjacent_get_private(struct list_head *adj_list) { - struct net_device *dev; - struct ifreq ifr; + struct netdev_adjacent *adj; - /* - * Fetch the caller's info block. - */ + adj = list_entry(adj_list, struct netdev_adjacent, list); - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); - read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifr.ifr_ifindex); - if (!dev) { - read_unlock(&dev_base_lock); - return -ENODEV; - } +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; - strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; } +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. */ +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->all_adj_list.upper) + return NULL; + + *iter = &upper->list; -static int dev_ifconf(char __user *arg) + return upper->dev; +} +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchanged. 
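+ *
+ * Illustrative use (a sketch only; handle_lower_private() is a
+ * hypothetical consumer, and RTNL is assumed held):
+ *
+ *	struct list_head *iter = dev->adj_list.lower.next;
+ *	void *priv;
+ *
+ *	while ((priv = netdev_lower_get_next_private(dev, &iter)))
+ *		handle_lower_private(priv);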
+ */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) { - struct ifconf ifc; - struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + struct netdev_adjacent *lower; - /* - * Fetch the caller's info block. - */ + lower = list_entry(*iter, struct netdev_adjacent, list); - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; + if (&lower->list == &dev->adj_list.lower) + return NULL; - pos = ifc.ifc_buf; - len = ifc.ifc_len; + *iter = lower->list.next; - /* - * Loop over the interfaces, and write an info block for each. - */ + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); - total = 0; - for (dev = dev_base; dev; dev = dev->next) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } - } - } +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; - /* - * All done. Write the updated control block back to the caller. - */ - ifc.ifc_len = total; + WARN_ON_ONCE(!rcu_read_lock_held()); - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->private; } +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); -#ifdef CONFIG_PROC_FS -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold the RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchanged. */ -static __inline__ struct net_device *dev_get_idx(loff_t pos) +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { - struct net_device *dev; - loff_t i; + struct netdev_adjacent *lower; - for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); + lower = list_entry((*iter)->next, struct netdev_adjacent, list); - return i == pos ? dev : NULL; + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; } +EXPORT_SYMBOL(netdev_lower_get_next); -void *dev_seq_start(struct seq_file *seq, loff_t *pos) +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) { - read_lock(&dev_base_lock); - return *pos ? 
dev_get_idx(*pos - 1) : SEQ_START_TOKEN; + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; } +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +/** + * netdev_master_upper_dev_get_rcu - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RCU read lock. + */ +struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - ++*pos; - return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; + struct netdev_adjacent *upper; + + upper = list_first_or_null_rcu(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (upper && likely(upper->master)) + return upper->dev; + return NULL; } +EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -void dev_seq_stop(struct seq_file *seq, void *v) +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) { - read_unlock(&dev_base_lock); + char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); } -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +#define netdev_adjacent_is_neigh_list(dev, dev_list) \ + (dev_list == &dev->adj_list.upper || \ + dev_list == &dev->adj_list.lower) + +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list, + void *private, bool master) { - if (dev->get_stats) { - struct net_device_stats *stats = dev->get_stats(dev); + struct netdev_adjacent *adj; + int ret; - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); - } else - seq_printf(seq, "%6s: No statistics available.\n", dev->name); + adj = __netdev_find_adj(dev, adj_dev, dev_list); + + if (adj) { + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->ref_nr = 1; + adj->private = private; + dev_hold(adj_dev); + + pr_debug("dev_hold for %s, because of link added from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + + if (netdev_adjacent_is_neigh_list(dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); + if (ret) + goto free_adj; + } + + /* Ensure that master link is always the first item in list. 
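+ * netdev_master_upper_dev_get() and its RCU variant depend on that
+ * ordering; in essence they just do:
+ *
+ *	upper = list_first_entry(&dev->adj_list.upper,
+ *				 struct netdev_adjacent, list);
+ *	return upper->master ? upper->dev : NULL;
+ *
+ * so a master link must never sit behind ordinary upper links.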
*/ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } + + return 0; + +remove_symlinks: + if (netdev_adjacent_is_neigh_list(dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: + kfree(adj); + dev_put(adj_dev); + + return ret; } -/* - * Called from the PROCfs module. This now uses the new arbitrary sized - * /proc/net interface to create /proc/net/dev - */ -static int dev_seq_show(struct seq_file *seq, void *v) +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) { - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); - else - dev_seq_printf_stats(seq, v); - return 0; + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(dev, adj_dev, dev_list); + + if (!adj) { + pr_err("tried to remove device %s from %s\n", + dev->name, adj_dev->name); + BUG(); + } + + if (adj->ref_nr > 1) { + pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, + adj->ref_nr-1); + adj->ref_nr--; + return; + } + + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + + if (netdev_adjacent_is_neigh_list(dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); } -static struct netif_rx_stats *softnet_get_online(loff_t *pos) +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) { - struct netif_rx_stats *rc = NULL; + int ret; - while (*pos < NR_CPUS) - if (cpu_online(*pos)) { - rc = &per_cpu(netdev_rx_stat, *pos); - break; - } else - ++*pos; - return rc; + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, + master); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, + false); + if (ret) { + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + return ret; + } + + return 0; } -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +static int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return softnet_get_online(pos); + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + NULL, false); } -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { - ++*pos; - return softnet_get_online(pos); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, down_list); } -static void softnet_seq_stop(struct seq_file *seq, void *v) +static void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); } -static int softnet_seq_show(struct seq_file *seq, void *v) +static int 
__netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) { - struct netif_rx_stats *s = v; + int ret = __netdev_adjacent_dev_link(dev, upper_dev); + + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); + if (ret) { + __netdev_adjacent_dev_unlink(dev, upper_dev); + return ret; + } - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, 0, - 0, 0, 0, 0, /* was fastroute */ - s->cpu_collision ); return 0; } -static struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_seq_show, -}; - -static int dev_seq_open(struct inode *inode, struct file *file) +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) { - return seq_open(file, &dev_seq_ops); + __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); } -static struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master, + void *private) +{ + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; -static struct seq_operations softnet_seq_ops = { - .start = softnet_seq_start, - .next = softnet_seq_next, - .stop = softnet_seq_stop, - .show = softnet_seq_show, -}; + ASSERT_RTNL(); -static int softnet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &softnet_seq_ops); -} + if (dev == upper_dev) + return -EBUSY; -static struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; + /* To prevent loops, check if dev is not upper device to upper_dev. */ + if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) + return -EBUSY; -#ifdef WIRELESS_EXT -extern int wireless_proc_init(void); -#else -#define wireless_proc_init() 0 -#endif + if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) + return -EEXIST; + + if (master && netdev_master_upper_dev_get(dev)) + return -EBUSY; + + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + master); + if (ret) + return ret; + + /* Now that we linked these devs, make all the upper_dev's + * all_adj_list.upper visible to every dev's all_adj_list.lower and + * vice versa, and don't forget the devices themselves. All of these + * links are non-neighbours. 
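+ *
+ * Concrete example (illustrative only): suppose vlan0 already sits on
+ * top of eth0, team0 already sits on top of bond0, and we now link
+ * vlan0 (dev) under bond0 (upper_dev). Afterwards we must have:
+ *
+ *	eth0->all_adj_list.upper:  vlan0, bond0, team0
+ *	team0->all_adj_list.lower: bond0, vlan0, eth0
+ *
+ * which is exactly what the three loops below build, link by link.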
+ */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } + + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } -static int __init dev_proc_init(void) + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + return ret; +} + +/** + * netdev_upper_dev_link - Add a link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. The caller must hold + * the RTNL lock. On a failure a negative errno code is returned. + * On success the reference counts are adjusted and the function + * returns zero. + */ +int netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - int rc = -ENOMEM; + return __netdev_upper_dev_link(dev, upper_dev, false, NULL); +} +EXPORT_SYMBOL(netdev_upper_dev_link); - if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; - if (wireless_proc_init()) - goto out_softnet; - rc = 0; -out: - return rc; -out_softnet: - proc_net_remove("softnet_stat"); -out_dev: - proc_net_remove("dev"); - goto out; +/** + * netdev_master_upper_dev_link - Add a master link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. In this case, only + * one master upper device can be linked, although other non-master devices + * might be linked as well. The caller must hold the RTNL lock. + * On a failure a negative errno code is returned. On success the reference + * counts are adjusted and the function returns zero. 
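+ *
+ * Typical use from a master-type driver when enslaving a port
+ * (a sketch only; error unwinding elided):
+ *
+ *	ASSERT_RTNL();
+ *	err = netdev_master_upper_dev_link(port_dev, master_dev);
+ *	if (err)
+ *		return err;	// -EBUSY if a master is already linked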
+ */ +int netdev_master_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, NULL); } -#else -#define dev_proc_init() 0 -#endif /* CONFIG_PROC_FS */ +EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); /** - * netdev_set_master - set up master/slave pair - * @slave: slave device - * @master: new master device + * netdev_upper_dev_unlink - Removes a link to upper device + * @dev: device + * @upper_dev: upper device to remove * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success the reference counts - * are adjusted, %RTM_NEWLINK is sent to the routing socket and the - * function returns zero. + * Removes a link to device which is upper to this one. The caller must hold + * the RTNL lock. */ -int netdev_set_master(struct net_device *slave, struct net_device *master) +void netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_adjacent *i, *j; + ASSERT_RTNL(); + + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* also remove the devices themselves from the lower/upper + * device lists + */ + list_for_each_entry(i, &dev->all_adj_list.lower, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); +} +EXPORT_SYMBOL(netdev_upper_dev_unlink); + +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) { - struct net_device *old = slave->master; + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; ASSERT_RTNL(); - if (master) { - if (old) - return -EBUSY; - dev_hold(master); + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; } - slave->master = master; - - synchronize_net(); + if 
(type_check(dev)) + max_nest++; - if (old) - dev_put(old); + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); - if (master) - slave->flags |= IFF_SLAVE; - else - slave->flags &= ~IFF_SLAVE; +static void dev_change_rx_flags(struct net_device *dev, int flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_rx_flags) + ops->ndo_change_rx_flags(dev, flags); +} + +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) +{ + unsigned int old_flags = dev->flags; + kuid_t uid; + kgid_t gid; - rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); + ASSERT_RTNL(); + + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", + dev->name); + return -EOVERFLOW; + } + } + if (dev->flags != old_flags) { + pr_info("device %s %s promiscuous mode\n", + dev->name, + dev->flags & IFF_PROMISC ? "entered" : "left"); + if (audit_enabled) { + current_uid_gid(&uid, &gid); + audit_log(current->audit_context, GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + audit_get_sessionid(current)); + } + + dev_change_rx_flags(dev, IFF_PROMISC); + } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -2091,25 +5264,56 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) * @dev: device * @inc: modifier * - * Add or remove promsicuity from a device. While the count in the device + * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. */ -void dev_set_promiscuity(struct net_device *dev, int inc) +int dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; + int err; - if ((dev->promiscuity += inc) == 0) - dev->flags &= ~IFF_PROMISC; - else - dev->flags |= IFF_PROMISC; - if (dev->flags != old_flags) { - dev_mc_upload(dev); - printk(KERN_INFO "device %s %s promiscuous mode\n", - dev->name, (dev->flags & IFF_PROMISC) ? "entered" : - "left"); + err = __dev_set_promiscuity(dev, inc, true); + if (err < 0) + return err; + if (dev->flags != old_flags) + dev_set_rx_mode(dev); + return err; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) +{ + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + + ASSERT_RTNL(); + + dev->flags |= IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + pr_warn("%s: allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n", + dev->name); + return -EOVERFLOW; + } + } + if (dev->flags ^ old_flags) { + dev_change_rx_flags(dev, IFF_ALLMULTI); + dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); } + return 0; } /** @@ -2122,39 +5326,93 @@ void dev_set_promiscuity(struct net_device *dev, int inc) * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. */ -void dev_set_allmulti(struct net_device *dev, int inc) +int dev_set_allmulti(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + return __dev_set_allmulti(dev, inc, true); +} +EXPORT_SYMBOL(dev_set_allmulti); - dev->flags |= IFF_ALLMULTI; - if ((dev->allmulti += inc) == 0) - dev->flags &= ~IFF_ALLMULTI; - if (dev->flags ^ old_flags) - dev_mc_upload(dev); +/* + * Upload unicast and multicast address lists to device and + * configure RX filtering. When the device doesn't support unicast + * filtering it is put in promiscuous mode while unicast addresses + * are present. + */ +void __dev_set_rx_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* dev_open will call this function so the list will stay sane. */ + if (!(dev->flags&IFF_UP)) + return; + + if (!netif_device_present(dev)) + return; + + if (!(dev->priv_flags & IFF_UNICAST_FLT)) { + /* Unicast address changes may only happen under the rtnl, + * therefore calling __dev_set_promiscuity here is safe. + */ + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { + __dev_set_promiscuity(dev, 1, false); + dev->uc_promisc = true; + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { + __dev_set_promiscuity(dev, -1, false); + dev->uc_promisc = false; + } + } + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); +} + +void dev_set_rx_mode(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); } -unsigned dev_get_flags(const struct net_device *dev) +/** + * dev_get_flags - get flags reported to userspace + * @dev: device + * + * Get the combination of flag bits exported through APIs to userspace. + */ +unsigned int dev_get_flags(const struct net_device *dev) { - unsigned flags; + unsigned int flags; flags = (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI | - IFF_RUNNING)) | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); - if (netif_running(dev) && netif_carrier_ok(dev)) - flags |= IFF_RUNNING; + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } return flags; } +EXPORT_SYMBOL(dev_get_flags); -int dev_change_flags(struct net_device *dev, unsigned flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags) { + unsigned int old_flags = dev->flags; int ret; - int old_flags = dev->flags; + + ASSERT_RTNL(); /* * Set the flags on our device. @@ -2170,7 +5428,10 @@ int dev_change_flags(struct net_device *dev, unsigned flags) * Load in the correct multicast list now the flags have changed. */ - dev_mc_upload(dev); + if ((old_flags ^ flags) & IFF_MULTICAST) + dev_change_rx_flags(dev, IFF_MULTICAST); + + dev_set_rx_mode(dev); /* * Have we downed the interface? 
We handle IFF_UP ourselves @@ -2180,21 +5441,21 @@ int dev_change_flags(struct net_device *dev, unsigned flags) ret = 0; if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); if (!ret) - dev_mc_upload(dev); + dev_set_rx_mode(dev); } - if (dev->flags & IFF_UP && - ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | - IFF_VOLATILE))) - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); - if ((flags ^ dev->gflags) & IFF_PROMISC) { - int inc = (flags & IFF_PROMISC) ? +1 : -1; + int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; + dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -2202,20 +5463,84 @@ int dev_change_flags(struct net_device *dev, unsigned flags) IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { - int inc = (flags & IFF_ALLMULTI) ? +1 : -1; + int inc = (flags & IFF_ALLMULTI) ? 1 : -1; + + dev->gflags ^= IFF_ALLMULTI; + __dev_set_allmulti(dev, inc, false); + } + + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); } - if (old_flags ^ dev->flags) - rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + } +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on a device based on state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned int flags) +{ + int ret; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; + + ret = __dev_change_flags(dev, flags); + if (ret < 0) + return ret; + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes); + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + dev->mtu = new_mtu; + return 0; +} +/** + * dev_set_mtu - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * + * Change the maximum transfer size of the network device. 
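+ * Returns 0 on success or a negative errno code; interested parties
+ * may veto the change via the NETDEV_PRECHANGEMTU notifier, as the
+ * body below shows. Typically called under RTNL, e.g. (a sketch):
+ *
+ *	rtnl_lock();
+ *	err = dev_set_mtu(dev, 9000);	// request jumbo frames
+ *	rtnl_unlock();
+ *	if (err)
+ *		pr_warn("MTU change rejected: %d\n", err);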
+ */ int dev_set_mtu(struct net_device *dev, int new_mtu) { - int err; + int err, orig_mtu; if (new_mtu == dev->mtu) return 0; @@ -2227,405 +5552,444 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) if (!netif_device_present(dev)) return -ENODEV; - err = 0; - if (dev->change_mtu) - err = dev->change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; - if (!err && dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEMTU, dev); + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; + + orig_mtu = dev->mtu; + err = __dev_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. + */ + __dev_set_mtu(dev, orig_mtu); + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + } + } return err; } +EXPORT_SYMBOL(dev_set_mtu); +/** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/** + * dev_set_mac_address - Change Media Access Control Address + * @dev: device + * @sa: new address + * + * Change the hardware (MAC) address of the device + */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) { + const struct net_device_ops *ops = dev->netdev_ops; int err; - if (!dev->set_mac_address) + if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev->set_mac_address(dev, sa); - if (!err) - notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - return err; + err = ops->ndo_set_mac_address(dev, sa); + if (err) + return err; + dev->addr_assign_type = NET_ADDR_SET; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + add_device_randomness(dev->dev_addr, dev->addr_len); + return 0; } +EXPORT_SYMBOL(dev_set_mac_address); -/* - * Perform the SIOCxIFxxx calls. 
+/** + * dev_change_carrier - Change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier */ -static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +int dev_change_carrier(struct net_device *dev, bool new_carrier) { - int err; - struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + const struct net_device_ops *ops = dev->netdev_ops; - if (!dev) + if (!ops->ndo_change_carrier) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); +} +EXPORT_SYMBOL(dev_change_carrier); - switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = dev_get_flags(dev); - return 0; +/** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} +EXPORT_SYMBOL(dev_get_phys_port_id); - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; +/** + * dev_new_index - allocate an ifindex + * @net: the applicable net namespace + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(struct net *net) +{ + int ifindex = net->ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(net, ifindex)) + return net->ifindex = ifindex; + } +} - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; +/* Delayed registration/unregistration */ +static LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; +} - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); +static void rollback_registered_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(close_head); - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call this without ever having registered, + * as part of initialization unwind. Remove those + * devices and proceed with the remaining. 
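+ *
+ * i.e. tolerate a shared driver error path of the form (sketch
+ * only; the labels and flow are illustrative, not from a real
+ * driver):
+ *
+ *	err = register_netdevice(dev);
+ *	if (err)
+ *		goto err_unwind;
+ *	...
+ * err_unwind:
+ *	unregister_netdevice(dev);	// reg_state may still be NETREG_UNINITIALIZED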
+ */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); - return 0; + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + /* If device is running, close it first. */ + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head); - case SIOCSIFMAP: - if (dev->set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return dev->set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; - - case SIOCADDMULTI: - if (!dev->set_multicast_list || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); - - case SIOCDELMULTI: - if (!dev->set_multicast_list || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); - - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; + dev->reg_state = NETREG_UNREGISTERING; + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); - case SIOCSIFTXQLEN: - if (ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); + /* Notify protocols that we are about to destroy + this device. They should clean up all their state. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); /* - * Unknown or private ioctl + * Flush the unicast and multicast chains */ + dev_uc_flush(dev); + dev_mc_flush(dev); - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (dev->do_ioctl) { - if (netif_device_present(dev)) - err = dev->do_ioctl(dev, ifr, - cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + /* The notifier chain MUST detach all upper devices from us. 
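+ * A stacking driver typically reacts to that notifier along these
+ * lines (a sketch; foo_port_check() and foo_upper() are hypothetical):
+ *
+ *	case NETDEV_UNREGISTER:
+ *		if (foo_port_check(dev))
+ *			netdev_upper_dev_unlink(dev, foo_upper(dev));
+ *		break;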
*/ + WARN_ON(netdev_has_any_upper_dev(dev)); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif } - return err; + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); } -/* - * This function handles all "interface"-type I/O control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ +static void rollback_registered(struct net_device *dev) +{ + LIST_HEAD(single); -/** - * dev_ioctl - network device ioctl - * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space - * - * Issue ioctl functions to devices. This is normally called by the - * user space syscall interfaces but can sometimes be useful for - * other purposes. The return value is the return from the syscall if - * positive or a negative errno code on error. - */ + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); + list_del(&single); +} -int dev_ioctl(unsigned int cmd, void __user *arg) +static netdev_features_t netdev_fix_features(struct net_device *dev, + netdev_features_t features) { - struct ifreq ifr; - int ret; - char *colon; + /* Fix illegal checksum combinations */ + if ((features & NETIF_F_HW_CSUM) && + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed HW and IP checksum settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ + /* TSO requires that SG is present as well. */ + if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); + features &= ~NETIF_F_ALL_TSO; + } - if (cmd == SIOCGIFCONF) { - rtnl_shlock(); - ret = dev_ifconf((char __user *) arg); - rtnl_shunlock(); - return ret; + if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IP_CSUM)) { + netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO; + features &= ~NETIF_F_TSO_ECN; } - if (cmd == SIOCGIFNAME) - return dev_ifname((struct ifreq __user *)arg); - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; + if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IPV6_CSUM)) { + netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO6; + } - ifr.ifr_name[IFNAMSIZ-1] = 0; + /* TSO ECN requires that TSO is present as well. */ + if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) + features &= ~NETIF_F_TSO_ECN; - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; + } - /* - * See which interface the caller is talking about. - */ + /* UFO needs SG and checksumming */ + if (features & NETIF_F_UFO) { + /* maybe split UFO into V4 and V6? */ + if (!((features & NETIF_F_GEN_CSUM) || + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no checksum offload features.\n"); + features &= ~NETIF_F_UFO; + } - switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. 
- * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(ifr.ifr_name); - read_lock(&dev_base_lock); - ret = dev_ifsioc(&ifr, cmd); - read_unlock(&dev_base_lock); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; + if (!(features & NETIF_F_SG)) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); + features &= ~NETIF_F_UFO; + } + } - case SIOCETHTOOL: - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(&ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; +#ifdef CONFIG_NET_RX_BUSY_POLL + if (dev->netdev_ops->ndo_busy_poll) + features |= NETIF_F_BUSY_POLL; + else +#endif + features &= ~NETIF_F_BUSY_POLL; - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; + return features; +} - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - return ret; +int __netdev_update_features(struct net_device *dev) +{ + netdev_features_t features; + int err = 0; - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -EINVAL; + ASSERT_RTNL(); - /* - * Unknown or private ioctl. 
- */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } -#ifdef WIRELESS_EXT - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - /* If command is `set a parameter', or - * `get the encoding parameters', check if - * the user has the right to do it */ - if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - } - dev_load(ifr.ifr_name); - rtnl_lock(); - /* Follow me in net/core/wireless.c */ - ret = wireless_process_ioctl(&ifr, cmd); - rtnl_unlock(); - if (IW_IS_GET(cmd) && - copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } -#endif /* WIRELESS_EXT */ - return -EINVAL; + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + if (dev->features == features) + return 0; + + netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", + &dev->features, &features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + + if (unlikely(err < 0)) { + netdev_err(dev, + "set_features() failed (%d); wanted %pNF, left %pNF\n", + err, &features, &dev->features); + return -1; } + + if (!err) + dev->features = features; + + return 1; } +/** + * netdev_update_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications if it + * has changed. Should be called after driver or hardware dependent + * conditions might have changed that influence the features. + */ +void netdev_update_features(struct net_device *dev) +{ + if (__netdev_update_features(dev)) + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_update_features); /** - * dev_new_index - allocate an ifindex + * netdev_change_features - recalculate device features + * @dev: the device to check * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * Recalculate dev->features set and send notifications even + * if they have not changed. Should be called instead of + * netdev_update_features() if also dev->vlan_features might + * have changed to allow the changes to be propagated to stacked + * VLAN devices. */ -static int dev_new_index(void) +void netdev_change_features(struct net_device *dev) { - static int ifindex; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(ifindex)) - return ifindex; + __netdev_update_features(dev); + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features); + +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). 
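
The split between the two wrappers above is the whole point: __netdev_update_features() reports whether anything changed (reduced to 0/1 here; the kernel version can also return a negative value when the driver rejects the set), netdev_update_features() notifies only on change, and netdev_change_features() notifies unconditionally. A hypothetical user-space reduction:

#include <stdio.h>
#include <stdint.h>

static uint32_t current_features;

/* returns 1 if the effective feature set changed, 0 if not */
static int update_features(uint32_t wanted)
{
    if (wanted == current_features)
        return 0;
    current_features = wanted;
    return 1;
}

static void notify_features_changed(void)
{
    puts("features changed event");
}

/* notify only on change, like netdev_update_features() */
static void update(uint32_t wanted)
{
    if (update_features(wanted))
        notify_features_changed();
}

/* always notify, like netdev_change_features() */
static void change(uint32_t wanted)
{
    update_features(wanted);
    notify_features_changed();
}

int main(void)
{
    update(0x3);   /* notifies */
    update(0x3);   /* silent: nothing changed */
    change(0x3);   /* notifies regardless */
    return 0;
}
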
+ */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); } } +EXPORT_SYMBOL(netif_stacked_transfer_operstate); -static int dev_boot_phase = 1; +#ifdef CONFIG_SYSFS +static int netif_alloc_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; -/* Delayed registration/unregisteration */ -static DEFINE_SPINLOCK(net_todo_list_lock); -static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); + BUG_ON(count < 1); + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) + return -ENOMEM; + + dev->_rx = rx; -static inline void net_set_todo(struct net_device *dev) + for (i = 0; i < count; i++) + rx[i].dev = dev; + return 0; +} +#endif + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) { - spin_lock(&net_todo_list_lock); - list_add_tail(&dev->todo_list, &net_todo_list); - spin_unlock(&net_todo_list_lock); + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +#ifdef CONFIG_BQL + dql_init(&queue->dql, HZ); +#endif +} + +static void netif_free_tx_queues(struct net_device *dev) +{ + kvfree(dev->_tx); +} + +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); + + BUG_ON(count < 1 || count > 0xffff); + + tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!tx) { + tx = vzalloc(sz); + if (!tx) + return -ENOMEM; + } + dev->_tx = tx; + + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); + + return 0; } /** @@ -2647,98 +6011,89 @@ static inline void net_set_todo(struct net_device *dev) int register_netdevice(struct net_device *dev) { - struct hlist_head *head; - struct hlist_node *p; int ret; + struct net *net = dev_net(dev); BUG_ON(dev_boot_phase); ASSERT_RTNL(); + might_sleep(); + /* When net_device's are persistent, this will be fatal. 
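
netif_alloc_netdev_queues() above tries kzalloc() and falls back to vzalloc(), with kvfree() later accepting a pointer from either allocator. The same try-cheap-then-fall-back shape in portable C, with a static pool in place of the slab and the heap in place of vmalloc (alloc_fallback/free_fallback are invented names):

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

static unsigned char pool[1024];
static int pool_used;

/* try the cheap fixed pool first, fall back to the heap */
static void *alloc_fallback(size_t sz)
{
    if (!pool_used && sz <= sizeof(pool)) {
        pool_used = 1;
        memset(pool, 0, sz);
        return pool;
    }
    return calloc(1, sz);
}

/* a free routine that tolerates either origin, like kvfree() */
static void free_fallback(void *p)
{
    if (p == pool)
        pool_used = 0;
    else
        free(p);
}

int main(void)
{
    void *a = alloc_fallback(512);   /* served from the pool */
    void *b = alloc_fallback(512);   /* pool busy -> heap */

    printf("a=%p b=%p\n", a, b);
    free_fallback(b);
    free_fallback(a);
    return 0;
}
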
*/ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + BUG_ON(!net); - spin_lock_init(&dev->queue_lock); - spin_lock_init(&dev->xmit_lock); - dev->xmit_lock_owner = -1; -#ifdef CONFIG_NET_CLS_ACT - spin_lock_init(&dev->ingress_lock); -#endif - - ret = alloc_divert_blk(dev); - if (ret) - goto out; + spin_lock_init(&dev->addr_list_lock); + netdev_set_addr_lockdep_class(dev); dev->iflink = -1; + ret = dev_get_valid_name(net, dev, dev->name); + if (ret < 0) + goto out; + /* Init, if this function is available */ - if (dev->init) { - ret = dev->init(dev); + if (dev->netdev_ops->ndo_init) { + ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; - goto out_err; + goto out; } } - - if (!dev_valid_name(dev->name)) { + + if (((dev->hw_features | dev->features) & + NETIF_F_HW_VLAN_CTAG_FILTER) && + (!dev->netdev_ops->ndo_vlan_rx_add_vid || + !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { + netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; - goto out_err; + goto err_uninit; } - dev->ifindex = dev_new_index(); + ret = -EBUSY; + if (!dev->ifindex) + dev->ifindex = dev_new_index(net); + else if (__dev_get_by_index(net, dev->ifindex)) + goto err_uninit; + if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Check for existence of name */ - head = dev_name_hash(dev->name); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; - goto out_err; - } - } + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). + */ + dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->features |= NETIF_F_SOFT_FEATURES; + dev->wanted_features = dev->features & dev->hw_features; - /* Fix illegal SG+CSUM combinations. */ - if ((dev->features & NETIF_F_SG) && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) { - printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", - dev->name); - dev->features &= ~NETIF_F_SG; + if (!(dev->flags & IFF_LOOPBACK)) { + dev->hw_features |= NETIF_F_NOCACHE_COPY; } - /* TSO requires that SG is present as well. */ - if ((dev->features & NETIF_F_TSO) && - !(dev->features & NETIF_F_SG)) { - printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", - dev->name); - dev->features &= ~NETIF_F_TSO; - } - if (dev->features & NETIF_F_UFO) { - if (!(dev->features & NETIF_F_HW_CSUM)) { - printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " - "NETIF_F_HW_CSUM feature.\n", - dev->name); - dev->features &= ~NETIF_F_UFO; - } - if (!(dev->features & NETIF_F_SG)) { - printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " - "NETIF_F_SG feature.\n", - dev->name); - dev->features &= ~NETIF_F_UFO; - } - } + /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. + */ + dev->vlan_features |= NETIF_F_HIGHDMA; - /* - * nil rebuild_header routine, - * that should be never called and used as just bug trap. + /* Make NETIF_F_SG inheritable to tunnel devices. */ + dev->hw_enc_features |= NETIF_F_SG; - if (!dev->rebuild_header) - dev->rebuild_header = default_rebuild_header; + /* Make NETIF_F_SG inheritable to MPLS. 
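
register_netdevice() above assigns dev->ifindex via dev_new_index(), whose old form is visible earlier in this diff: keep incrementing, wrap past zero, skip values still in use. A bounded user-space version (the kernel loops under RTNL until a free slot appears; this sketch gives up after one full scan and tracks "in use" itself):

#include <stdio.h>

#define MAX_IDS 8
static int in_use[MAX_IDS + 1];   /* indices 1..MAX_IDS */

/* unique index allocator that wraps past zero safely */
static int new_index(void)
{
    static int last;
    int i;

    for (i = 0; i < MAX_IDS; i++) {
        if (++last > MAX_IDS)   /* wrap, never handing out 0 */
            last = 1;
        if (!in_use[last]) {
            in_use[last] = 1;
            return last;
        }
    }
    return -1;   /* exhausted */
}

int main(void)
{
    printf("%d %d %d\n", new_index(), new_index(), new_index());
    return 0;
}
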
+ */ + dev->mpls_features |= NETIF_F_SG; + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + + ret = netdev_register_kobject(dev); + if (ret) + goto err_uninit; + dev->reg_state = NETREG_REGISTERED; + + __netdev_update_features(dev); /* * Default initial state at registry is that the @@ -2747,30 +6102,85 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); - dev->next = NULL; + linkwatch_init_dev(dev); + dev_init_scheduler(dev); - write_lock_bh(&dev_base_lock); - *dev_tail = dev; - dev_tail = &dev->next; - hlist_add_head(&dev->name_hlist, head); - hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); dev_hold(dev); - dev->reg_state = NETREG_REGISTERING; - write_unlock_bh(&dev_base_lock); + list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); - /* Notify protocols, that a new device appeared. */ - notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + /* If the device has permanent device address, driver should + * set dev_addr and also addr_assign_type should be set to + * NET_ADDR_PERM (default value). + */ + if (dev->addr_assign_type == NET_ADDR_PERM) + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); - /* Finish registration after unlock */ - net_set_todo(dev); - ret = 0; + /* Notify protocols, that a new device appeared. */ + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + ret = notifier_to_errno(ret); + if (ret) { + rollback_registered(dev); + dev->reg_state = NETREG_UNREGISTERED; + } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; -out_err: - free_divert_blk(dev); + +err_uninit: + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); goto out; } +EXPORT_SYMBOL(register_netdevice); + +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + /** * register_netdev - register a network device @@ -2781,7 +6191,7 @@ out_err: * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. 
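
The NETDEV_REGISTER notification above can veto registration: the chain verdict is converted with notifier_to_errno() and a nonzero result triggers rollback_registered(). A simplified analogue with a plain function-pointer chain (the kernel encodes the verdict in NOTIFY_* bits rather than returning a raw errno, so treat this as a loose sketch):

#include <stdio.h>

typedef int (*notifier_fn)(const char *event);

static int log_notifier(const char *event)
{
    printf("seen event %s\n", event);
    return 0;
}

static int veto_notifier(const char *event)
{
    /* a subscriber may veto by returning nonzero */
    return event[0] == 'X' ? -1 : 0;
}

static notifier_fn chain[] = { log_notifier, veto_notifier };

static int call_chain(const char *event)
{
    size_t i;

    for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
        int err = chain[i](event);
        if (err)
            return err;   /* stop and report the veto */
    }
    return 0;
}

int main(void)
{
    if (call_chain("REGISTER") == 0)
        puts("registered");
    if (call_chain("Xbad") != 0)
        puts("vetoed -> roll back");
    return 0;
}
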
* - * This is a wrapper around register_netdev that takes the rtnl semaphore + * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. */ @@ -2790,35 +6200,25 @@ int register_netdev(struct net_device *dev) int err; rtnl_lock(); - - /* - * If the name is a format string the caller wants us to do a - * name allocation. - */ - if (strchr(dev->name, '%')) { - err = dev_alloc_name(dev, dev->name); - if (err < 0) - goto out; - } - - /* - * Back compatibility hook. Kill this one in 2.5 - */ - if (dev->name[0] == 0 || dev->name[0] == ' ') { - err = dev_alloc_name(dev, "eth%d"); - if (err < 0) - goto out; - } - err = register_netdevice(dev); -out: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_netdev); -/* +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + +/** * netdev_wait_allrefs - wait until all references are gone. + * @dev: target net_device * * This is called when unregistering network devices. * @@ -2826,21 +6226,30 @@ EXPORT_SYMBOL(register_netdev); * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly - * call dev_put. + * call dev_put. */ static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; + + linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { - rtnl_shlock(); + rtnl_lock(); /* Rebroadcast unregister notification */ - notifier_call_chain(&netdev_chain, - NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + __rtnl_unlock(); + rcu_barrier(); + rtnl_lock(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -2852,18 +6261,18 @@ static void netdev_wait_allrefs(struct net_device *dev) linkwatch_run_queue(); } - rtnl_shunlock(); + __rtnl_unlock(); rebroadcast_time = jiffies; } msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { - printk(KERN_EMERG "unregister_netdevice: " - "waiting for %s to become free. Usage " - "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, refcnt); warning_time = jiffies; } } @@ -2883,274 +6292,522 @@ static void netdev_wait_allrefs(struct net_device *dev) * free_netdev(y1); * free_netdev(y2); * - * We are invoked by rtnl_unlock() after it drops the semaphore. + * We are invoked by rtnl_unlock(). * This allows us to deal with problems: - * 1) We can create/delete sysfs objects which invoke hotplug + * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. + * + * We must not return until all unregister events added during + * the interval the lock was held have been completed. 
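
netdev_refcnt_read() above works because dev->pcpu_refcnt keeps one signed counter per CPU: a dev_hold() on one CPU and a dev_put() on another leave individual entries negative, and only the sum across all CPUs is meaningful. Reduced to plain arrays:

#include <stdio.h>

#define NCPU 4
static int per_cpu_refcnt[NCPU];

/* the sum of all per-CPU deltas; individual entries may be negative */
static int refcnt_read(void)
{
    int cpu, sum = 0;

    for (cpu = 0; cpu < NCPU; cpu++)
        sum += per_cpu_refcnt[cpu];
    return sum;
}

int main(void)
{
    per_cpu_refcnt[0] += 2;   /* two holds on CPU 0 */
    per_cpu_refcnt[3] -= 1;   /* one put on CPU 3 */
    printf("refcnt = %d\n", refcnt_read());   /* prints 1 */
    return 0;
}
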
*/ -static DECLARE_MUTEX(net_todo_run_mutex); void netdev_run_todo(void) { - struct list_head list = LIST_HEAD_INIT(list); - int err; + struct list_head list; + /* Snapshot list, allow later requests */ + list_replace_init(&net_todo_list, &list); - /* Need to guard against multiple cpu's getting out of order. */ - down(&net_todo_run_mutex); + __rtnl_unlock(); - /* Not safe to do outside the semaphore. We must not return - * until all unregister events invoked by the local processor - * have been completed (either by this todo run, or one on - * another cpu). - */ - if (list_empty(&net_todo_list)) - goto out; - /* Snapshot list, allow later requests */ - spin_lock(&net_todo_list_lock); - list_splice_init(&net_todo_list, &list); - spin_unlock(&net_todo_list_lock); - + /* Wait for rcu callbacks to finish before next phase */ + if (!list_empty(&list)) + rcu_barrier(); + while (!list_empty(&list)) { struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); + = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); - switch(dev->reg_state) { - case NETREG_REGISTERING: - err = netdev_register_sysfs(dev); - if (err) - printk(KERN_ERR "%s: failed sysfs registration (%d)\n", - dev->name, err); - dev->reg_state = NETREG_REGISTERED; - break; + rtnl_lock(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + __rtnl_unlock(); - case NETREG_UNREGISTERING: - netdev_unregister_sysfs(dev); - dev->reg_state = NETREG_UNREGISTERED; + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { + pr_err("network todo '%s' but state %d\n", + dev->name, dev->reg_state); + dump_stack(); + continue; + } - netdev_wait_allrefs(dev); + dev->reg_state = NETREG_UNREGISTERED; - /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); - BUG_TRAP(!dev->ip_ptr); - BUG_TRAP(!dev->ip6_ptr); - BUG_TRAP(!dev->dn_ptr); + on_each_cpu(flush_backlog, dev, 1); + netdev_wait_allrefs(dev); - /* It must be the very last action, - * after this 'dev' may point to freed up memory. - */ - if (dev->destructor) - dev->destructor(dev); - break; + /* paranoia */ + BUG_ON(netdev_refcnt_read(dev)); + WARN_ON(rcu_access_pointer(dev->ip_ptr)); + WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + WARN_ON(dev->dn_ptr); - default: - printk(KERN_ERR "network todo '%s' but state %d\n", - dev->name, dev->reg_state); - break; - } + if (dev->destructor) + dev->destructor(dev); + + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + + /* Free network device */ + kobject_put(&dev->dev.kobj); } +} -out: - up(&net_todo_run_mutex); +/* Convert net_device_stats to rtnl_link_stats64. They have the same + * fields in the same order, with only the type differing. 
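
netdev_run_todo() above snapshots the todo list with list_replace_init() so that new unregister requests can keep queueing while the detached batch is processed outside the lock. With a singly linked list the same O(1) detach looks like this (snapshot_todo is an invented name):

#include <stdio.h>
#include <stddef.h>

struct node { int id; struct node *next; };

static struct node *todo_list;

/* detach the whole list in O(1); new work can queue meanwhile */
static struct node *snapshot_todo(void)
{
    struct node *snap = todo_list;

    todo_list = NULL;
    return snap;
}

int main(void)
{
    struct node b = { 2, NULL }, a = { 1, &b };
    struct node *p;

    todo_list = &a;
    for (p = snapshot_todo(); p; p = p->next)
        printf("process todo %d\n", p->id);
    return 0;
}
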
+ */ +void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, + const struct net_device_stats *netdev_stats) +{ +#if BITS_PER_LONG == 64 + BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + memcpy(stats64, netdev_stats, sizeof(*stats64)); +#else + size_t i, n = sizeof(*stats64) / sizeof(u64); + const unsigned long *src = (const unsigned long *)netdev_stats; + u64 *dst = (u64 *)stats64; + + BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != + sizeof(*stats64) / sizeof(u64)); + for (i = 0; i < n; i++) + dst[i] = src[i]; +#endif } +EXPORT_SYMBOL(netdev_stats_to_stats64); /** - * alloc_netdev - allocate network device + * dev_get_stats - get network device statistics + * @dev: device to get statistics from + * @storage: place to store stats + * + * Get network statistics from device. Return @storage. + * The device driver may provide its own method by setting + * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; + * otherwise the internal statistics structure is used. + */ +struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_stats64) { + memset(storage, 0, sizeof(*storage)); + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { + netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else { + netdev_stats_to_stats64(storage, &dev->stats); + } + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += atomic_long_read(&dev->tx_dropped); + return storage; +} +EXPORT_SYMBOL(dev_get_stats); + +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + +static const struct ethtool_ops default_ethtool_ops; + +void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops) +{ + if (dev->ethtool_ops == &default_ethtool_ops) + dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); + +void netdev_freemem(struct net_device *dev) +{ + char *addr = (char *)dev - dev->padded; + + kvfree(addr); +} + +/** + * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. 
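
netdev_stats_to_stats64() relies on the two structures having identical fields in identical order, so the 32-bit path can walk them as flat arrays and widen each word, while the 64-bit path collapses to memcpy(). A runnable miniature with two-field structs (the kernel enforces the layout match with BUILD_BUG_ON; assert() stands in here):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

struct stats_long { unsigned long rx, tx; };   /* legacy layout */
struct stats_64   { uint64_t rx, tx; };        /* wide layout */

/* field-for-field widening copy; layouts must match in order */
static void stats_to_64(struct stats_64 *dst, const struct stats_long *src)
{
    const unsigned long *s = (const unsigned long *)src;
    uint64_t *d = (uint64_t *)dst;
    size_t i, n = sizeof(*dst) / sizeof(uint64_t);

    assert(sizeof(*src) / sizeof(unsigned long) == n);
    for (i = 0; i < n; i++)
        d[i] = s[i];
}

int main(void)
{
    struct stats_long in = { 123, 456 };
    struct stats_64 out;

    stats_to_64(&out, &in);
    printf("rx=%llu tx=%llu\n",
           (unsigned long long)out.rx, (unsigned long long)out.tx);
    return 0;
}
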
*/ -struct net_device *alloc_netdev(int sizeof_priv, const char *name, - void (*setup)(struct net_device *)) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) { - void *p; struct net_device *dev; - int alloc_size; + size_t alloc_size; + struct net_device *p; - /* ensure 32-byte alignment of both the device and private area */ - alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; - alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; + BUG_ON(strlen(name) >= sizeof(dev->name)); - p = kmalloc(alloc_size, GFP_KERNEL); - if (!p) { - printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); + if (txqs < 1) { + pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } - memset(p, 0, alloc_size); - dev = (struct net_device *) - (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); +#ifdef CONFIG_SYSFS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); + return NULL; + } +#endif + + alloc_size = sizeof(struct net_device); + if (sizeof_priv) { + /* ensure 32-byte alignment of private area */ + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); + alloc_size += sizeof_priv; + } + /* ensure 32-byte alignment of whole construct */ + alloc_size += NETDEV_ALIGN - 1; + + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!p) + p = vzalloc(alloc_size); + if (!p) + return NULL; + + dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (sizeof_priv) - dev->priv = netdev_priv(dev); + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) + goto free_dev; + + if (dev_addr_init(dev)) + goto free_pcpu; + + dev_mc_init(dev); + dev_uc_init(dev); + dev_net_set(dev, &init_net); + + dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; + + INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); + INIT_LIST_HEAD(&dev->link_watch_list); + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->all_adj_list.upper); + INIT_LIST_HEAD(&dev->all_adj_list.lower); + dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); + + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_all; + +#ifdef CONFIG_SYSFS + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_all; +#endif + strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; + if (!dev->ethtool_ops) + dev->ethtool_ops = &default_ethtool_ops; return dev; + +free_all: + free_netdev(dev); + return NULL; + +free_pcpu: + free_percpu(dev->pcpu_refcnt); +free_dev: + netdev_freemem(dev); + return NULL; } -EXPORT_SYMBOL(alloc_netdev); +EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device * @dev: device * - * This function does the last stage of destroying an allocated device - * interface. The reference to the device object is released. + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. * If this is the last reference then it will be freed. 
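
The dev->padded bookkeeping in alloc_netdev_mqs() above is a classic manual-alignment trick: over-allocate by NETDEV_ALIGN - 1 bytes, round the pointer up, and record the offset so netdev_freemem() can recover the original allocation. The same pattern in standalone C (obj_alloc/obj_free are illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define ALIGN_TO 32

struct obj {
    unsigned int padded;   /* bytes between raw block and aligned obj */
    char payload[40];
};

static struct obj *obj_alloc(void)
{
    char *raw = calloc(1, sizeof(struct obj) + ALIGN_TO - 1);
    struct obj *o;

    if (!raw)
        return NULL;
    /* round the pointer up to the alignment boundary */
    o = (struct obj *)(((uintptr_t)raw + ALIGN_TO - 1) &
                       ~(uintptr_t)(ALIGN_TO - 1));
    o->padded = (unsigned int)((char *)o - raw);   /* remember for free */
    return o;
}

static void obj_free(struct obj *o)
{
    free((char *)o - o->padded);   /* walk back to the raw block */
}

int main(void)
{
    struct obj *o = obj_alloc();

    if (!o)
        return 1;
    printf("obj at %p, padded by %u\n", (void *)o, o->padded);
    obj_free(o);
    return 0;
}
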
*/ void free_netdev(struct net_device *dev) { + struct napi_struct *p, *n; + + release_net(dev_net(dev)); + + netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS - /* Compatiablity with error handling in drivers */ + kfree(dev->_rx); +#endif + + kfree(rcu_dereference_protected(dev->ingress_queue, 1)); + + /* Flush device addresses */ + dev_addr_flush(dev); + + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + netif_napi_del(p); + + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + netdev_freemem(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); dev->reg_state = NETREG_RELEASED; - /* will free via class release */ - class_device_put(&dev->class_dev); -#else - kfree((char *)dev - dev->padded); -#endif + /* will free via device release */ + put_device(&dev->dev); } - -/* Synchronize with packet receive processing. */ -void synchronize_net(void) +EXPORT_SYMBOL(free_netdev); + +/** + * synchronize_net - Synchronize with packet receive processing + * + * Wait for packets currently being received to be done. + * Does not block later packets from starting. + */ +void synchronize_net(void) { might_sleep(); - synchronize_rcu(); + if (rtnl_is_locked()) + synchronize_rcu_expedited(); + else + synchronize_rcu(); } +EXPORT_SYMBOL(synchronize_net); /** - * unregister_netdevice - remove device from the kernel + * unregister_netdevice_queue - remove device from the kernel * @dev: device + * @head: list * * This function shuts down a device interface and removes it - * from the kernel tables. On success 0 is returned, on a failure - * a negative errno code is returned. + * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ -int unregister_netdevice(struct net_device *dev) +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { - struct net_device *d, **dp; - - BUG_ON(dev_boot_phase); ASSERT_RTNL(); - /* Some devices call without registering for initialization unwind. */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " - "was registered\n", dev->name, dev); - return -ENODEV; + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); } +} +EXPORT_SYMBOL(unregister_netdevice_queue); - BUG_ON(dev->reg_state != NETREG_REGISTERED); +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; - /* If device is running, close it first. */ - if (dev->flags & IFF_UP) - dev_close(dev); - - /* And unlink it from device chain. 
*/ - for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { - if (d == dev) { - write_lock_bh(&dev_base_lock); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); - if (dev_tail == &dev->next) - dev_tail = dp; - *dp = d->next; - write_unlock_bh(&dev_base_lock); - break; - } + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + list_del(head); } - if (!d) { - printk(KERN_ERR "unregister net_device: '%s' not found\n", - dev->name); - return -ENODEV; +} +EXPORT_SYMBOL(unregister_netdevice_many); + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. + */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(unregister_netdev); + +/** + * dev_change_net_namespace - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + */ + +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + int err; + + ASSERT_RTNL(); + + /* Don't allow namespace local devices to be moved. */ + err = -EINVAL; + if (dev->features & NETIF_F_NETNS_LOCAL) + goto out; + + /* Ensure the device has been registrered */ + if (dev->reg_state != NETREG_REGISTERED) + goto out; + + /* Get out if there is nothing todo */ + err = 0; + if (net_eq(dev_net(dev), net)) + goto out; + + /* Pick the destination device name, and ensure + * we can use it in the destination network namespace. + */ + err = -EEXIST; + if (__dev_get_by_name(net, dev->name)) { + /* We get here if we can't use the current device name */ + if (!pat) + goto out; + if (dev_get_valid_name(net, dev, pat) < 0) + goto out; } - dev->reg_state = NETREG_UNREGISTERING; + /* + * And now a mini version of register_netdevice unregister_netdevice. + */ + + /* If device is running close it first. */ + dev_close(dev); + + /* And unlink it from device chain */ + err = -ENODEV; + unlist_netdevice(dev); synchronize_net(); /* Shutdown queueing discipline. */ dev_shutdown(dev); - /* Notify protocols, that we are about to destroy this device. They should clean all the things. + + Note that dev->reg_state stays at NETREG_REGISTERED. + This is wanted because this way 8021q and macvlan know + the device is just moving and can keep their slaves up. */ - notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); - + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + /* - * Flush the multicast chain + * Flush the unicast and multicast chains */ - dev_mc_discard(dev); + dev_uc_flush(dev); + dev_mc_flush(dev); - if (dev->uninit) - dev->uninit(dev); + /* Send a netdev-removed uevent to the old namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); - /* Notifier chain MUST detach us from master device. 
*/ - BUG_TRAP(!dev->master); + /* Actually switch the network namespace */ + dev_net_set(dev, net); - free_divert_blk(dev); + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) { + int iflink = (dev->iflink == dev->ifindex); + dev->ifindex = dev_new_index(net); + if (iflink) + dev->iflink = dev->ifindex; + } - /* Finish processing unregister after unlock */ - net_set_todo(dev); + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); - synchronize_net(); + /* Fixup kobjects */ + err = device_rename(&dev->dev, dev->name); + WARN_ON(err); - dev_put(dev); - return 0; -} + /* Add the device back in the hashes */ + list_netdevice(dev); -/** - * unregister_netdev - remove device from the kernel - * @dev: device - * - * This function shuts down a device interface and removes it - * from the kernel tables. On success 0 is returned, on a failure - * a negative errno code is returned. - * - * This is just a wrapper for unregister_netdevice that takes - * the rtnl semaphore. In general you want to use this and not - * unregister_netdevice. - */ -void unregister_netdev(struct net_device *dev) -{ - rtnl_lock(); - unregister_netdevice(dev); - rtnl_unlock(); -} + /* Notify protocols, that a new device appeared. */ + call_netdevice_notifiers(NETDEV_REGISTER, dev); -EXPORT_SYMBOL(unregister_netdev); + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + + synchronize_net(); + err = 0; +out: + return err; +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); -#ifdef CONFIG_HOTPLUG_CPU static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) { struct sk_buff **list_skb; - struct net_device **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; - if (action != CPU_DEAD) + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; local_irq_disable(); @@ -3166,26 +6823,301 @@ static int dev_cpu_callback(struct notifier_block *nfb, *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; - /* Find end of our output_queue. */ - list_net = &sd->output_queue; - while (*list_net) - list_net = &(*list_net)->next_sched; /* Append output queue from offline CPU. */ - *list_net = oldsd->output_queue; - oldsd->output_queue = NULL; + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } + /* Append NAPI poll list from offline CPU. 
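
The hotplug path above splices the dead CPU's queue through output_queue_tailp: keeping a pointer to the terminating next pointer makes appending an entire queue O(1). A user-space reduction of that tail-pointer splice:

#include <stdio.h>
#include <stddef.h>

struct pkt { int id; struct pkt *next; };

struct queue {
    struct pkt *head;
    struct pkt **tailp;   /* points at the terminating next pointer */
};

static void queue_init(struct queue *q)
{
    q->head = NULL;
    q->tailp = &q->head;
}

/* move everything from the dead queue in O(1), as the hotplug path does */
static void queue_splice(struct queue *dst, struct queue *src)
{
    if (!src->head)
        return;
    *dst->tailp = src->head;
    dst->tailp = src->tailp;
    queue_init(src);
}

int main(void)
{
    struct pkt a = { 1, NULL }, b = { 2, NULL };
    struct queue live, dead;
    struct pkt *p;

    queue_init(&live);
    queue_init(&dead);
    *live.tailp = &a; live.tailp = &a.next;
    *dead.tailp = &b; dead.tailp = &b.next;

    queue_splice(&live, &dead);
    for (p = live.head; p; p = p->next)
        printf("pkt %d\n", p->id);
    return 0;
}
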
*/ + if (!list_empty(&oldsd->poll_list)) { + list_splice_init(&oldsd->poll_list, &sd->poll_list); + raise_softirq_irqoff(NET_RX_SOFTIRQ); + } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) - netif_rx(skb); + while ((skb = __skb_dequeue(&oldsd->process_queue))) { + netif_rx_internal(skb); + input_queue_head_incr(oldsd); + } + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx_internal(skb); + input_queue_head_incr(oldsd); + } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ +/** + * netdev_increment_features - increment feature set by one + * @all: current feature set + * @one: new feature set + * @mask: mask feature set + * + * Computes a new feature set after adding a device with feature set + * @one to the master device with current feature set @all. Will not + * enable anything that is off in @mask. Returns the new feature set. + */ +netdev_features_t netdev_increment_features(netdev_features_t all, + netdev_features_t one, netdev_features_t mask) +{ + if (mask & NETIF_F_GEN_CSUM) + mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_VLAN_CHALLENGED; + + all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all &= one | ~NETIF_F_ALL_FOR_ALL; + + /* If one device supports hw checksumming, set for all. */ + if (all & NETIF_F_GEN_CSUM) + all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + + return all; +} +EXPORT_SYMBOL(netdev_increment_features); + +static struct hlist_head * __net_init netdev_create_hash(void) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +/* Initialize per network namespace state */ +static int __net_init netdev_init(struct net *net) +{ + if (net != &init_net) + INIT_LIST_HEAD(&net->dev_base_head); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) + goto err_name; + + net->dev_index_head = netdev_create_hash(); + if (net->dev_index_head == NULL) + goto err_idx; + + return 0; + +err_idx: + kfree(net->dev_name_head); +err_name: + return -ENOMEM; +} + +/** + * netdev_drivername - network driver for the device + * @dev: network device + * + * Determine network driver for device. + */ +const char *netdev_drivername(const struct net_device *dev) +{ + const struct device_driver *driver; + const struct device *parent; + const char *empty = ""; + + parent = dev->dev.parent; + if (!parent) + return empty; + + driver = parent->driver; + if (driver && driver->name) + return driver->name; + return empty; +} + +static int __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) +{ + int r; + + if (dev && dev->dev.parent) { + r = dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), vaf); + } else if (dev) { + r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + } else { + r = printk("%s(NULL net_device): %pV", level, vaf); + } + + return r; +} + +int netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) 
+{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + r = __netdev_printk(level, dev, &vaf); + + va_end(args); + + return r; +} +EXPORT_SYMBOL(netdev_printk); + +#define define_netdev_printk_level(func, level) \ +int func(const struct net_device *dev, const char *fmt, ...) \ +{ \ + int r; \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __netdev_printk(level, dev, &vaf); \ + \ + va_end(args); \ + \ + return r; \ +} \ +EXPORT_SYMBOL(func); + +define_netdev_printk_level(netdev_emerg, KERN_EMERG); +define_netdev_printk_level(netdev_alert, KERN_ALERT); +define_netdev_printk_level(netdev_crit, KERN_CRIT); +define_netdev_printk_level(netdev_err, KERN_ERR); +define_netdev_printk_level(netdev_warn, KERN_WARNING); +define_netdev_printk_level(netdev_notice, KERN_NOTICE); +define_netdev_printk_level(netdev_info, KERN_INFO); + +static void __net_exit netdev_exit(struct net *net) +{ + kfree(net->dev_name_head); + kfree(net->dev_index_head); +} + +static struct pernet_operations __net_initdata netdev_net_ops = { + .init = netdev_init, + .exit = netdev_exit, +}; + +static void __net_exit default_device_exit(struct net *net) +{ + struct net_device *dev, *aux; + /* + * Push all migratable network devices back to the + * initial network namespace + */ + rtnl_lock(); + for_each_netdev_safe(net, dev, aux) { + int err; + char fb_name[IFNAMSIZ]; + + /* Ignore unmoveable devices (i.e. loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; + + /* Push remaining network devices to init_net */ + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + pr_emerg("%s: failed to move %s to init_net: %d\n", + __func__, dev->name, err); + BUG(); + } + } + rtnl_unlock(); +} + +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. + */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed + * wait here for all pending unregistrations to complete, + * before unregistring the loopback device and allowing the + * network namespace be freed. + * + * The netdev todo list containing all network devices + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. 
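
define_netdev_printk_level() stamps out one thin vararg wrapper per log level around a single formatting worker. The same macro shape in portable C, with vprintf standing in for the kernel's %pV/va_format plumbing:

#include <stdio.h>
#include <stdarg.h>

/* shared worker: the one place that actually formats */
static int my_vprintk(const char *level, const char *fmt, va_list ap)
{
    int r = printf("%s: ", level);

    return r + vprintf(fmt, ap);
}

/* stamp out one thin wrapper per level */
#define DEFINE_LEVEL(func, level)                  \
static int func(const char *fmt, ...)              \
{                                                  \
    va_list ap;                                    \
    int r;                                         \
                                                   \
    va_start(ap, fmt);                             \
    r = my_vprintk(level, fmt, ap);                \
    va_end(ap);                                    \
    return r;                                      \
}

DEFINE_LEVEL(log_err,  "ERR")
DEFINE_LEVEL(log_info, "INFO")

int main(void)
{
    log_info("device %s up\n", "eth0");
    log_err("device %s failed (%d)\n", "eth1", -5);
    return 0;
}
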
+ */ + rtnl_lock_unregistering(net_list); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); +} + +static struct pernet_operations __net_initdata default_device_ops = { + .exit = default_device_exit, + .exit_batch = default_device_exit_batch, +}; + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -3203,100 +7135,67 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); - net_random_init(); - if (dev_proc_init()) goto out; - if (netdev_sysfs_init()) + if (netdev_kobject_init()) goto out; INIT_LIST_HEAD(&ptype_all); - for (i = 0; i < 16; i++) + for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); - for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) - INIT_HLIST_HEAD(&dev_name_head[i]); + INIT_LIST_HEAD(&offload_base); - for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) - INIT_HLIST_HEAD(&dev_index_head[i]); + if (register_pernet_subsys(&netdev_net_ops)) + goto out; /* * Initialise the packet receive queues. */ - for (i = 0; i < NR_CPUS; i++) { - struct softnet_data *queue; + for_each_possible_cpu(i) { + struct softnet_data *sd = &per_cpu(softnet_data, i); + + skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + INIT_LIST_HEAD(&sd->poll_list); + sd->output_queue_tailp = &sd->output_queue; +#ifdef CONFIG_RPS + sd->csd.func = rps_trigger_softirq; + sd->csd.info = sd; + sd->cpu = i; +#endif - queue = &per_cpu(softnet_data, i); - skb_queue_head_init(&queue->input_pkt_queue); - queue->completion_queue = NULL; - INIT_LIST_HEAD(&queue->poll_list); - set_bit(__LINK_STATE_START, &queue->backlog_dev.state); - queue->backlog_dev.weight = weight_p; - queue->backlog_dev.poll = process_backlog; - atomic_set(&queue->backlog_dev.refcnt, 1); + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; } dev_boot_phase = 0; - open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); - open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. 
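
rtnl_lock_unregistering() above is an open-coded wait loop: sleep on netdev_unregistering_wq, retake the lock, re-check every namespace's dev_unreg_count, and return holding the lock only once nothing is unregistering. The nearest portable analogue uses a condition variable; this is a loose sketch of the shape, not the kernel's waitqueue API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int unreg_count = 2;

/* returns holding the lock once nothing is unregistering */
static void lock_when_quiescent(void)
{
    pthread_mutex_lock(&lock);
    while (unreg_count > 0)
        pthread_cond_wait(&cond, &lock);
}

static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (unreg_count > 0) {
        unreg_count--;              /* one unregistration completes */
        pthread_cond_broadcast(&cond);
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    lock_when_quiescent();
    puts("quiescent, holding lock");
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    return 0;
}
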
+ */ + if (register_pernet_device(&loopback_net_ops)) + goto out; + + if (register_pernet_device(&default_device_ops)) + goto out; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); dst_init(); - dev_mcast_init(); rc = 0; out: return rc; } subsys_initcall(net_dev_init); - -EXPORT_SYMBOL(__dev_get_by_index); -EXPORT_SYMBOL(__dev_get_by_name); -EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(__skb_linearize); -EXPORT_SYMBOL(dev_add_pack); -EXPORT_SYMBOL(dev_alloc_name); -EXPORT_SYMBOL(dev_close); -EXPORT_SYMBOL(dev_get_by_flags); -EXPORT_SYMBOL(dev_get_by_index); -EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); -EXPORT_SYMBOL(dev_open); -EXPORT_SYMBOL(dev_queue_xmit); -EXPORT_SYMBOL(dev_remove_pack); -EXPORT_SYMBOL(dev_set_allmulti); -EXPORT_SYMBOL(dev_set_promiscuity); -EXPORT_SYMBOL(dev_change_flags); -EXPORT_SYMBOL(dev_set_mtu); -EXPORT_SYMBOL(dev_set_mac_address); -EXPORT_SYMBOL(free_netdev); -EXPORT_SYMBOL(netdev_boot_setup_check); -EXPORT_SYMBOL(netdev_set_master); -EXPORT_SYMBOL(netdev_state_change); -EXPORT_SYMBOL(netif_receive_skb); -EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(register_gifconf); -EXPORT_SYMBOL(register_netdevice); -EXPORT_SYMBOL(register_netdevice_notifier); -EXPORT_SYMBOL(skb_checksum_help); -EXPORT_SYMBOL(synchronize_net); -EXPORT_SYMBOL(unregister_netdevice); -EXPORT_SYMBOL(unregister_netdevice_notifier); -EXPORT_SYMBOL(net_enable_timestamp); -EXPORT_SYMBOL(net_disable_timestamp); -EXPORT_SYMBOL(dev_get_flags); - -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) -EXPORT_SYMBOL(br_handle_frame_hook); -EXPORT_SYMBOL(br_fdb_get_hook); -EXPORT_SYMBOL(br_fdb_put_hook); -#endif - -#ifdef CONFIG_KMOD -EXPORT_SYMBOL(dev_load); -#endif - -EXPORT_PER_CPU_SYMBOL(softnet_data); |
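
netdev_create_hash() earlier in this hunk is the allocation pattern net_dev_init() depends on per namespace: one array of NETDEV_HASHENTRIES bucket heads, each initialised empty before any lookup can run. A freestanding version:

#include <stdio.h>
#include <stdlib.h>

#define HASHBITS    8
#define HASHENTRIES (1u << HASHBITS)

struct hnode { struct hnode *next; };

/* allocate one bucket array with every head initialised empty */
static struct hnode **create_hash(void)
{
    struct hnode **hash = malloc(sizeof(*hash) * HASHENTRIES);
    unsigned int i;

    if (hash)
        for (i = 0; i < HASHENTRIES; i++)
            hash[i] = NULL;
    return hash;
}

int main(void)
{
    struct hnode **name_hash = create_hash();

    if (!name_hash)
        return 1;
    printf("allocated %u buckets\n", HASHENTRIES);
    free(name_hash);
    return 0;
}
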