aboutsummaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--net/core/dev.c1923
1 files changed, 1429 insertions, 494 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index a06a7a58dd1..367a586d0c8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -129,6 +129,9 @@
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/if_macvlan.h>
#include "net-sysfs.h"
@@ -144,6 +147,11 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
+static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info);
+
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
* semaphore.
@@ -166,7 +174,13 @@ static struct list_head offload_base __read_mostly;
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
-seqcount_t devnet_rename_seq;
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
+
+static unsigned int napi_gen_id;
+static DEFINE_HASHTABLE(napi_hash, 8);
+
+static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
@@ -200,7 +214,7 @@ static inline void rps_unlock(struct softnet_data *sd)
}
/* Device list insertion */
-static int list_netdevice(struct net_device *dev)
+static void list_netdevice(struct net_device *dev)
{
struct net *net = dev_net(dev);
@@ -214,8 +228,6 @@ static int list_netdevice(struct net_device *dev)
write_unlock_bh(&dev_base_lock);
dev_base_seq_inc(net);
-
- return 0;
}
/* Device list removal
@@ -473,7 +485,7 @@ EXPORT_SYMBOL(dev_add_offload);
* and must not be freed until after all the CPU's have gone
* through a quiescent state.
*/
-void __dev_remove_offload(struct packet_offload *po)
+static void __dev_remove_offload(struct packet_offload *po)
{
struct list_head *head = &offload_base;
struct packet_offload *po1;
@@ -491,7 +503,6 @@ void __dev_remove_offload(struct packet_offload *po)
out:
spin_unlock(&offload_lock);
}
-EXPORT_SYMBOL(__dev_remove_offload);
/**
* dev_remove_offload - remove packet offload handler
@@ -794,6 +805,40 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
EXPORT_SYMBOL(dev_get_by_index);
/**
+ * netdev_get_name - get a netdevice name, knowing its ifindex.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @ifindex: the ifindex of the interface to get the name from.
+ *
+ * The use of raw_seqcount_begin() and cond_resched() before
+ * retrying is required as we want to give the writers a chance
+ * to complete when CONFIG_PREEMPT is not set.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+ struct net_device *dev;
+ unsigned int seq;
+
+retry:
+ seq = raw_seqcount_begin(&devnet_rename_seq);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ strcpy(name, dev->name);
+ rcu_read_unlock();
+ if (read_seqcount_retry(&devnet_rename_seq, seq)) {
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
* @type: media type of device
@@ -1077,6 +1122,8 @@ rollback:
write_seqcount_end(&devnet_rename_seq);
+ netdev_adjacent_rename_links(dev, oldname);
+
write_lock_bh(&dev_base_lock);
hlist_del_rcu(&dev->name_hlist);
write_unlock_bh(&dev_base_lock);
@@ -1096,6 +1143,7 @@ rollback:
err = ret;
write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
+ memcpy(oldname, newname, IFNAMSIZ);
goto rollback;
} else {
pr_err("%s: name change rollback failed: %d\n",
@@ -1162,8 +1210,12 @@ EXPORT_SYMBOL(netdev_features_change);
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
- call_netdevice_notifiers(NETDEV_CHANGE, dev);
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = 0;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
EXPORT_SYMBOL(netdev_state_change);
@@ -1200,9 +1252,7 @@ static int __dev_open(struct net_device *dev)
* If we don't do this there is a chance ndo_poll_controller
* or ndo_poll may be running while we open the device
*/
- ret = netpoll_rx_disable(dev);
- if (ret)
- return ret;
+ netpoll_poll_disable(dev);
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
@@ -1217,7 +1267,7 @@ static int __dev_open(struct net_device *dev)
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
- netpoll_rx_enable(dev);
+ netpoll_poll_enable(dev);
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
@@ -1255,7 +1305,7 @@ int dev_open(struct net_device *dev)
if (ret < 0)
return ret;
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_UP, dev);
return ret;
@@ -1269,7 +1319,10 @@ static int __dev_close_many(struct list_head *head)
ASSERT_RTNL();
might_sleep();
- list_for_each_entry(dev, head, unreg_list) {
+ list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
clear_bit(__LINK_STATE_START, &dev->state);
@@ -1280,12 +1333,12 @@ static int __dev_close_many(struct list_head *head)
* dev->stop() will invoke napi_disable() on all of it's
* napi_struct instances on this device.
*/
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ smp_mb__after_atomic(); /* Commit netif_running(). */
}
dev_deactivate_many(head);
- list_for_each_entry(dev, head, unreg_list) {
+ list_for_each_entry(dev, head, close_list) {
const struct net_device_ops *ops = dev->netdev_ops;
/*
@@ -1300,6 +1353,7 @@ static int __dev_close_many(struct list_head *head)
dev->flags &= ~IFF_UP;
net_dmaengine_put();
+ netpoll_poll_enable(dev);
}
return 0;
@@ -1310,37 +1364,30 @@ static int __dev_close(struct net_device *dev)
int retval;
LIST_HEAD(single);
- /* Temporarily disable netpoll until the interface is down */
- retval = netpoll_rx_disable(dev);
- if (retval)
- return retval;
-
- list_add(&dev->unreg_list, &single);
+ list_add(&dev->close_list, &single);
retval = __dev_close_many(&single);
list_del(&single);
- netpoll_rx_enable(dev);
return retval;
}
static int dev_close_many(struct list_head *head)
{
struct net_device *dev, *tmp;
- LIST_HEAD(tmp_list);
- list_for_each_entry_safe(dev, tmp, head, unreg_list)
+ /* Remove the devices that don't need to be closed */
+ list_for_each_entry_safe(dev, tmp, head, close_list)
if (!(dev->flags & IFF_UP))
- list_move(&dev->unreg_list, &tmp_list);
+ list_del_init(&dev->close_list);
__dev_close_many(head);
- list_for_each_entry(dev, head, unreg_list) {
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+ list_for_each_entry_safe(dev, tmp, head, close_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_DOWN, dev);
+ list_del_init(&dev->close_list);
}
- /* rollback_registered_many needs the complete original list */
- list_splice(&tmp_list, head);
return 0;
}
@@ -1355,22 +1402,14 @@ static int dev_close_many(struct list_head *head)
*/
int dev_close(struct net_device *dev)
{
- int ret = 0;
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
- /* Block netpoll rx while the interface is going down */
- ret = netpoll_rx_disable(dev);
- if (ret)
- return ret;
-
- list_add(&dev->unreg_list, &single);
+ list_add(&dev->close_list, &single);
dev_close_many(&single);
list_del(&single);
-
- netpoll_rx_enable(dev);
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(dev_close);
@@ -1392,6 +1431,10 @@ void dev_disable_lro(struct net_device *dev)
if (is_vlan_dev(dev))
dev = vlan_dev_real_dev(dev);
+ /* the same for macvlan devices */
+ if (netif_is_macvlan(dev))
+ dev = macvlan_dev_real_dev(dev);
+
dev->wanted_features &= ~NETIF_F_LRO;
netdev_update_features(dev);
@@ -1400,6 +1443,14 @@ void dev_disable_lro(struct net_device *dev)
}
EXPORT_SYMBOL(dev_disable_lro);
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+ struct net_device *dev)
+{
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return nb->notifier_call(nb, val, &info);
+}
static int dev_boot_phase = 1;
@@ -1432,7 +1483,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
goto unlock;
for_each_net(net) {
for_each_netdev(net, dev) {
- err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
err = notifier_to_errno(err);
if (err)
goto rollback;
@@ -1440,7 +1491,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
if (!(dev->flags & IFF_UP))
continue;
- nb->notifier_call(nb, NETDEV_UP, dev);
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
}
}
@@ -1456,10 +1507,11 @@ rollback:
goto outroll;
if (dev->flags & IFF_UP) {
- nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
- nb->notifier_call(nb, NETDEV_DOWN, dev);
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
}
- nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
}
@@ -1497,10 +1549,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
for_each_net(net) {
for_each_netdev(net, dev) {
if (dev->flags & IFF_UP) {
- nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
- nb->notifier_call(nb, NETDEV_DOWN, dev);
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
}
- nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
}
unlock:
@@ -1510,6 +1563,25 @@ unlock:
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
+ * call_netdevice_notifiers_info - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info)
+{
+ ASSERT_RTNL();
+ netdev_notifier_info_init(info, dev);
+ return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+
+/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
@@ -1520,8 +1592,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- ASSERT_RTNL();
- return raw_notifier_call_chain(&netdev_chain, val, dev);
+ struct netdev_notifier_info info;
+
+ return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
@@ -1545,7 +1618,6 @@ void net_enable_timestamp(void)
return;
}
#endif
- WARN_ON(in_interrupt());
static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
@@ -1575,8 +1647,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
__net_timestamp(SKB); \
} \
-static inline bool is_skb_forwardable(struct net_device *dev,
- struct sk_buff *skb)
+bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
unsigned int len;
@@ -1595,6 +1666,30 @@ static inline bool is_skb_forwardable(struct net_device *dev,
return false;
}
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ }
+
+ if (unlikely(!is_skb_forwardable(dev, skb))) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_scrub_packet(skb, true);
+ skb->protocol = eth_type_trans(skb, dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
* dev_forward_skb - loopback an skb to another netif
@@ -1616,32 +1711,7 @@ static inline bool is_skb_forwardable(struct net_device *dev,
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- }
-
- skb_orphan(skb);
- nf_reset(skb);
-
- if (unlikely(!is_skb_forwardable(dev, skb))) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- skb->skb_iif = 0;
- skb->dev = dev;
- skb_dst_drop(skb);
- skb->tstamp.tv64 = 0;
- skb->pkt_type = PACKET_HOST;
- skb->protocol = eth_type_trans(skb, dev);
- skb->mark = 0;
- secpath_reset(skb);
- nf_reset(skb);
- return netif_rx(skb);
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -1705,7 +1775,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb2);
if (skb_network_header(skb2) < skb2->data ||
- skb2->network_header > skb2->tail) {
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
ntohs(skb2->protocol),
dev->name);
@@ -1859,7 +1929,8 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return new_map;
}
-int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
struct xps_map *map, *new_map;
@@ -2020,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -2086,30 +2157,42 @@ void __netif_schedule(struct Qdisc *q)
}
EXPORT_SYMBOL(__netif_schedule);
-void dev_kfree_skb_irq(struct sk_buff *skb)
+struct dev_kfree_skb_cb {
+ enum skb_free_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+ return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
- if (atomic_dec_and_test(&skb->users)) {
- struct softnet_data *sd;
- unsigned long flags;
+ unsigned long flags;
- local_irq_save(flags);
- sd = &__get_cpu_var(softnet_data);
- skb->next = sd->completion_queue;
- sd->completion_queue = skb;
- raise_softirq_irqoff(NET_TX_SOFTIRQ);
- local_irq_restore(flags);
+ if (likely(atomic_read(&skb->users) == 1)) {
+ smp_rmb();
+ atomic_set(&skb->users, 0);
+ } else if (likely(!atomic_dec_and_test(&skb->users))) {
+ return;
}
+ get_kfree_skb_cb(skb)->reason = reason;
+ local_irq_save(flags);
+ skb->next = __this_cpu_read(softnet_data.completion_queue);
+ __this_cpu_write(softnet_data.completion_queue, skb);
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL(dev_kfree_skb_irq);
+EXPORT_SYMBOL(__dev_kfree_skb_irq);
-void dev_kfree_skb_any(struct sk_buff *skb)
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
if (in_irq() || irqs_disabled())
- dev_kfree_skb_irq(skb);
+ __dev_kfree_skb_irq(skb, reason);
else
dev_kfree_skb(skb);
}
-EXPORT_SYMBOL(dev_kfree_skb_any);
+EXPORT_SYMBOL(__dev_kfree_skb_any);
/**
@@ -2149,6 +2232,9 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
struct net_device *dev = skb->dev;
const char *driver = "";
+ if (!net_ratelimit())
+ return;
+
if (dev && dev->dev.parent)
driver = dev_driver_string(dev->dev.parent);
@@ -2208,6 +2294,53 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
+{
+ unsigned int vlan_depth = skb->mac_len;
+ __be16 type = skb->protocol;
+
+ /* Tunnel gso handlers can set protocol to ethernet. */
+ if (type == htons(ETH_P_TEB)) {
+ struct ethhdr *eth;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+ return 0;
+
+ eth = (struct ethhdr *)skb_mac_header(skb);
+ type = eth->h_proto;
+ }
+
+ /* if skb->protocol is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+ if (vlan_depth) {
+ if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
+ return 0;
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
+ }
+ do {
+ struct vlan_hdr *vh;
+
+ if (unlikely(!pskb_may_pull(skb,
+ vlan_depth + VLAN_HLEN)))
+ return 0;
+
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (type == htons(ETH_P_8021Q) ||
+ type == htons(ETH_P_8021AD));
+ }
+
+ *depth = vlan_depth;
+
+ return type;
+}
+
/**
* skb_mac_gso_segment - mac layer segmentation handler.
* @skb: buffer to segment
@@ -2218,21 +2351,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
- __be16 type = skb->protocol;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
- while (type == htons(ETH_P_8021Q)) {
- int vlan_depth = ETH_HLEN;
- struct vlan_hdr *vh;
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
- if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
- return ERR_PTR(-EINVAL);
-
- vh = (struct vlan_hdr *)(skb->data + vlan_depth);
- type = vh->h_vlan_encapsulated_proto;
- vlan_depth += VLAN_HLEN;
- }
-
- __skb_pull(skb, skb->mac_len);
+ __skb_pull(skb, vlan_depth);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
@@ -2295,6 +2420,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
}
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
@@ -2357,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
{
struct dev_gso_cb *cb;
- do {
- struct sk_buff *nskb = skb->next;
-
- skb->next = nskb->next;
- nskb->next = NULL;
- kfree_skb(nskb);
- } while (skb->next);
+ kfree_skb_list(skb->next);
+ skb->next = NULL;
cb = DEV_GSO_CB(skb);
if (cb->destructor)
@@ -2398,24 +2520,40 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
return 0;
}
-static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#ifdef CONFIG_NET_MPLS_GSO
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
{
- return ((features & NETIF_F_GEN_CSUM) ||
- ((features & NETIF_F_V4_CSUM) &&
- protocol == htons(ETH_P_IP)) ||
- ((features & NETIF_F_V6_CSUM) &&
- protocol == htons(ETH_P_IPV6)) ||
- ((features & NETIF_F_FCOE_CRC) &&
- protocol == htons(ETH_P_FCOE)));
+ return features;
}
+#endif
static netdev_features_t harmonize_features(struct sk_buff *skb,
- __be16 protocol, netdev_features_t features)
+ netdev_features_t features)
{
+ int tmp;
+ __be16 type;
+
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
+
if (skb->ip_summed != CHECKSUM_NONE &&
- !can_checksum_protocol(features, protocol)) {
+ !can_checksum_protocol(features, type)) {
features &= ~NETIF_F_ALL_CSUM;
- features &= ~NETIF_F_SG;
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
@@ -2431,39 +2569,24 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
features &= ~NETIF_F_GSO_MASK;
- if (protocol == htons(ETH_P_8021Q)) {
+ if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
} else if (!vlan_tx_tag_present(skb)) {
- return harmonize_features(skb, protocol, features);
+ return harmonize_features(skb, features);
}
- features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
+ features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
- if (protocol != htons(ETH_P_8021Q)) {
- return harmonize_features(skb, protocol, features);
- } else {
+ if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
- NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
- return harmonize_features(skb, protocol, features);
- }
-}
-EXPORT_SYMBOL(netif_skb_features);
+ NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
-/*
- * Returns true if either:
- * 1. skb has frag_list and the device doesn't support FRAGLIST, or
- * 2. skb is fragmented and the device does not support SG.
- */
-static inline int skb_needs_linearize(struct sk_buff *skb,
- int features)
-{
- return skb_is_nonlinear(skb) &&
- ((skb_has_frag_list(skb) &&
- !(features & NETIF_F_FRAGLIST)) ||
- (skb_shinfo(skb)->nr_frags &&
- !(features & NETIF_F_SG)));
+ return harmonize_features(skb, features);
}
+EXPORT_SYMBOL(netif_skb_features);
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
@@ -2485,8 +2608,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
- !(features & NETIF_F_HW_VLAN_TX)) {
- skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
if (unlikely(!skb))
goto out;
@@ -2531,6 +2655,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
dev_queue_xmit_nit(skb, dev);
skb_len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
rc = ops->ndo_start_xmit(skb, dev);
trace_net_dev_xmit(skb, rc, dev, skb_len);
if (rc == NETDEV_TX_OK)
@@ -2545,17 +2670,11 @@ gso:
skb->next = nskb->next;
nskb->next = NULL;
- /*
- * If device doesn't need nskb->dst, release it right now while
- * its hot in this cpu cache
- */
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
- skb_dst_drop(nskb);
-
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(nskb, dev);
skb_len = nskb->len;
+ trace_net_dev_start_xmit(nskb, dev);
rc = ops->ndo_start_xmit(nskb, dev);
trace_net_dev_xmit(nskb, rc, dev, skb_len);
if (unlikely(rc != NETDEV_TX_OK)) {
@@ -2571,13 +2690,17 @@ gso:
} while (skb->next);
out_kfree_gso_skb:
- if (likely(skb->next == NULL))
+ if (likely(skb->next == NULL)) {
skb->destructor = DEV_GSO_CB(skb)->destructor;
+ consume_skb(skb);
+ return rc;
+ }
out_kfree_skb:
kfree_skb(skb);
out:
return rc;
}
+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
@@ -2590,6 +2713,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
*/
if (shinfo->gso_size) {
unsigned int hdr_len;
+ u16 gso_segs = shinfo->gso_segs;
/* mac layer + network layer */
hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
@@ -2599,7 +2723,12 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
hdr_len += tcp_hdrlen(skb);
else
hdr_len += sizeof(struct udphdr);
- qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
+
+ if (shinfo->gso_type & SKB_GSO_DODGY)
+ gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
@@ -2666,7 +2795,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return rc;
}
-#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
@@ -2703,8 +2832,9 @@ int dev_loopback_xmit(struct sk_buff *skb)
EXPORT_SYMBOL(dev_loopback_xmit);
/**
- * dev_queue_xmit - transmit a buffer
+ * __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
+ * @accel_priv: private data used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
@@ -2727,7 +2857,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
-int dev_queue_xmit(struct sk_buff *skb)
+static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -2743,7 +2873,7 @@ int dev_queue_xmit(struct sk_buff *skb)
skb_update_prio(skb);
- txq = netdev_pick_tx(dev, skb);
+ txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
#ifdef CONFIG_NET_CLS_ACT
@@ -2802,14 +2932,26 @@ recursion_alert:
rc = -ENETDOWN;
rcu_read_unlock_bh();
+ atomic_long_inc(&dev->tx_dropped);
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
+
+int dev_queue_xmit(struct sk_buff *skb)
+{
+ return __dev_queue_xmit(skb, NULL);
+}
EXPORT_SYMBOL(dev_queue_xmit);
+int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+{
+ return __dev_queue_xmit(skb, accel_priv);
+}
+EXPORT_SYMBOL(dev_queue_xmit_accel);
+
/*=======================================================================
Receiver routines
@@ -2863,7 +3005,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb->rxhash & flow_table->mask;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
@@ -2897,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u16 tcpu;
+ u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
@@ -2925,7 +3068,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
skb_reset_network_header(skb);
- if (!skb_get_rxhash(skb))
+ hash = skb_get_hash(skb);
+ if (!hash)
goto done;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
@@ -2934,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
u16 next_cpu;
struct rps_dev_flow *rflow;
- rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+ rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
- next_cpu = sock_flow_table->ents[skb->rxhash &
- sock_flow_table->mask];
+ next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
/*
* If the desired CPU (where last recvmsg was done) is
@@ -2967,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
if (map) {
- tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+ tcpu = map->cpus[((u64) hash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
@@ -3051,6 +3194,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
return 0;
}
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+ struct softnet_data *sd;
+ unsigned int old_flow, new_flow;
+
+ if (qlen < (netdev_max_backlog >> 1))
+ return false;
+
+ sd = &__get_cpu_var(softnet_data);
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl) {
+ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ old_flow = fl->history[fl->history_head];
+ fl->history[fl->history_head] = new_flow;
+
+ fl->history_head++;
+ fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+ if (likely(fl->buckets[old_flow]))
+ fl->buckets[old_flow]--;
+
+ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+ fl->count++;
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+#endif
+ return false;
+}
+
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
@@ -3060,13 +3243,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
{
struct softnet_data *sd;
unsigned long flags;
+ unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
- if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -3096,29 +3281,10 @@ enqueue:
return NET_RX_DROP;
}
-/**
- * netif_rx - post buffer to the network code
- * @skb: buffer to post
- *
- * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
- *
- * return values:
- * NET_RX_SUCCESS (no congestion)
- * NET_RX_DROP (packet was dropped)
- *
- */
-
-int netif_rx(struct sk_buff *skb)
+static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- /* if netpoll wants it, pretend we never saw it */
- if (netpoll_rx(skb))
- return NET_RX_DROP;
-
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
@@ -3147,14 +3313,38 @@ int netif_rx(struct sk_buff *skb)
}
return ret;
}
+
+/**
+ * netif_rx - post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process. It always succeeds. The buffer
+ * may be dropped during processing for congestion control or by the
+ * protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+ trace_netif_rx_entry(skb);
+
+ return netif_rx_internal(skb);
+}
EXPORT_SYMBOL(netif_rx);
int netif_rx_ni(struct sk_buff *skb)
{
int err;
+ trace_netif_rx_ni_entry(skb);
+
preempt_disable();
- err = netif_rx(skb);
+ err = netif_rx_internal(skb);
if (local_softirq_pending())
do_softirq();
preempt_enable();
@@ -3180,7 +3370,10 @@ static void net_tx_action(struct softirq_action *h)
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
- trace_kfree_skb(skb, net_tx_action);
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+ trace_consume_skb(skb);
+ else
+ trace_kfree_skb(skb, net_tx_action);
__kfree_skb(skb);
}
}
@@ -3202,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h)
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
qdisc_run(q);
@@ -3212,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h)
&q->state)) {
__netif_reschedule(q);
} else {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
}
@@ -3298,7 +3491,7 @@ out:
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
- * Register a receive hander for a device. This handler will then be
+ * Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
@@ -3315,6 +3508,7 @@ int netdev_rx_handler_register(struct net_device *dev,
if (dev->rx_handler)
return -EBUSY;
+ /* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
@@ -3326,7 +3520,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
* netdev_rx_handler_unregister - unregister receive handler
* @dev: device to unregister a handler from
*
- * Unregister a receive hander from a device.
+ * Unregister a receive handler from a device.
*
* The caller must hold the rtnl_mutex.
*/
@@ -3335,6 +3529,11 @@ void netdev_rx_handler_unregister(struct net_device *dev)
ASSERT_RTNL();
RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
+ * section has a guarantee to see a non NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
@@ -3346,10 +3545,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
switch (skb->protocol) {
- case __constant_htons(ETH_P_ARP):
- case __constant_htons(ETH_P_IP):
- case __constant_htons(ETH_P_IPV6):
- case __constant_htons(ETH_P_8021Q):
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
return true;
default:
return false;
@@ -3370,10 +3570,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
trace_netif_receive_skb(skb);
- /* if we've gotten here through NAPI, check netpoll */
- if (netpoll_receive_skb(skb))
- goto out;
-
orig_dev = skb->dev;
skb_reset_network_header(skb);
@@ -3390,7 +3586,8 @@ another_round:
__this_cpu_inc(softnet_data.processed);
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
+ if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
skb = vlan_untag(skb);
if (unlikely(!skb))
goto unlock;
@@ -3444,6 +3641,7 @@ ncls:
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
goto unlock;
case RX_HANDLER_ANOTHER:
goto another_round;
@@ -3456,8 +3654,15 @@ ncls:
}
}
- if (vlan_tx_nonzero_tag_present(skb))
- skb->pkt_type = PACKET_OTHERHOST;
+ if (unlikely(vlan_tx_tag_present(skb))) {
+ if (vlan_tx_tag_get_id(skb))
+ skb->pkt_type = PACKET_OTHERHOST;
+ /* Note: we might in the future use prio bits
+ * and set skb->priority like in vlan_do_receive()
+ * For the time being, just ignore Priority Code Point
+ */
+ skb->vlan_tci = 0;
+ }
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
@@ -3491,7 +3696,6 @@ drop:
unlock:
rcu_read_unlock();
-out:
return ret;
}
@@ -3520,22 +3724,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
return ret;
}
-/**
- * netif_receive_skb - process receive buffer from network
- * @skb: buffer to process
- *
- * netif_receive_skb() is the main receive data processing function.
- * It always succeeds. The buffer may be dropped during processing
- * for congestion control or by the protocol layers.
- *
- * This function may only be called from softirq context and interrupts
- * should be enabled.
- *
- * Return values (usually ignored):
- * NET_RX_SUCCESS: no congestion
- * NET_RX_DROP: packet was dropped
- */
-int netif_receive_skb(struct sk_buff *skb)
+static int netif_receive_skb_internal(struct sk_buff *skb)
{
net_timestamp_check(netdev_tstamp_prequeue, skb);
@@ -3561,6 +3750,28 @@ int netif_receive_skb(struct sk_buff *skb)
#endif
return __netif_receive_skb(skb);
}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+ trace_netif_receive_skb_entry(skb);
+
+ return netif_receive_skb_internal(skb);
+}
EXPORT_SYMBOL(netif_receive_skb);
/* Network device is going away, flush any packets still pending
@@ -3610,7 +3821,7 @@ static int napi_gro_complete(struct sk_buff *skb)
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
- err = ptype->callbacks.gro_complete(skb);
+ err = ptype->callbacks.gro_complete(skb, 0);
break;
}
rcu_read_unlock();
@@ -3622,7 +3833,7 @@ static int napi_gro_complete(struct sk_buff *skb)
}
out:
- return netif_receive_skb(skb);
+ return netif_receive_skb_internal(skb);
}
/* napi->gro_list contains packets ordered by age.
@@ -3658,21 +3869,66 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff *p;
unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
for (p = napi->gro_list; p; p = p->next) {
unsigned long diffs;
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
- skb_gro_mac_header(skb));
+ skb_mac_header(skb));
else if (!diffs)
diffs = memcmp(skb_mac_header(p),
- skb_gro_mac_header(skb),
+ skb_mac_header(skb),
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
- NAPI_GRO_CB(p)->flush = 0;
+ }
+}
+
+static void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
+ pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
}
}
@@ -3684,14 +3940,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
struct list_head *head = &offload_base;
int same_flow;
enum gro_result ret;
+ int grow;
- if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
+ if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
if (skb_is_gso(skb) || skb_has_frag_list(skb))
goto normal;
gro_list_prepare(napi, skb);
+ NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -3703,6 +3961,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
+ NAPI_GRO_CB(skb)->udp_mark = 0;
pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
@@ -3727,39 +3986,35 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (same_flow)
goto ok;
- if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
+ if (NAPI_GRO_CB(skb)->flush)
goto normal;
- napi->gro_count++;
+ if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
+ struct sk_buff *nskb = napi->gro_list;
+
+ /* locate the end of the list to select the 'oldest' flow */
+ while (nskb->next) {
+ pp = &nskb->next;
+ nskb = *pp;
+ }
+ *pp = NULL;
+ nskb->next = NULL;
+ napi_gro_complete(nskb);
+ } else {
+ napi->gro_count++;
+ }
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
ret = GRO_HELD;
pull:
- if (skb_headlen(skb) < skb_gro_offset(skb)) {
- int grow = skb_gro_offset(skb) - skb_headlen(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->tail += grow;
- skb->data_len -= grow;
-
- skb_shinfo(skb)->frags[0].page_offset += grow;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
-
- if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
- skb_frag_unref(skb, 0);
- memmove(skb_shinfo(skb)->frags,
- skb_shinfo(skb)->frags + 1,
- --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
- }
- }
-
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
ok:
return ret;
@@ -3768,12 +4023,39 @@ normal:
goto pull;
}
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb(skb))
+ if (netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
@@ -3796,25 +4078,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
return ret;
}
-static void skb_gro_reset_offset(struct sk_buff *skb)
-{
- const struct skb_shared_info *pinfo = skb_shinfo(skb);
- const skb_frag_t *frag0 = &pinfo->frags[0];
-
- NAPI_GRO_CB(skb)->data_offset = 0;
- NAPI_GRO_CB(skb)->frag0 = NULL;
- NAPI_GRO_CB(skb)->frag0_len = 0;
-
- if (skb->mac_header == skb->tail &&
- pinfo->nr_frags &&
- !PageHighMem(skb_frag_page(frag0))) {
- NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
- }
-}
-
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
+ trace_napi_gro_receive_entry(skb);
+
skb_gro_reset_offset(skb);
return napi_skb_finish(dev_gro_receive(napi, skb), skb);
@@ -3829,6 +4096,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->vlan_tci = 0;
skb->dev = napi->dev;
skb->skb_iif = 0;
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
napi->skb = skb;
}
@@ -3839,24 +4109,22 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
if (!skb) {
skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
- if (skb)
- napi->skb = skb;
+ napi->skb = skb;
}
return skb;
}
EXPORT_SYMBOL(napi_get_frags);
-static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
- gro_result_t ret)
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
-
- if (ret == GRO_HELD)
- skb_gro_pull(skb, -ETH_HLEN);
- else if (netif_receive_skb(skb))
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
@@ -3872,39 +4140,42 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
return ret;
}
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
- struct ethhdr *eth;
- unsigned int hlen;
- unsigned int off;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
napi->skb = NULL;
skb_reset_mac_header(skb);
skb_gro_reset_offset(skb);
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*eth);
- eth = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- eth = skb_gro_header_slow(skb, hlen, off);
+ eth = skb_gro_header_fast(skb, 0);
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
napi_reuse_skb(napi, skb);
- skb = NULL;
- goto out;
+ return NULL;
}
+ } else {
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
-
- skb_gro_pull(skb, sizeof(*eth));
+ __skb_pull(skb, hlen);
/*
* This works because the only protocols we care about don't require
- * special handling. We'll fix it up properly at the end.
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
*/
skb->protocol = eth->h_proto;
-out:
return skb;
}
@@ -3915,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
if (!skb)
return GRO_DROP;
+ trace_napi_gro_frags_entry(skb);
+
return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
/*
- * net_rps_action sends any pending IPI's for rps.
+ * net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
*/
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
@@ -3938,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
- __smp_call_function_single(remsd->cpu,
- &remsd->csd, 0);
+ smp_call_function_single_async(remsd->cpu,
+ &remsd->csd);
remsd = next;
}
} else
@@ -3963,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
#endif
napi->weight = weight_p;
local_irq_disable();
- while (work < quota) {
+ while (1) {
struct sk_buff *skb;
- unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
@@ -3979,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
rps_lock(sd);
- qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen)
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
-
- if (qlen < quota - work) {
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
- * and NAPI_STATE_SCHED is the only possible flag set on backlog.
- * we can use a plain write instead of clear_bit(),
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&napi->poll_list);
napi->state = 0;
+ rps_unlock(sd);
- quota = work + qlen;
+ break;
}
+
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
rps_unlock(sd);
}
local_irq_enable();
@@ -4026,7 +4298,7 @@ void __napi_complete(struct napi_struct *n)
BUG_ON(n->gro_list);
list_del(&n->poll_list);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
@@ -4049,6 +4321,58 @@ void napi_complete(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_complete);
+/* must be called under rcu_read_lock(), as we dont take a reference */
+struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(napi_by_id);
+
+void napi_hash_add(struct napi_struct *napi)
+{
+ if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0 is not a valid id, we also skip an id that is taken
+ * we expect both events to be extremely rare
+ */
+ napi->napi_id = 0;
+ while (!napi->napi_id) {
+ napi->napi_id = ++napi_gen_id;
+ if (napi_by_id(napi->napi_id))
+ napi->napi_id = 0;
+ }
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(napi_hash_add);
+
+/* Warning : caller is responsible to make sure rcu grace period
+ * is respected before freeing memory containing @napi
+ */
+void napi_hash_del(struct napi_struct *napi)
+{
+ spin_lock(&napi_hash_lock);
+
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+ hlist_del_rcu(&napi->napi_hash_node);
+
+ spin_unlock(&napi_hash_lock);
+}
+EXPORT_SYMBOL_GPL(napi_hash_del);
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -4057,6 +4381,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->gro_list = NULL;
napi->skb = NULL;
napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ pr_err_once("netif_napi_add() called with weight %d on device %s\n",
+ weight, dev->name);
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
@@ -4070,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add);
void netif_napi_del(struct napi_struct *napi)
{
- struct sk_buff *skb, *next;
-
list_del_init(&napi->dev_list);
napi_free_frags(napi);
- for (skb = napi->gro_list; skb; skb = next) {
- next = skb->next;
- skb->next = NULL;
- kfree_skb(skb);
- }
-
+ kfree_skb_list(napi->gro_list);
napi->gro_list = NULL;
napi->gro_count = 0;
}
@@ -4103,7 +4423,7 @@ static void net_rx_action(struct softirq_action *h)
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
- if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
+ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
@@ -4181,55 +4501,31 @@ softnet_break:
goto out;
}
-struct netdev_upper {
+struct netdev_adjacent {
struct net_device *dev;
- bool master;
- struct list_head list;
- struct rcu_head rcu;
- struct list_head search_list;
-};
-static void __append_search_uppers(struct list_head *search_list,
- struct net_device *dev)
-{
- struct netdev_upper *upper;
+ /* upper master flag, there can only be one master device per list */
+ bool master;
- list_for_each_entry(upper, &dev->upper_dev_list, list) {
- /* check if this upper is not already in search list */
- if (list_empty(&upper->search_list))
- list_add_tail(&upper->search_list, search_list);
- }
-}
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
-static bool __netdev_search_upper_dev(struct net_device *dev,
- struct net_device *upper_dev)
-{
- LIST_HEAD(search_list);
- struct netdev_upper *upper;
- struct netdev_upper *tmp;
- bool ret = false;
+ /* private field for the users */
+ void *private;
- __append_search_uppers(&search_list, dev);
- list_for_each_entry(upper, &search_list, search_list) {
- if (upper->dev == upper_dev) {
- ret = true;
- break;
- }
- __append_search_uppers(&search_list, upper->dev);
- }
- list_for_each_entry_safe(upper, tmp, &search_list, search_list)
- INIT_LIST_HEAD(&upper->search_list);
- return ret;
-}
+ struct list_head list;
+ struct rcu_head rcu;
+};
-static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
- struct net_device *upper_dev)
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *adj_list)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *adj;
- list_for_each_entry(upper, &dev->upper_dev_list, list) {
- if (upper->dev == upper_dev)
- return upper;
+ list_for_each_entry(adj, adj_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
}
return NULL;
}
@@ -4248,7 +4544,7 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return __netdev_find_upper(dev, upper_dev);
+ return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
@@ -4259,13 +4555,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
* Find out if a device is linked to an upper device and return true in case
* it is. The caller must hold the RTNL lock.
*/
-bool netdev_has_any_upper_dev(struct net_device *dev)
+static bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
- return !list_empty(&dev->upper_dev_list);
+ return !list_empty(&dev->all_adj_list.upper);
}
-EXPORT_SYMBOL(netdev_has_any_upper_dev);
/**
* netdev_master_upper_dev_get - Get master upper device
@@ -4276,21 +4571,185 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev);
*/
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *upper;
ASSERT_RTNL();
- if (list_empty(&dev->upper_dev_list))
+ if (list_empty(&dev->adj_list.upper))
return NULL;
- upper = list_first_entry(&dev->upper_dev_list,
- struct netdev_upper, list);
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
if (likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+void *netdev_adjacent_get_private(struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ adj = list_entry(adj_list, struct netdev_adjacent, list);
+
+ return adj->private;
+}
+EXPORT_SYMBOL(netdev_adjacent_get_private);
+
+/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
+/**
+ * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->all_adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+/**
+ * netdev_lower_get_next_private - Get the next ->private from the
+ * lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold either hold the
+ * RTNL lock or its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
+ */
+void *netdev_lower_get_next_private(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private);
+
+/**
+ * netdev_lower_get_next_private_rcu - Get the next ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_next_private_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
+
+/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
+ */
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_lower_get_next);
+
+/**
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ *
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_first_or_null_rcu(&dev->adj_list.lower,
+ struct netdev_adjacent, list);
+ if (lower)
+ return lower->private;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
+
/**
* netdev_master_upper_dev_get_rcu - Get master upper device
* @dev: device
@@ -4300,20 +4759,217 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get);
*/
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *upper;
- upper = list_first_or_null_rcu(&dev->upper_dev_list,
- struct netdev_upper, list);
+ upper = list_first_or_null_rcu(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
if (upper && likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", adj_dev->name);
+ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
+ linkname);
+}
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
+ char *name,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", name);
+ sysfs_remove_link(&(dev->dev.kobj), linkname);
+}
+
+#define netdev_adjacent_is_neigh_list(dev, dev_list) \
+ (dev_list == &dev->adj_list.upper || \
+ dev_list == &dev->adj_list.lower)
+
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list,
+ void *private, bool master)
+{
+ struct netdev_adjacent *adj;
+ int ret;
+
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (adj) {
+ adj->ref_nr++;
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->ref_nr = 1;
+ adj->private = private;
+ dev_hold(adj_dev);
+
+ pr_debug("dev_hold for %s, because of link added from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+
+ if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
+ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
+ if (ret)
+ goto free_adj;
+ }
+
+ /* Ensure that master link is always the first item in list. */
+ if (master) {
+ ret = sysfs_create_link(&(dev->dev.kobj),
+ &(adj_dev->dev.kobj), "master");
+ if (ret)
+ goto remove_symlinks;
+
+ list_add_rcu(&adj->list, dev_list);
+ } else {
+ list_add_tail_rcu(&adj->list, dev_list);
+ }
+
+ return 0;
+
+remove_symlinks:
+ if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+free_adj:
+ kfree(adj);
+ dev_put(adj_dev);
+
+ return ret;
+}
+
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (!adj) {
+ pr_err("tried to remove device %s from %s\n",
+ dev->name, adj_dev->name);
+ BUG();
+ }
+
+ if (adj->ref_nr > 1) {
+ pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
+ adj->ref_nr-1);
+ adj->ref_nr--;
+ return;
+ }
+
+ if (adj->master)
+ sysfs_remove_link(&(dev->dev.kobj), "master");
+
+ if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+
+ list_del_rcu(&adj->list);
+ pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+ dev_put(adj_dev);
+ kfree_rcu(adj, rcu);
+}
+
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list,
+ void *private, bool master)
+{
+ int ret;
+
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
+ master);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
+ false);
+ if (ret) {
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __netdev_adjacent_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower,
+ NULL, false);
+}
+
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list)
+{
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
+}
+
+static void __netdev_adjacent_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower);
+}
+
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private, bool master)
+{
+ int ret = __netdev_adjacent_dev_link(dev, upper_dev);
+
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
+ if (ret) {
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower);
+}
+
static int __netdev_upper_dev_link(struct net_device *dev,
- struct net_device *upper_dev, bool master)
+ struct net_device *upper_dev, bool master,
+ void *private)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *i, *j, *to_i, *to_j;
+ int ret = 0;
ASSERT_RTNL();
@@ -4321,31 +4977,92 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_search_upper_dev(upper_dev, dev))
+ if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
return -EBUSY;
- if (__netdev_find_upper(dev, upper_dev))
+ if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
return -EBUSY;
- upper = kmalloc(sizeof(*upper), GFP_KERNEL);
- if (!upper)
- return -ENOMEM;
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+ master);
+ if (ret)
+ return ret;
- upper->dev = upper_dev;
- upper->master = master;
- INIT_LIST_HEAD(&upper->search_list);
+ /* Now that we linked these devs, make all the upper_dev's
+ * all_adj_list.upper visible to every dev's all_adj_list.lower an
+ * versa, and don't forget the devices itself. All of these
+ * links are non-neighbours.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("Interlinking %s with %s, non-neighbour\n",
+ i->dev->name, j->dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ if (ret)
+ goto rollback_mesh;
+ }
+ }
- /* Ensure that master upper link is always the first item in list. */
- if (master)
- list_add_rcu(&upper->list, &dev->upper_dev_list);
- else
- list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
- dev_hold(upper_dev);
+ /* add dev to every upper_dev's upper device */
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("linking %s's upper device %s with %s\n",
+ upper_dev->name, i->dev->name, dev->name);
+ ret = __netdev_adjacent_dev_link(dev, i->dev);
+ if (ret)
+ goto rollback_upper_mesh;
+ }
+
+ /* add upper_dev to every dev's lower device */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ pr_debug("linking %s's lower device %s with %s\n", dev->name,
+ i->dev->name, upper_dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ if (ret)
+ goto rollback_lower_mesh;
+ }
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
return 0;
+
+rollback_lower_mesh:
+ to_i = i;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ }
+
+ i = NULL;
+
+rollback_upper_mesh:
+ to_i = i;
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+ }
+
+ i = j = NULL;
+
+rollback_mesh:
+ to_i = i;
+ to_j = j;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i && j == to_j)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ }
+ if (i == to_i)
+ break;
+ }
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ return ret;
}
/**
@@ -4361,7 +5078,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev)
{
- return __netdev_upper_dev_link(dev, upper_dev, false);
+ return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
@@ -4379,10 +5096,18 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
int netdev_master_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev)
{
- return __netdev_upper_dev_link(dev, upper_dev, true);
+ return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
+int netdev_master_upper_dev_link_private(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, private);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
+
/**
* netdev_upper_dev_unlink - Removes a link to upper device
* @dev: device
@@ -4394,28 +5119,99 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
- struct netdev_upper *upper;
-
+ struct netdev_adjacent *i, *j;
ASSERT_RTNL();
- upper = __netdev_find_upper(dev, upper_dev);
- if (!upper)
- return;
- list_del_rcu(&upper->list);
- dev_put(upper_dev);
- kfree_rcu(upper, rcu);
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ /* Here is the tricky part. We must remove all dev's lower
+ * devices from all upper_dev's upper devices and vice
+ * versa, to maintain the graph relationship.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+ /* remove also the devices itself from lower/upper device
+ * list
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
+{
+ struct netdev_adjacent *iter;
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ }
+}
+
+void *netdev_lower_dev_get_private(struct net_device *dev,
+ struct net_device *lower_dev)
+{
+ struct netdev_adjacent *lower;
+
+ if (!lower_dev)
+ return NULL;
+ lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+ if (!lower)
+ return NULL;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
+
+int dev_get_nest_level(struct net_device *dev,
+ bool (*type_check)(struct net_device *dev))
+{
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
+
+ ASSERT_RTNL();
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower, type_check);
+ if (max_nest < nest)
+ max_nest = nest;
+ }
+
+ if (type_check(dev))
+ max_nest++;
+
+ return max_nest;
+}
+EXPORT_SYMBOL(dev_get_nest_level);
+
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
- if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
+ if (ops->ndo_change_rx_flags)
ops->ndo_change_rx_flags(dev, flags);
}
-static int __dev_set_promiscuity(struct net_device *dev, int inc)
+static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
kuid_t uid;
@@ -4458,6 +5254,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
dev_change_rx_flags(dev, IFF_PROMISC);
}
+ if (notify)
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC);
return 0;
}
@@ -4477,7 +5275,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
unsigned int old_flags = dev->flags;
int err;
- err = __dev_set_promiscuity(dev, inc);
+ err = __dev_set_promiscuity(dev, inc, true);
if (err < 0)
return err;
if (dev->flags != old_flags)
@@ -4486,22 +5284,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
}
EXPORT_SYMBOL(dev_set_promiscuity);
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
+static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
- unsigned int old_flags = dev->flags;
+ unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
ASSERT_RTNL();
@@ -4524,9 +5309,30 @@ int dev_set_allmulti(struct net_device *dev, int inc)
if (dev->flags ^ old_flags) {
dev_change_rx_flags(dev, IFF_ALLMULTI);
dev_set_rx_mode(dev);
+ if (notify)
+ __dev_notify_flags(dev, old_flags,
+ dev->gflags ^ old_gflags);
}
return 0;
}
+
+/**
+ * dev_set_allmulti - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all interfaces. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ * Return 0 if successful or a negative errno code on error.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ return __dev_set_allmulti(dev, inc, true);
+}
EXPORT_SYMBOL(dev_set_allmulti);
/*
@@ -4551,10 +5357,10 @@ void __dev_set_rx_mode(struct net_device *dev)
* therefore calling __dev_set_promiscuity here is safe.
*/
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
- __dev_set_promiscuity(dev, 1);
+ __dev_set_promiscuity(dev, 1, false);
dev->uc_promisc = true;
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
- __dev_set_promiscuity(dev, -1);
+ __dev_set_promiscuity(dev, -1, false);
dev->uc_promisc = false;
}
}
@@ -4643,9 +5449,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
+ unsigned int old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
- dev_set_promiscuity(dev, inc);
+
+ if (__dev_set_promiscuity(dev, inc, false) >= 0)
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
@@ -4656,16 +5466,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- dev_set_allmulti(dev, inc);
+ __dev_set_allmulti(dev, inc, false);
}
return ret;
}
-void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges)
{
unsigned int changes = dev->flags ^ old_flags;
+ if (gchanges)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+
if (changes & IFF_UP) {
if (dev->flags & IFF_UP)
call_netdevice_notifiers(NETDEV_UP, dev);
@@ -4674,8 +5488,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
}
if (dev->flags & IFF_UP &&
- (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
- call_netdevice_notifiers(NETDEV_CHANGE, dev);
+ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = changes;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ }
}
/**
@@ -4689,21 +5508,29 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
int ret;
- unsigned int changes, old_flags = dev->flags;
+ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
ret = __dev_change_flags(dev, flags);
if (ret < 0)
return ret;
- changes = old_flags ^ dev->flags;
- if (changes)
- rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
-
- __dev_notify_flags(dev, old_flags);
+ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
+ __dev_notify_flags(dev, old_flags, changes);
return ret;
}
EXPORT_SYMBOL(dev_change_flags);
+static int __dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
/**
* dev_set_mtu - Change maximum transfer unit
* @dev: device
@@ -4713,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags);
*/
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
- const struct net_device_ops *ops = dev->netdev_ops;
- int err;
+ int err, orig_mtu;
if (new_mtu == dev->mtu)
return 0;
@@ -4726,14 +5552,25 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (!netif_device_present(dev))
return -ENODEV;
- err = 0;
- if (ops->ndo_change_mtu)
- err = ops->ndo_change_mtu(dev, new_mtu);
- else
- dev->mtu = new_mtu;
+ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
- if (!err)
- call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ orig_mtu = dev->mtu;
+ err = __dev_set_mtu(dev, new_mtu);
+
+ if (!err) {
+ err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ /* setting mtu back and notifying everyone again,
+ * so that they have a chance to revert changes.
+ */
+ __dev_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ }
+ }
return err;
}
EXPORT_SYMBOL(dev_set_mtu);
@@ -4780,7 +5617,7 @@ EXPORT_SYMBOL(dev_set_mac_address);
/**
* dev_change_carrier - Change device carrier
* @dev: device
- * @new_carries: new value
+ * @new_carrier: new value
*
* Change device carrier
*/
@@ -4797,6 +5634,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier)
EXPORT_SYMBOL(dev_change_carrier);
/**
+ * dev_get_phys_port_id - Get device physical port ID
+ * @dev: device
+ * @ppid: port ID
+ *
+ * Get device physical port ID
+ */
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_port_id *ppid)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_id(dev, ppid);
+}
+EXPORT_SYMBOL(dev_get_phys_port_id);
+
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
@@ -4817,15 +5672,18 @@ static int dev_new_index(struct net *net)
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
+ dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
{
struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
@@ -4848,7 +5706,9 @@ static void rollback_registered_many(struct list_head *head)
}
/* If device is running, close it first. */
- dev_close_many(head);
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
@@ -4869,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
-
/*
* Flush the unicast and multicast chains
*/
@@ -4882,6 +5738,10 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
@@ -4918,20 +5778,25 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
- /* Fix illegal SG+CSUM combinations. */
- if ((features & NETIF_F_SG) &&
- !(features & NETIF_F_ALL_CSUM)) {
- netdev_dbg(dev,
- "Dropping NETIF_F_SG since no checksum feature.\n");
- features &= ~NETIF_F_SG;
- }
-
/* TSO requires that SG is present as well. */
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
features &= ~NETIF_F_ALL_TSO;
}
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
+ }
+
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
+ }
+
/* TSO ECN requires that TSO is present as well. */
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
features &= ~NETIF_F_TSO_ECN;
@@ -4960,6 +5825,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (dev->netdev_ops->ndo_busy_poll)
+ features |= NETIF_F_BUSY_POLL;
+ else
+#endif
+ features &= ~NETIF_F_BUSY_POLL;
+
return features;
}
@@ -5059,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
@@ -5093,17 +5965,25 @@ static void netdev_init_one_queue(struct net_device *dev,
#endif
}
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ kvfree(dev->_tx);
+}
+
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
struct netdev_queue *tx;
+ size_t sz = count * sizeof(*tx);
- BUG_ON(count < 1);
-
- tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx)
- return -ENOMEM;
+ BUG_ON(count < 1 || count > 0xffff);
+ tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!tx) {
+ tx = vzalloc(sz);
+ if (!tx)
+ return -ENOMEM;
+ }
dev->_tx = tx;
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5162,7 +6042,8 @@ int register_netdevice(struct net_device *dev)
}
}
- if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
+ if (((dev->hw_features | dev->features) &
+ NETIF_F_HW_VLAN_CTAG_FILTER) &&
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
@@ -5186,19 +6067,22 @@ int register_netdevice(struct net_device *dev)
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
- /* Turn on no cache copy if HW is doing checksum */
if (!(dev->flags & IFF_LOOPBACK)) {
dev->hw_features |= NETIF_F_NOCACHE_COPY;
- if (dev->features & NETIF_F_ALL_CSUM) {
- dev->wanted_features |= NETIF_F_NOCACHE_COPY;
- dev->features |= NETIF_F_NOCACHE_COPY;
- }
}
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG;
+
+ /* Make NETIF_F_SG inheritable to MPLS.
+ */
+ dev->mpls_features |= NETIF_F_SG;
+
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
@@ -5245,7 +6129,7 @@ int register_netdevice(struct net_device *dev)
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
@@ -5463,6 +6347,12 @@ void netdev_run_todo(void)
if (dev->destructor)
dev->destructor(dev);
+ /* Report a network device has been unregistered */
+ rtnl_lock();
+ dev_net(dev)->dev_unreg_count--;
+ __rtnl_unlock();
+ wake_up(&netdev_unregistering_wq);
+
/* Free network device */
kobject_put(&dev->dev.kobj);
}
@@ -5514,6 +6404,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
netdev_stats_to_stats64(storage, &dev->stats);
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -5546,6 +6437,13 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+void netdev_freemem(struct net_device *dev)
+{
+ char *addr = (char *)dev - dev->padded;
+
+ kvfree(addr);
+}
+
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
@@ -5555,7 +6453,7 @@ EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
* @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization. Also allocates subquue structs
+ * and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
@@ -5573,7 +6471,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
@@ -5589,7 +6487,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kzalloc(alloc_size, GFP_KERNEL);
+ p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!p)
+ p = vzalloc(alloc_size);
if (!p)
return NULL;
@@ -5598,7 +6498,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
- goto free_p;
+ goto free_dev;
if (dev_addr_init(dev))
goto free_pcpu;
@@ -5613,8 +6513,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
- INIT_LIST_HEAD(&dev->upper_dev_list);
+ INIT_LIST_HEAD(&dev->adj_list.upper);
+ INIT_LIST_HEAD(&dev->adj_list.lower);
+ INIT_LIST_HEAD(&dev->all_adj_list.upper);
+ INIT_LIST_HEAD(&dev->all_adj_list.lower);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
@@ -5623,7 +6527,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (netif_alloc_netdev_queues(dev))
goto free_all;
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
@@ -5642,13 +6546,8 @@ free_all:
free_pcpu:
free_percpu(dev->pcpu_refcnt);
- kfree(dev->_tx);
-#ifdef CONFIG_RPS
- kfree(dev->_rx);
-#endif
-
-free_p:
- kfree(p);
+free_dev:
+ netdev_freemem(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
@@ -5667,8 +6566,8 @@ void free_netdev(struct net_device *dev)
release_net(dev_net(dev));
- kfree(dev->_tx);
-#ifdef CONFIG_RPS
+ netif_free_tx_queues(dev);
+#ifdef CONFIG_SYSFS
kfree(dev->_rx);
#endif
@@ -5685,7 +6584,7 @@ void free_netdev(struct net_device *dev)
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
- kfree((char *)dev - dev->padded);
+ netdev_freemem(dev);
return;
}
@@ -5743,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure stack wont be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -5752,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head)
rollback_registered_many(head);
list_for_each_entry(dev, head, unreg_list)
net_set_todo(dev);
+ list_del(head);
}
}
EXPORT_SYMBOL(unregister_netdevice_many);
@@ -5847,7 +6750,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
/*
* Flush the unicast and multicast chains
@@ -5886,7 +6789,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
synchronize_net();
err = 0;
@@ -5938,11 +6841,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
- netif_rx(skb);
+ netif_rx_internal(skb);
input_queue_head_incr(oldsd);
}
while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
- netif_rx(skb);
+ netif_rx_internal(skb);
input_queue_head_incr(oldsd);
}
@@ -5978,7 +6881,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
}
EXPORT_SYMBOL(netdev_increment_features);
-static struct hlist_head *netdev_create_hash(void)
+static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
struct hlist_head *hash;
@@ -6147,6 +7050,34 @@ static void __net_exit default_device_exit(struct net *net)
rtnl_unlock();
}
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+ /* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace in net_list.
+ */
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
/* At exit all network devices most be removed from a network
@@ -6158,7 +7089,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
- rtnl_lock();
+ /* To prevent network device cleanup code from dereferencing
+ * loopback devices or network devices that have been freed
+ * wait here for all pending unregistrations to complete,
+ * before unregistring the loopback device and allowing the
+ * network namespace be freed.
+ *
+ * The netdev todo list containing all network devices
+ * unregistrations that happen in default_device_exit_batch
+ * will run in the rtnl_unlock() at the end of
+ * default_device_exit_batch.
+ */
+ rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops)
@@ -6168,7 +7110,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
}
unregister_netdevice_many(&dev_kill_list);
- list_del(&dev_kill_list);
rtnl_unlock();
}
@@ -6216,24 +7157,18 @@ static int __init net_dev_init(void)
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
- memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
- sd->completion_queue = NULL;
INIT_LIST_HEAD(&sd->poll_list);
- sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
sd->csd.func = rps_trigger_softirq;
sd->csd.info = sd;
- sd->csd.flags = 0;
sd->cpu = i;
#endif
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
- sd->backlog.gro_list = NULL;
- sd->backlog.gro_count = 0;
}
dev_boot_phase = 0;