aboutsummaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile3
-rw-r--r--net/core/datagram.c26
-rw-r--r--net/core/dev.c438
-rw-r--r--net/core/dev_addr_lists.c85
-rw-r--r--net/core/dst.c31
-rw-r--r--net/core/ethtool.c216
-rw-r--r--net/core/fib_rules.c7
-rw-r--r--net/core/filter.c1857
-rw-r--r--net/core/flow.c140
-rw-r--r--net/core/flow_dissector.c44
-rw-r--r--net/core/iovec.c61
-rw-r--r--net/core/link_watch.c2
-rw-r--r--net/core/neighbour.c32
-rw-r--r--net/core/net-sysfs.c24
-rw-r--r--net/core/net_namespace.c4
-rw-r--r--net/core/netclassid_cgroup.c17
-rw-r--r--net/core/netpoll.c591
-rw-r--r--net/core/netprio_cgroup.c53
-rw-r--r--net/core/pktgen.c92
-rw-r--r--net/core/ptp_classifier.c141
-rw-r--r--net/core/request_sock.c1
-rw-r--r--net/core/rtnetlink.c283
-rw-r--r--net/core/secure_seq.c25
-rw-r--r--net/core/skbuff.c346
-rw-r--r--net/core/sock.c68
-rw-r--r--net/core/sock_diag.c27
-rw-r--r--net/core/timestamping.c19
-rw-r--r--net/core/tso.c77
-rw-r--r--net/core/utils.c8
29 files changed, 2904 insertions, 1814 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 9628c20acff..71093d94ad2 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o
+ sock_diag.o dev_ioctl.o tso.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
@@ -21,5 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a16ed7bbe37..488dd1a825c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -740,17 +740,37 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
+ skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
- return __skb_checksum_complete_head(skb, skb->len);
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+
+ return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
diff --git a/net/core/dev.c b/net/core/dev.c
index 3721db71635..367a586d0c8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -148,6 +148,9 @@ struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -1207,7 +1210,11 @@ EXPORT_SYMBOL(netdev_features_change);
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
- call_netdevice_notifiers(NETDEV_CHANGE, dev);
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = 0;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
@@ -1245,7 +1252,7 @@ static int __dev_open(struct net_device *dev)
* If we don't do this there is a chance ndo_poll_controller
* or ndo_poll may be running while we open the device
*/
- netpoll_rx_disable(dev);
+ netpoll_poll_disable(dev);
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
@@ -1260,7 +1267,7 @@ static int __dev_open(struct net_device *dev)
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
- netpoll_rx_enable(dev);
+ netpoll_poll_enable(dev);
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
@@ -1313,6 +1320,9 @@ static int __dev_close_many(struct list_head *head)
might_sleep();
list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
clear_bit(__LINK_STATE_START, &dev->state);
@@ -1323,7 +1333,7 @@ static int __dev_close_many(struct list_head *head)
* dev->stop() will invoke napi_disable() on all of it's
* napi_struct instances on this device.
*/
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ smp_mb__after_atomic(); /* Commit netif_running(). */
}
dev_deactivate_many(head);
@@ -1343,6 +1353,7 @@ static int __dev_close_many(struct list_head *head)
dev->flags &= ~IFF_UP;
net_dmaengine_put();
+ netpoll_poll_enable(dev);
}
return 0;
@@ -1353,14 +1364,10 @@ static int __dev_close(struct net_device *dev)
int retval;
LIST_HEAD(single);
- /* Temporarily disable netpoll until the interface is down */
- netpoll_rx_disable(dev);
-
list_add(&dev->close_list, &single);
retval = __dev_close_many(&single);
list_del(&single);
- netpoll_rx_enable(dev);
return retval;
}
@@ -1398,14 +1405,9 @@ int dev_close(struct net_device *dev)
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
- /* Block netpoll rx while the interface is going down */
- netpoll_rx_disable(dev);
-
list_add(&dev->close_list, &single);
dev_close_many(&single);
list_del(&single);
-
- netpoll_rx_enable(dev);
}
return 0;
}
@@ -1645,8 +1647,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
__net_timestamp(SKB); \
} \
-static inline bool is_skb_forwardable(struct net_device *dev,
- struct sk_buff *skb)
+bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
unsigned int len;
@@ -1665,6 +1666,30 @@ static inline bool is_skb_forwardable(struct net_device *dev,
return false;
}
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ }
+
+ if (unlikely(!is_skb_forwardable(dev, skb))) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_scrub_packet(skb, true);
+ skb->protocol = eth_type_trans(skb, dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
* dev_forward_skb - loopback an skb to another netif
@@ -1686,24 +1711,7 @@ static inline bool is_skb_forwardable(struct net_device *dev,
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- }
-
- if (unlikely(!is_skb_forwardable(dev, skb))) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
-
- skb_scrub_packet(skb, true);
- skb->protocol = eth_type_trans(skb, dev);
-
- return netif_rx_internal(skb);
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -2286,10 +2294,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-__be16 skb_network_protocol(struct sk_buff *skb)
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
+ unsigned int vlan_depth = skb->mac_len;
__be16 type = skb->protocol;
- int vlan_depth = ETH_HLEN;
/* Tunnel gso handlers can set protocol to ethernet. */
if (type == htons(ETH_P_TEB)) {
@@ -2302,17 +2310,34 @@ __be16 skb_network_protocol(struct sk_buff *skb)
type = eth->h_proto;
}
- while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
- struct vlan_hdr *vh;
+ /* if skb->protocol is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+ if (vlan_depth) {
+ if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
+ return 0;
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
+ }
+ do {
+ struct vlan_hdr *vh;
- if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
- return 0;
+ if (unlikely(!pskb_may_pull(skb,
+ vlan_depth + VLAN_HLEN)))
+ return 0;
- vh = (struct vlan_hdr *)(skb->data + vlan_depth);
- type = vh->h_vlan_encapsulated_proto;
- vlan_depth += VLAN_HLEN;
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (type == htons(ETH_P_8021Q) ||
+ type == htons(ETH_P_8021AD));
}
+ *depth = vlan_depth;
+
return type;
}
@@ -2326,12 +2351,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
- __be16 type = skb_network_protocol(skb);
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
if (unlikely(!type))
return ERR_PTR(-EINVAL);
- __skb_pull(skb, skb->mac_len);
+ __skb_pull(skb, vlan_depth);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
@@ -2494,11 +2520,39 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
return 0;
}
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#ifdef CONFIG_NET_MPLS_GSO
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
+}
+#endif
+
static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t features)
{
+ int tmp;
+ __be16 type;
+
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
+
if (skb->ip_summed != CHECKSUM_NONE &&
- !can_checksum_protocol(features, skb_network_protocol(skb))) {
+ !can_checksum_protocol(features, type)) {
features &= ~NETIF_F_ALL_CSUM;
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
@@ -2803,7 +2857,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
-int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -2878,6 +2932,7 @@ recursion_alert:
rc = -ENETDOWN;
rcu_read_unlock_bh();
+ atomic_long_inc(&dev->tx_dropped);
kfree_skb(skb);
return rc;
out:
@@ -2950,7 +3005,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb->rxhash & flow_table->mask;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
@@ -2984,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u16 tcpu;
+ u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
@@ -3012,7 +3068,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
skb_reset_network_header(skb);
- if (!skb_get_hash(skb))
+ hash = skb_get_hash(skb);
+ if (!hash)
goto done;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
@@ -3021,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
u16 next_cpu;
struct rps_dev_flow *rflow;
- rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+ rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
- next_cpu = sock_flow_table->ents[skb->rxhash &
- sock_flow_table->mask];
+ next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
/*
* If the desired CPU (where last recvmsg was done) is
@@ -3054,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
if (map) {
- tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+ tcpu = map->cpus[((u64) hash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
@@ -3229,10 +3285,6 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- /* if netpoll wants it, pretend we never saw it */
- if (netpoll_rx(skb))
- return NET_RX_DROP;
-
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
@@ -3343,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h)
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
qdisc_run(q);
@@ -3353,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h)
&q->state)) {
__netif_reschedule(q);
} else {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
}
@@ -3439,7 +3491,7 @@ out:
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
- * Register a receive hander for a device. This handler will then be
+ * Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
@@ -3493,11 +3545,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
switch (skb->protocol) {
- case __constant_htons(ETH_P_ARP):
- case __constant_htons(ETH_P_IP):
- case __constant_htons(ETH_P_IPV6):
- case __constant_htons(ETH_P_8021Q):
- case __constant_htons(ETH_P_8021AD):
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
return true;
default:
return false;
@@ -3518,10 +3570,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
trace_netif_receive_skb(skb);
- /* if we've gotten here through NAPI, check netpoll */
- if (netpoll_receive_skb(skb))
- goto out;
-
orig_dev = skb->dev;
skb_reset_network_header(skb);
@@ -3648,7 +3696,6 @@ drop:
unlock:
rcu_read_unlock();
-out:
return ret;
}
@@ -3838,10 +3885,10 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
diffs |= p->vlan_tci ^ skb->vlan_tci;
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
- skb_gro_mac_header(skb));
+ skb_mac_header(skb));
else if (!diffs)
diffs = memcmp(skb_mac_header(p),
- skb_gro_mac_header(skb),
+ skb_mac_header(skb),
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
@@ -3864,6 +3911,27 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
}
}
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
@@ -3872,14 +3940,14 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
struct list_head *head = &offload_base;
int same_flow;
enum gro_result ret;
+ int grow;
- if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
+ if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
if (skb_is_gso(skb) || skb_has_frag_list(skb))
goto normal;
- skb_gro_reset_offset(skb);
gro_list_prepare(napi, skb);
NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
@@ -3937,33 +4005,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
ret = GRO_HELD;
pull:
- if (skb_headlen(skb) < skb_gro_offset(skb)) {
- int grow = skb_gro_offset(skb) - skb_headlen(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->tail += grow;
- skb->data_len -= grow;
-
- skb_shinfo(skb)->frags[0].page_offset += grow;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
-
- if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
- skb_frag_unref(skb, 0);
- memmove(skb_shinfo(skb)->frags,
- skb_shinfo(skb)->frags + 1,
- --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
- }
- }
-
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
ok:
return ret;
@@ -4031,6 +4082,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
trace_napi_gro_receive_entry(skb);
+ skb_gro_reset_offset(skb);
+
return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
@@ -4043,6 +4096,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->vlan_tci = 0;
skb->dev = napi->dev;
skb->skb_iif = 0;
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
napi->skb = skb;
}
@@ -4059,12 +4115,16 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_get_frags);
-static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
- gro_result_t ret)
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb_internal(skb))
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
@@ -4073,7 +4133,6 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
napi_reuse_skb(napi, skb);
break;
- case GRO_HELD:
case GRO_MERGED:
break;
}
@@ -4081,17 +4140,41 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
return ret;
}
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
napi->skb = NULL;
- if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
- napi_reuse_skb(napi, skb);
- return NULL;
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header_fast(skb, 0);
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
- skb->protocol = eth_type_trans(skb, skb->dev);
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
return skb;
}
@@ -4128,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
- __smp_call_function_single(remsd->cpu,
- &remsd->csd, 0);
+ smp_call_function_single_async(remsd->cpu,
+ &remsd->csd);
remsd = next;
}
} else
@@ -4153,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
#endif
napi->weight = weight_p;
local_irq_disable();
- while (work < quota) {
+ while (1) {
struct sk_buff *skb;
- unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
@@ -4169,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
rps_lock(sd);
- qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen)
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
-
- if (qlen < quota - work) {
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
- * and NAPI_STATE_SCHED is the only possible flag set on backlog.
- * we can use a plain write instead of clear_bit(),
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&napi->poll_list);
napi->state = 0;
+ rps_unlock(sd);
- quota = work + qlen;
+ break;
}
+
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
rps_unlock(sd);
}
local_irq_enable();
@@ -4216,7 +4298,7 @@ void __napi_complete(struct napi_struct *n)
BUG_ON(n->gro_list);
list_del(&n->poll_list);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
@@ -4515,6 +4597,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list)
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
+/**
* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
* @dev: device
* @iter: list_head ** of the current position
@@ -4561,8 +4669,7 @@ void *netdev_lower_get_next_private(struct net_device *dev,
if (&lower->list == &dev->adj_list.lower)
return NULL;
- if (iter)
- *iter = lower->list.next;
+ *iter = lower->list.next;
return lower->private;
}
@@ -4590,14 +4697,39 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
if (&lower->list == &dev->adj_list.lower)
return NULL;
- if (iter)
- *iter = &lower->list;
+ *iter = &lower->list;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchanged.
+ */
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_lower_get_next);
+
+/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
* lower neighbour list, RCU
* variant
@@ -4637,7 +4769,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
-int netdev_adjacent_sysfs_add(struct net_device *dev,
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
struct net_device *adj_dev,
struct list_head *dev_list)
{
@@ -4647,7 +4779,7 @@ int netdev_adjacent_sysfs_add(struct net_device *dev,
return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
linkname);
}
-void netdev_adjacent_sysfs_del(struct net_device *dev,
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
char *name,
struct list_head *dev_list)
{
@@ -5047,6 +5179,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
+int dev_get_nest_level(struct net_device *dev,
+ bool (*type_check)(struct net_device *dev))
+{
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
+
+ ASSERT_RTNL();
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower, type_check);
+ if (max_nest < nest)
+ max_nest = nest;
+ }
+
+ if (type_check(dev))
+ max_nest++;
+
+ return max_nest;
+}
+EXPORT_SYMBOL(dev_get_nest_level);
+
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -5516,7 +5672,7 @@ static int dev_new_index(struct net *net)
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
-static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
@@ -5573,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
-
/*
* Flush the unicast and multicast chains
*/
@@ -5586,6 +5738,10 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
@@ -5669,6 +5825,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (dev->netdev_ops->ndo_busy_poll)
+ features |= NETIF_F_BUSY_POLL;
+ else
+#endif
+ features &= ~NETIF_F_BUSY_POLL;
+
return features;
}
@@ -5804,10 +5967,7 @@ static void netdev_init_one_queue(struct net_device *dev,
static void netif_free_tx_queues(struct net_device *dev)
{
- if (is_vmalloc_addr(dev->_tx))
- vfree(dev->_tx);
- else
- kfree(dev->_tx);
+ kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
@@ -6244,6 +6404,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
netdev_stats_to_stats64(storage, &dev->stats);
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -6280,10 +6441,7 @@ void netdev_freemem(struct net_device *dev)
{
char *addr = (char *)dev - dev->padded;
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
+ kvfree(addr);
}
/**
@@ -6388,11 +6546,6 @@ free_all:
free_pcpu:
free_percpu(dev->pcpu_refcnt);
- netif_free_tx_queues(dev);
-#ifdef CONFIG_SYSFS
- kfree(dev->_rx);
-#endif
-
free_dev:
netdev_freemem(dev);
return NULL;
@@ -6489,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure stack won't be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -6498,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head)
rollback_registered_many(head);
list_for_each_entry(dev, head, unreg_list)
net_set_todo(dev);
+ list_del(head);
}
}
EXPORT_SYMBOL(unregister_netdevice_many);
@@ -6953,7 +7110,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
}
unregister_netdevice_many(&dev_kill_list);
- list_del(&dev_kill_list);
rtnl_unlock();
}
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 329d5794e7d..b6b230600b9 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -225,6 +225,91 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
}
EXPORT_SYMBOL(__hw_addr_unsync);
+/**
+ * __hw_addr_sync_dev - Synchronize device's multicast list
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
+
+/**
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
+
static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
diff --git a/net/core/dst.c b/net/core/dst.c
index ca4231ec734..a028409ee43 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -142,12 +142,12 @@ loop:
mutex_unlock(&dst_gc_mutex);
}
-int dst_discard(struct sk_buff *skb)
+int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
-EXPORT_SYMBOL(dst_discard);
+EXPORT_SYMBOL(dst_discard_sk);
const u32 dst_default_metrics[RTAX_MAX + 1] = {
/* This initializer is needed to force linker to place this variable
@@ -184,7 +184,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst->xfrm = NULL;
#endif
dst->input = dst_discard;
- dst->output = dst_discard;
+ dst->output = dst_discard_sk;
dst->error = 0;
dst->obsolete = initial_obsolete;
dst->header_len = 0;
@@ -209,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst)
/* The first case (dev==NULL) is required, when
protocol module is unloaded.
*/
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP))
- dst->input = dst->output = dst_discard;
+ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ }
dst->obsolete = DST_OBSOLETE_DEAD;
}
@@ -267,6 +269,15 @@ again:
}
EXPORT_SYMBOL(dst_destroy);
+static void dst_destroy_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst = dst_destroy(dst);
+ if (dst)
+ __dst_free(dst);
+}
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
@@ -274,11 +285,8 @@ void dst_release(struct dst_entry *dst)
newrefcnt = atomic_dec_return(&dst->__refcnt);
WARN_ON(newrefcnt < 0);
- if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
- dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
- }
+ if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
@@ -361,7 +369,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
return;
if (!unregister) {
- dst->input = dst->output = dst_discard;
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
} else {
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 30071dec287..17cb912793f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXFCS_BIT] = "rx-fcs",
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
};
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
@@ -556,6 +557,23 @@ err_out:
return ret;
}
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ struct ethtool_rxnfc *rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= rx_rings->data)
+ return -EINVAL;
+
+ return 0;
+}
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -564,7 +582,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
int ret;
if (!dev->ethtool_ops->get_rxfh_indir_size ||
- !dev->ethtool_ops->get_rxfh_indir)
+ !dev->ethtool_ops->get_rxfh)
return -EOPNOTSUPP;
dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
if (dev_size == 0)
@@ -590,7 +608,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
if (!indir)
return -ENOMEM;
- ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
if (ret)
goto out;
@@ -612,8 +630,9 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
u32 *indir;
const struct ethtool_ops *ops = dev->ethtool_ops;
int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
- if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir ||
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
!ops->get_rxnfc)
return -EOPNOTSUPP;
@@ -642,28 +661,184 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
for (i = 0; i < dev_size; i++)
indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
} else {
- if (copy_from_user(indir,
- useraddr +
- offsetof(struct ethtool_rxfh_indir,
- ring_index[0]),
- dev_size * sizeof(indir[0]))) {
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + ringidx_offset,
+ &rx_rings,
+ dev_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ops->set_rxfh(dev, indir, NULL);
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 user_indir_size, user_key_size;
+ u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh rxfh;
+ u32 total_size;
+ u32 indir_bytes;
+ u32 *indir = NULL;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+
+ if (!(dev->ethtool_ops->get_rxfh_indir_size ||
+ dev->ethtool_ops->get_rxfh_key_size) ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
+
+ rxfh.indir_size = dev_indir_size;
+ rxfh.key_size = dev_key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size and key size. Otherwise, if the user size is
+ * not equal to device table size or key size it's an error.
+ */
+ if (!user_indir_size && !user_key_size)
+ return 0;
+
+ if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
+ (user_key_size && (user_key_size != dev_key_size)))
+ return -EINVAL;
+
+ indir_bytes = user_indir_size * sizeof(indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (user_indir_size)
+ indir = (u32 *)rss_config;
+
+ if (user_key_size)
+ hkey = rss_config + indir_bytes;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
+ if (!ret) {
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size))
ret = -EFAULT;
+ }
+
+ kfree(rss_config);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings;
+ struct ethtool_rxfh rxfh;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 *indir = NULL, indir_bytes = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+
+ if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
+ !ops->get_rxnfc || !ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
+
+ /* If either indir or hash key is valid, proceed further.
+ * It is not valid to request that both be unchanged.
+ */
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0))
+ return -EINVAL;
+
+ if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ indir_bytes = dev_indir_size * sizeof(indir[0]);
+
+ rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ /* rxfh.indir_size == 0 means reset the indir table to default.
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ indir = (u32 *)rss_config;
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + rss_cfg_offset,
+ &rx_rings,
+ rxfh.indir_size);
+ if (ret)
goto out;
- }
+ } else if (rxfh.indir_size == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ }
- /* Validate ring indices */
- for (i = 0; i < dev_size; i++) {
- if (indir[i] >= rx_rings.data) {
- ret = -EINVAL;
- goto out;
- }
+ if (rxfh.key_size) {
+ hkey = rss_config + indir_bytes;
+ if (copy_from_user(hkey,
+ useraddr + rss_cfg_offset + indir_bytes,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out;
}
}
- ret = ops->set_rxfh_indir(dev, indir);
+ ret = ops->set_rxfh(dev, indir, hkey);
out:
- kfree(indir);
+ kfree(rss_config);
return ret;
}
@@ -1490,6 +1665,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
case ETHTOOL_GFEATURES:
case ETHTOOL_GCHANNELS:
case ETHTOOL_GET_TS_INFO:
@@ -1627,6 +1803,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXFHINDIR:
rc = ethtool_set_rxfh_indir(dev, useraddr);
break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
case ETHTOOL_GFEATURES:
rc = ethtool_get_features(dev, useraddr);
break;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f409e0bd35c..185c341fafb 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -745,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,
attach_rules(&ops->rules_list, dev);
break;
+ case NETDEV_CHANGENAME:
+ list_for_each_entry(ops, &net->rules_ops, list) {
+ detach_rules(&ops->rules_list, dev);
+ attach_rules(&ops->rules_list, dev);
+ }
+ break;
+
case NETDEV_UNREGISTER:
list_for_each_entry(ops, &net->rules_ops, list)
detach_rules(&ops->rules_list, dev);
diff --git a/net/core/filter.c b/net/core/filter.c
index ad30d626a5b..1dbf6462f76 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,11 +1,16 @@
/*
* Linux Socket Filter - Kernel level socket filtering
*
- * Author:
- * Jay Schulist <jschlst@samba.org>
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
*
- * Based on the design of:
- * - The Berkeley Packet Filter
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -40,6 +45,27 @@
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -52,9 +78,9 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
+
return NULL;
}
@@ -63,6 +89,7 @@ static inline void *load_pointer(const struct sk_buff *skb, int k,
{
if (k >= 0)
return skb_header_pointer(skb, k, size, buffer);
+
return bpf_internal_load_pointer_neg_helper(skb, k, size);
}
@@ -108,304 +135,960 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sk_filter);
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
/**
- * sk_run_filter - run a filter on a socket
- * @skb: buffer to run the filter on
- * @fentry: filter to apply
+ * __sk_run_filter - run a filter on a given context
+ * @ctx: buffer to run the filter on
+ * @insn: filter to apply
*
- * Decode and apply filter instructions to the skb->data.
- * Return length to keep, 0 for none. @skb is the data we are
- * filtering, @filter is the array of filter instructions.
- * Because all jumps are guaranteed to be before last instruction,
- * and last instruction guaranteed to be a RET, we dont need to check
- * flen. (We used to pass to this function the length of filter)
+ * Decode and apply filter instructions to the skb->data. Return length to
+ * keep, 0 for none. @ctx is the data we are operating on, @insn is the
+ * array of filter instructions.
*/
-unsigned int sk_run_filter(const struct sk_buff *skb,
- const struct sock_filter *fentry)
+static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ };
void *ptr;
- u32 A = 0; /* Accumulator */
- u32 X = 0; /* Index Register */
- u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
- u32 tmp;
- int k;
+ int off;
- /*
- * Process array of filter instructions.
- */
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
-#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
- switch (fentry->code) {
- case BPF_S_ALU_ADD_X:
- A += X;
- continue;
- case BPF_S_ALU_ADD_K:
- A += K;
- continue;
- case BPF_S_ALU_SUB_X:
- A -= X;
- continue;
- case BPF_S_ALU_SUB_K:
- A -= K;
- continue;
- case BPF_S_ALU_MUL_X:
- A *= X;
- continue;
- case BPF_S_ALU_MUL_K:
- A *= K;
- continue;
- case BPF_S_ALU_DIV_X:
- if (X == 0)
- return 0;
- A /= X;
- continue;
- case BPF_S_ALU_DIV_K:
- A /= K;
- continue;
- case BPF_S_ALU_MOD_X:
- if (X == 0)
- return 0;
- A %= X;
- continue;
- case BPF_S_ALU_MOD_K:
- A %= K;
- continue;
- case BPF_S_ALU_AND_X:
- A &= X;
- continue;
- case BPF_S_ALU_AND_K:
- A &= K;
- continue;
- case BPF_S_ALU_OR_X:
- A |= X;
- continue;
- case BPF_S_ALU_OR_K:
- A |= K;
- continue;
- case BPF_S_ANC_ALU_XOR_X:
- case BPF_S_ALU_XOR_X:
- A ^= X;
- continue;
- case BPF_S_ALU_XOR_K:
- A ^= K;
- continue;
- case BPF_S_ALU_LSH_X:
- A <<= X;
- continue;
- case BPF_S_ALU_LSH_K:
- A <<= K;
- continue;
- case BPF_S_ALU_RSH_X:
- A >>= X;
- continue;
- case BPF_S_ALU_RSH_K:
- A >>= K;
- continue;
- case BPF_S_ALU_NEG:
- A = -A;
- continue;
- case BPF_S_JMP_JA:
- fentry += K;
- continue;
- case BPF_S_JMP_JGT_K:
- fentry += (A > K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_K:
- fentry += (A >= K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_K:
- fentry += (A == K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_K:
- fentry += (A & K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGT_X:
- fentry += (A > X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_X:
- fentry += (A >= X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_X:
- fentry += (A == X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_X:
- fentry += (A & X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_LD_W_ABS:
- k = K;
-load_w:
- ptr = load_pointer(skb, k, 4, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be32(ptr);
- continue;
- }
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
return 0;
- case BPF_S_LD_H_ABS:
- k = K;
-load_h:
- ptr = load_pointer(skb, k, 2, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be16(ptr);
- continue;
- }
+ tmp = DST;
+ DST = do_div(tmp, SRC);
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
return 0;
- case BPF_S_LD_B_ABS:
- k = K;
-load_b:
- ptr = load_pointer(skb, k, 1, &tmp);
- if (ptr != NULL) {
- A = *(u8 *)ptr;
- continue;
- }
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ tmp = DST;
+ DST = do_div(tmp, IMM);
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
return 0;
- case BPF_S_LD_W_LEN:
- A = skb->len;
- continue;
- case BPF_S_LDX_W_LEN:
- X = skb->len;
- continue;
- case BPF_S_LD_W_IND:
- k = X + K;
- goto load_w;
- case BPF_S_LD_H_IND:
- k = X + K;
- goto load_h;
- case BPF_S_LD_B_IND:
- k = X + K;
- goto load_b;
- case BPF_S_LDX_B_MSH:
- ptr = load_pointer(skb, K, 1, &tmp);
- if (ptr != NULL) {
- X = (*(u8 *)ptr & 0xf) << 2;
- continue;
- }
+ do_div(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
return 0;
- case BPF_S_LD_IMM:
- A = K;
- continue;
- case BPF_S_LDX_IMM:
- X = K;
- continue;
- case BPF_S_LD_MEM:
- A = mem[K];
- continue;
- case BPF_S_LDX_MEM:
- X = mem[K];
- continue;
- case BPF_S_MISC_TAX:
- X = A;
- continue;
- case BPF_S_MISC_TXA:
- A = X;
- continue;
- case BPF_S_RET_K:
- return K;
- case BPF_S_RET_A:
- return A;
- case BPF_S_ST:
- mem[K] = A;
- continue;
- case BPF_S_STX:
- mem[K] = X;
- continue;
- case BPF_S_ANC_PROTOCOL:
- A = ntohs(skb->protocol);
- continue;
- case BPF_S_ANC_PKTTYPE:
- A = skb->pkt_type;
- continue;
- case BPF_S_ANC_IFINDEX:
- if (!skb->dev)
- return 0;
- A = skb->dev->ifindex;
- continue;
- case BPF_S_ANC_MARK:
- A = skb->mark;
- continue;
- case BPF_S_ANC_QUEUE:
- A = skb->queue_mapping;
- continue;
- case BPF_S_ANC_HATYPE:
- if (!skb->dev)
- return 0;
- A = skb->dev->type;
- continue;
- case BPF_S_ANC_RXHASH:
- A = skb->rxhash;
- continue;
- case BPF_S_ANC_CPU:
- A = raw_smp_processor_id();
- continue;
- case BPF_S_ANC_VLAN_TAG:
- A = vlan_tx_tag_get(skb);
- continue;
- case BPF_S_ANC_VLAN_TAG_PRESENT:
- A = !!vlan_tx_tag_present(skb);
- continue;
- case BPF_S_ANC_PAY_OFFSET:
- A = __skb_get_poff(skb);
- continue;
- case BPF_S_ANC_NLATTR: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = nla_find((struct nlattr *)&skb->data[A],
- skb->len - A, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ do_div(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
}
- case BPF_S_ANC_NLATTR_NEST: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = (struct nlattr *)&skb->data[A];
- if (nla->nla_len > A - skb->len)
- return 0;
-
- nla = nla_find_nested(nla, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
}
-#ifdef CONFIG_SECCOMP_FILTER
- case BPF_S_ANC_SECCOMP_LD_W:
- A = seccomp_bpf_load(fentry->k);
- continue;
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX, ST and LDX */
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns are
+ * only appearing in the programs where ctx ==
+ * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
+ * == BPF_R6, sk_convert_filter() saves it in BPF_R6,
+ * internal BPF verifier will check that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+/* Helper to find the offset of pkt_type in sk_buff structure. We want
+ * to make sure it's still a 3-bit field starting at a byte boundary;
+ * taken from arch/x86/net/bpf_jit_comp.c.
+ */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_TYPE_MAX (7 << 5)
+#else
+#define PKT_TYPE_MAX 7
+#endif
+static unsigned int pkt_type_offset(void)
+{
+ struct sk_buff skb_probe = { .pkt_type = ~0, };
+ u8 *ct = (u8 *) &skb_probe;
+ unsigned int off;
+
+ for (off = 0; off < sizeof(struct sk_buff); off++) {
+ if (ct[off] == PKT_TYPE_MAX)
+ return off;
+ }
+
+ pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
+ return -1;
+}
+
+static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+}
+
+static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (nla->nla_len > skb->len - a)
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+/* note that this only generates 32-bit random numbers */
+static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+/* Translate a classic BPF ancillary load into internal BPF instructions.
+ *
+ * @fp: classic instruction whose k selects the extension
+ *      (SKF_AD_OFF + SKF_AD_*)
+ * @insnp: in/out cursor into the caller's instruction buffer
+ *
+ * On success the instructions are written starting at *insnp, *insnp is
+ * left pointing at the LAST instruction emitted (the caller is expected
+ * to step past it), and true is returned. Returns false, leaving *insnp
+ * unchanged, when k is not a known extension or when the packet type
+ * offset is negative.
+ */
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct sock_filter_int **insnp)
+{
+ struct sock_filter_int *insn = *insnp;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
+ pkt_type_offset());
+ /* Bail out when pkt_type_offset() reported a negative offset. */
+ if (insn->off < 0)
+ return false;
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);
#endif
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
+
+ /* tmp = skb->dev */
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ /* tmp == 0, i.e. no device attached: leave the program */
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_tci));
+ if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
+ /* A &= ~VLAN_TAG_PRESENT */
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* A >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
+ /* A &= 1 */
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ break;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(__get_random_u32);
+ break;
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ /* Hand back the position of the last instruction written. */
+ *insnp = insn;
+ return true;
+}
+
+/**
+ * sk_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: buffer where converted program will be stored
+ * @new_len: pointer to store length of converted program
+ *
+ * Remap 'sock_filter' style BPF instruction set to 'sock_filter_int' style.
+ * Conversion workflow:
+ *
+ * 1) First pass for calculating the new program length:
+ * sk_convert_filter(old_prog, old_len, NULL, &new_len)
+ *
+ * 2) 2nd pass to remap in two passes: 1st pass finds new
+ * jump offsets, 2nd pass remapping:
+ * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
+ * sk_convert_filter(old_prog, old_len, new_prog, &new_len);
+ *
+ * User BPF's register A is mapped to our BPF register 6, user BPF
+ * register X is mapped to BPF register 7; frame pointer is always
+ * register 10; Context 'void *ctx' is stored in register 1, that is,
+ * for socket filters: ctx == 'struct sk_buff *', for seccomp:
+ * ctx == 'struct seccomp_data *'.
+ */
+int sk_convert_filter(struct sock_filter *prog, int len,
+ struct sock_filter_int *new_prog, int *new_len)
+{
+ int new_flen = 0, pass = 0, target, i;
+ struct sock_filter_int *new_insn;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
+
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = new_prog;
+ fp = prog;
+
+ if (new_insn)
+ *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ new_insn++;
+
+ for (i = 0; i < len; fp++, i++) {
+ struct sock_filter_int tmp_insns[6] = { };
+ struct sock_filter_int *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - new_prog;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
+ break;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ if (target >= len || target < 0) \
+ goto err; \
+ insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ insn->off -= insn - tmp_insns; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+ /* BPF immediates are signed, zero extend
+ * immediate into tmp register and use it
+ * in compare insn.
+ */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_X;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
+ }
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Convert JEQ into JNE when 'jump_true' is next insn. */
+ if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+ /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B:
+ /* tmp = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+
+ /* RET_K, RET_A are remapped into 2 insns. */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
+ BPF_K : BPF_X, BPF_REG_0,
+ BPF_REG_A, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
+ case BPF_ST:
+ case BPF_STX:
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+ /* Unknown instruction. */
default:
- WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
- fentry->code, fentry->jt,
- fentry->jf, fentry->k);
- return 0;
+ goto err;
}
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
+ }
+
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - new_prog;
+ return 0;
}
+ pass++;
+ if (new_flen != new_insn - new_prog) {
+ new_flen = new_insn - new_prog;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
+ }
+
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
}
-EXPORT_SYMBOL(sk_run_filter);
-/*
- * Security :
+/* Security:
+ *
* A BPF program is able to use 16 cells of memory to store intermediate
- * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
+ * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
+ *
* As we dont want to clear mem[] array for each packet going through
* sk_run_filter(), we check that filter loaded by user never try to read
* a cell if not previously written, and we check all branches to be sure
@@ -413,44 +1096,46 @@ EXPORT_SYMBOL(sk_run_filter);
*/
static int check_load_and_stores(struct sock_filter *filter, int flen)
{
- u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
int pc, ret = 0;
BUILD_BUG_ON(BPF_MEMWORDS > 16);
- masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
if (!masks)
return -ENOMEM;
+
memset(masks, 0xff, flen * sizeof(*masks));
for (pc = 0; pc < flen; pc++) {
memvalid &= masks[pc];
switch (filter[pc].code) {
- case BPF_S_ST:
- case BPF_S_STX:
+ case BPF_ST:
+ case BPF_STX:
memvalid |= (1 << filter[pc].k);
break;
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
if (!(memvalid & (1 << filter[pc].k))) {
ret = -EINVAL;
goto error;
}
break;
- case BPF_S_JMP_JA:
- /* a jump must set masks on target */
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
masks[pc + 1 + filter[pc].k] &= memvalid;
memvalid = ~0;
break;
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_X:
- case BPF_S_JMP_JSET_K:
- /* a jump must set masks on targets */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
masks[pc + 1 + filter[pc].jt] &= memvalid;
masks[pc + 1 + filter[pc].jf] &= memvalid;
memvalid = ~0;
@@ -462,6 +1147,72 @@ error:
return ret;
}
+/* Tell whether a classic BPF opcode is on the list of accepted codes.
+ *
+ * @code_to_probe: the code field of a classic BPF instruction
+ *
+ * Returns true only for the opcodes explicitly enumerated in the table
+ * below. Any other value, including one beyond the end of the table,
+ * yields false.
+ */
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ /* Indexed by opcode; entries not listed default to false. */
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ /* The table only extends to the largest listed opcode. */
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
+}
+
+
/**
* sk_chk_filter - verify socket filter code
* @filter: filter to verify
@@ -478,187 +1229,303 @@ error:
*/
int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
{
- /*
- * Valid instructions are initialized to non-0.
- * Invalid instructions are initialized to 0.
- */
- static const u8 codes[] = {
- [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K,
- [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X,
- [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K,
- [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X,
- [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K,
- [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X,
- [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X,
- [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K,
- [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X,
- [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K,
- [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X,
- [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K,
- [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X,
- [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K,
- [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X,
- [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K,
- [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X,
- [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K,
- [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X,
- [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG,
- [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS,
- [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS,
- [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS,
- [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN,
- [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND,
- [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND,
- [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND,
- [BPF_LD|BPF_IMM] = BPF_S_LD_IMM,
- [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN,
- [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH,
- [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM,
- [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX,
- [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA,
- [BPF_RET|BPF_K] = BPF_S_RET_K,
- [BPF_RET|BPF_A] = BPF_S_RET_A,
- [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K,
- [BPF_LD|BPF_MEM] = BPF_S_LD_MEM,
- [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM,
- [BPF_ST] = BPF_S_ST,
- [BPF_STX] = BPF_S_STX,
- [BPF_JMP|BPF_JA] = BPF_S_JMP_JA,
- [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K,
- [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X,
- [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K,
- [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X,
- [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K,
- [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X,
- [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
- [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
- };
- int pc;
bool anc_found;
+ int pc;
if (flen == 0 || flen > BPF_MAXINSNS)
return -EINVAL;
- /* check the filter code now */
+ /* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
struct sock_filter *ftest = &filter[pc];
- u16 code = ftest->code;
- if (code >= ARRAY_SIZE(codes))
- return -EINVAL;
- code = codes[code];
- if (!code)
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
return -EINVAL;
+
/* Some instructions need special checks */
- switch (code) {
- case BPF_S_ALU_DIV_K:
- case BPF_S_ALU_MOD_K:
- /* check for division by zero */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
if (ftest->k == 0)
return -EINVAL;
break;
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
- case BPF_S_ST:
- case BPF_S_STX:
- /* check for invalid memory addresses */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* Check for invalid memory addresses */
if (ftest->k >= BPF_MEMWORDS)
return -EINVAL;
break;
- case BPF_S_JMP_JA:
- /*
- * Note, the large ftest->k might cause loops.
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
* Compare this with conditional jumps below,
* where offsets are limited. --ANK (981016)
*/
- if (ftest->k >= (unsigned int)(flen-pc-1))
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
return -EINVAL;
break;
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_X:
- case BPF_S_JMP_JSET_K:
- /* for conditionals both must be safe */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
if (pc + ftest->jt + 1 >= flen ||
pc + ftest->jf + 1 >= flen)
return -EINVAL;
break;
- case BPF_S_LD_W_ABS:
- case BPF_S_LD_H_ABS:
- case BPF_S_LD_B_ABS:
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
anc_found = false;
-#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \
- code = BPF_S_ANC_##CODE; \
- anc_found = true; \
- break
- switch (ftest->k) {
- ANCILLARY(PROTOCOL);
- ANCILLARY(PKTTYPE);
- ANCILLARY(IFINDEX);
- ANCILLARY(NLATTR);
- ANCILLARY(NLATTR_NEST);
- ANCILLARY(MARK);
- ANCILLARY(QUEUE);
- ANCILLARY(HATYPE);
- ANCILLARY(RXHASH);
- ANCILLARY(CPU);
- ANCILLARY(ALU_XOR_X);
- ANCILLARY(VLAN_TAG);
- ANCILLARY(VLAN_TAG_PRESENT);
- ANCILLARY(PAY_OFFSET);
- }
-
- /* ancillary operation unknown or unsupported */
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
if (anc_found == false && ftest->k >= SKF_AD_OFF)
return -EINVAL;
}
- ftest->code = code;
}
- /* last instruction must be a RET code */
+ /* Last instruction must be a RET code */
switch (filter[flen - 1].code) {
- case BPF_S_RET_K:
- case BPF_S_RET_A:
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
return check_load_and_stores(filter, flen);
}
+
return -EINVAL;
}
EXPORT_SYMBOL(sk_chk_filter);
+/* Keep a kernel-side copy of the classic BPF program currently held in
+ * fp->insns, so that sk_get_filter() can later hand it back to user space
+ * unmodified.
+ *
+ * @fp: filter whose insns hold the program as supplied by the user
+ * @fprog: the user's program descriptor; provides the instruction count
+ *
+ * Returns 0 on success or -ENOMEM. On failure fp->orig_prog is always
+ * NULL, so a later sk_release_orig_filter() on the caller's error path
+ * never touches memory that was already freed here.
+ */
+static int sk_store_orig_filter(struct sk_filter *fp,
+				const struct sock_fprog *fprog)
+{
+	unsigned int fsize = sk_filter_proglen(fprog);
+	struct sock_fprog_kern *fkprog;
+
+	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+	if (!fp->orig_prog)
+		return -ENOMEM;
+
+	fkprog = fp->orig_prog;
+	fkprog->len = fprog->len;
+	fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+	if (!fkprog->filter) {
+		kfree(fp->orig_prog);
+		/* Do not leave a dangling pointer behind: the filter is
+		 * still released through sk_release_orig_filter(), which
+		 * would otherwise dereference and free this block again.
+		 */
+		fp->orig_prog = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Free the saved copy of the original program hanging off fp->orig_prog:
+ * first its instruction array, then the descriptor itself. Does nothing
+ * when no copy was stored (orig_prog is NULL).
+ */
+static void sk_release_orig_filter(struct sk_filter *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
/**
* sk_filter_release_rcu - Release a socket filter by rcu_head
* @rcu: rcu_head that contains the sk_filter to free
*/
-void sk_filter_release_rcu(struct rcu_head *rcu)
+static void sk_filter_release_rcu(struct rcu_head *rcu)
{
struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+ sk_release_orig_filter(fp);
+ sk_filter_free(fp);
+}
+
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Drop one reference on @fp. When that was the last reference, the
+ * filter is handed to call_rcu() so that sk_filter_release_rcu() frees
+ * its resources from the RCU callback.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+ if (atomic_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+/* Subtract sk_filter_size(fp->len) from sk->sk_omem_alloc and drop one
+ * reference on @fp through sk_filter_release(), which may free it.
+ */
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ sk_filter_release(fp);
+}
+
+/* Take an additional reference on @fp and add sk_filter_size(fp->len)
+ * to sk->sk_omem_alloc.
+ */
+void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ atomic_inc(&fp->refcnt);
+ atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+}
+
+/* Obtain a filter buffer of @len bytes that takes over from @fp.
+ *
+ * @fp: the current filter
+ * @sk: owning socket, or NULL for an unattached filter
+ * @len: size in bytes of the buffer wanted
+ *
+ * Without a socket this is a plain krealloc(). With a socket, a new
+ * buffer is allocated through sock_kmalloc(), struct sk_filter is copied
+ * over by assignment, and the old filter is uncharged from the socket.
+ *
+ * Returns the new filter, or NULL if the allocation failed. In the socket
+ * case the old @fp is untouched on failure.
+ * NOTE(review): the krealloc() path is presumed to leave @fp valid on
+ * failure as well; the caller relies on this.
+ */
+static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
+ struct sock *sk,
+ unsigned int len)
+{
+ struct sk_filter *fp_new;
+
+ if (sk == NULL)
+ return krealloc(fp, len, GFP_KERNEL);
+
+ fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
+ if (fp_new) {
+ *fp_new = *fp;
+ /* As we're keeping orig_prog in fp_new along,
+ * we need to make sure we're not evicting it
+ * from the old fp.
+ */
+ fp->orig_prog = NULL;
+ sk_filter_uncharge(sk, fp);
+ }
+
+ return fp_new;
+}
+
+/* Convert the classic BPF program stored in fp->insns into the internal
+ * BPF representation and select the runtime for it.
+ *
+ * @fp: filter holding the classic program
+ * @sk: owning socket, or NULL for an unattached filter
+ *
+ * Returns the filter to use from now on, which may be a different
+ * allocation than @fp. On failure the filter is disposed of (uncharged
+ * from @sk, or kfree()d when @sk is NULL) and an ERR_PTR() is returned,
+ * so the caller must not use @fp afterwards.
+ */
+static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
+ struct sock *sk)
+{
+ struct sock_filter *old_prog;
+ struct sk_filter *old_fp;
+ int err, new_len, old_len = fp->len;
+
+ /* We are free to overwrite insns et al right here as it
+ * won't be used at this point in time anymore internally
+ * after the migration to the internal BPF instruction
+ * representation.
+ */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct sock_filter_int));
+
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
+ GFP_KERNEL);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* 1st pass: calculate the new program length. */
+ err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
+ if (err)
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
+ err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ if (err)
+ /* 2nd sk_convert_filter() can fail only if it fails
+ * to allocate memory, remapping must succeed. Note,
+ * that at this time old_fp has already been released
+ * by __sk_migrate_realloc().
+ */
+ goto out_err_free;
+
+ sk_filter_select_runtime(fp);
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ /* Rollback filter setup. */
+ if (sk != NULL)
+ sk_filter_uncharge(sk, fp);
+ else
+ kfree(fp);
+ return ERR_PTR(err);
+}
+
+/* Weak, do-nothing default for the internal BPF JIT hook called from
+ * sk_filter_select_runtime(). Being __weak, it gives way to a strong
+ * definition provided elsewhere.
+ */
+void __weak bpf_int_jit_compile(struct sk_filter *prog)
+{
+}
+
+/**
+ * sk_filter_select_runtime - select execution runtime for BPF program
+ * @fp: sk_filter populated with internal BPF program
+ *
+ * Install the interpreter, __sk_run_filter(), as the function that runs
+ * the program, then call bpf_int_jit_compile() to try to JIT the internal
+ * BPF program. The program is executed via the SK_RUN_FILTER() macro.
+ * NOTE(review): a successful JIT presumably replaces fp->bpf_func; that
+ * code is not part of this function.
+ */
+void sk_filter_select_runtime(struct sk_filter *fp)
+{
+ /* Default to the interpreter. */
+ fp->bpf_func = (void *) __sk_run_filter;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+}
+EXPORT_SYMBOL_GPL(sk_filter_select_runtime);
+
+/* free internal BPF program */
+void sk_filter_free(struct sk_filter *fp)
+{
bpf_jit_free(fp);
}
-EXPORT_SYMBOL(sk_filter_release_rcu);
+EXPORT_SYMBOL_GPL(sk_filter_free);
-static int __sk_prepare_filter(struct sk_filter *fp)
+static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
+ struct sock *sk)
{
int err;
- fp->bpf_func = sk_run_filter;
+ fp->bpf_func = NULL;
+ fp->jited = 0;
err = sk_chk_filter(fp->insns, fp->len);
- if (err)
- return err;
+ if (err) {
+ if (sk != NULL)
+ sk_filter_uncharge(sk, fp);
+ else
+ kfree(fp);
+ return ERR_PTR(err);
+ }
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
bpf_jit_compile(fp);
- return 0;
+
+ /* JIT compiler couldn't process this filter, so do the
+ * internal BPF translation for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = __sk_migrate_filter(fp, sk);
+
+ return fp;
}
/**
* sk_unattached_filter_create - create an unattached filter
- * @fprog: the filter program
* @pfp: the unattached filter that is created
+ * @fprog: the filter program
*
* Create a filter independent of any socket. We first run some
* sanity checks on it to make sure it does not explode on us later.
@@ -666,11 +1533,10 @@ static int __sk_prepare_filter(struct sk_filter *fp)
* a negative errno code is returned. On success the return is zero.
*/
int sk_unattached_filter_create(struct sk_filter **pfp,
- struct sock_fprog *fprog)
+ struct sock_fprog_kern *fprog)
{
+ unsigned int fsize = sk_filter_proglen(fprog);
struct sk_filter *fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
- int err;
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
@@ -679,20 +1545,26 @@ int sk_unattached_filter_create(struct sk_filter **pfp,
fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
if (!fp)
return -ENOMEM;
+
memcpy(fp->insns, fprog->filter, fsize);
atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
- err = __sk_prepare_filter(fp);
- if (err)
- goto free_mem;
+ /* __sk_prepare_filter() already takes care of uncharging
+ * memory in case something goes wrong.
+ */
+ fp = __sk_prepare_filter(fp, NULL);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
*pfp = fp;
return 0;
-free_mem:
- kfree(fp);
- return err;
}
EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
@@ -715,7 +1587,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct sk_filter *fp, *old_fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+ unsigned int fsize = sk_filter_proglen(fprog);
unsigned int sk_fsize = sk_filter_size(fprog->len);
int err;
@@ -729,6 +1601,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
if (!fp)
return -ENOMEM;
+
if (copy_from_user(fp->insns, fprog->filter, fsize)) {
sock_kfree_s(sk, fp, sk_fsize);
return -EFAULT;
@@ -737,18 +1610,26 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
- err = __sk_prepare_filter(fp);
+ err = sk_store_orig_filter(fp, fprog);
if (err) {
sk_filter_uncharge(sk, fp);
- return err;
+ return -ENOMEM;
}
+ /* __sk_prepare_filter() already takes care of uncharging
+ * memory in case something goes wrong.
+ */
+ fp = __sk_prepare_filter(fp, sk);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
old_fp = rcu_dereference_protected(sk->sk_filter,
sock_owned_by_user(sk));
rcu_assign_pointer(sk->sk_filter, fp);
if (old_fp)
sk_filter_uncharge(sk, old_fp);
+
return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
@@ -768,116 +1649,46 @@ int sk_detach_filter(struct sock *sk)
sk_filter_uncharge(sk, filter);
ret = 0;
}
+
return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
-void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
-{
- static const u16 decodes[] = {
- [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K,
- [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X,
- [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K,
- [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X,
- [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K,
- [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X,
- [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X,
- [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K,
- [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X,
- [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K,
- [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X,
- [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K,
- [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X,
- [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K,
- [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X,
- [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K,
- [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X,
- [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K,
- [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X,
- [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG,
- [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS,
- [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS,
- [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN,
- [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND,
- [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND,
- [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND,
- [BPF_S_LD_IMM] = BPF_LD|BPF_IMM,
- [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN,
- [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH,
- [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM,
- [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX,
- [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA,
- [BPF_S_RET_K] = BPF_RET|BPF_K,
- [BPF_S_RET_A] = BPF_RET|BPF_A,
- [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K,
- [BPF_S_LD_MEM] = BPF_LD|BPF_MEM,
- [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM,
- [BPF_S_ST] = BPF_ST,
- [BPF_S_STX] = BPF_STX,
- [BPF_S_JMP_JA] = BPF_JMP|BPF_JA,
- [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K,
- [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X,
- [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K,
- [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X,
- [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K,
- [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X,
- [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K,
- [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X,
- };
- u16 code;
-
- code = filt->code;
-
- to->code = decodes[code];
- to->jt = filt->jt;
- to->jf = filt->jf;
- to->k = filt->k;
-}
-
-int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len)
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len)
{
+ struct sock_fprog_kern *fprog;
struct sk_filter *filter;
- int i, ret;
+ int ret = 0;
lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
- ret = 0;
+ sock_owned_by_user(sk));
if (!filter)
goto out;
- ret = filter->len;
+
+ /* We're copying the filter that has been originally attached,
+ * so no conversion/decode needed anymore.
+ */
+ fprog = filter->orig_prog;
+
+ ret = fprog->len;
if (!len)
+ /* User space only enquires number of filter blocks. */
goto out;
+
ret = -EINVAL;
- if (len < filter->len)
+ if (len < fprog->len)
goto out;
ret = -EFAULT;
- for (i = 0; i < filter->len; i++) {
- struct sock_filter fb;
-
- sk_decode_filter(&filter->insns[i], &fb);
- if (copy_to_user(&ubuf[i], &fb, sizeof(fb)))
- goto out;
- }
+ if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
+ goto out;
- ret = filter->len;
+ /* Instead of bytes, the API requests to return the number
+ * of filter blocks.
+ */
+ ret = fprog->len;
out:
release_sock(sk);
return ret;
diff --git a/net/core/flow.c b/net/core/flow.c
index dfa602ceb8c..a0348fde1fd 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -24,6 +24,7 @@
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
struct flow_cache_entry {
union {
@@ -38,37 +39,14 @@ struct flow_cache_entry {
struct flow_cache_object *object;
};
-struct flow_cache_percpu {
- struct hlist_head *hash_table;
- int hash_count;
- u32 hash_rnd;
- int hash_rnd_recalc;
- struct tasklet_struct flush_tasklet;
-};
-
struct flow_flush_info {
struct flow_cache *cache;
atomic_t cpuleft;
struct completion completion;
};
-struct flow_cache {
- u32 hash_shift;
- struct flow_cache_percpu __percpu *percpu;
- struct notifier_block hotcpu_notifier;
- int low_watermark;
- int high_watermark;
- struct timer_list rnd_timer;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-EXPORT_SYMBOL(flow_cache_genid);
-static struct flow_cache flow_cache_global;
static struct kmem_cache *flow_cachep __read_mostly;
-static DEFINE_SPINLOCK(flow_cache_gc_lock);
-static LIST_HEAD(flow_cache_gc_list);
-
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
@@ -84,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg)
add_timer(&fc->rnd_timer);
}
-static int flow_entry_valid(struct flow_cache_entry *fle)
+static int flow_entry_valid(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
- if (atomic_read(&flow_cache_genid) != fle->genid)
+ if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
return 0;
if (fle->object && !fle->object->ops->check(fle->object))
return 0;
return 1;
}
-static void flow_entry_kill(struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
if (fle->object)
fle->object->ops->delete(fle->object);
@@ -104,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work)
{
struct list_head gc_list;
struct flow_cache_entry *fce, *n;
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
INIT_LIST_HEAD(&gc_list);
- spin_lock_bh(&flow_cache_gc_lock);
- list_splice_tail_init(&flow_cache_gc_list, &gc_list);
- spin_unlock_bh(&flow_cache_gc_lock);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
- flow_entry_kill(fce);
+ flow_entry_kill(fce, xfrm);
}
-static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
- int deleted, struct list_head *gc_list)
+ int deleted, struct list_head *gc_list,
+ struct netns_xfrm *xfrm)
{
if (deleted) {
fcp->hash_count -= deleted;
- spin_lock_bh(&flow_cache_gc_lock);
- list_splice_tail(gc_list, &flow_cache_gc_list);
- spin_unlock_bh(&flow_cache_gc_lock);
- schedule_work(&flow_cache_gc_work);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+ schedule_work(&xfrm->flow_cache_gc_work);
}
}
@@ -135,6 +117,8 @@ static void __flow_cache_shrink(struct flow_cache *fc,
struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
int saved = 0;
@@ -142,7 +126,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (saved < shrink_to &&
- flow_entry_valid(fle)) {
+ flow_entry_valid(fle, xfrm)) {
saved++;
} else {
deleted++;
@@ -152,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
}
}
- flow_cache_queue_garbage(fcp, deleted, &gc_list);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}
static void flow_cache_shrink(struct flow_cache *fc,
@@ -208,7 +192,7 @@ struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver, void *ctx)
{
- struct flow_cache *fc = &flow_cache_global;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle, *tfle;
struct flow_cache_object *flo;
@@ -258,7 +242,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
fcp->hash_count++;
}
- } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+ } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
flo = fle->object;
if (!flo)
goto ret_object;
@@ -279,7 +263,7 @@ nocache:
}
flo = resolver(net, key, family, dir, flo, ctx);
if (fle) {
- fle->genid = atomic_read(&flow_cache_genid);
+ fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
if (!IS_ERR(flo))
fle->object = flo;
else
@@ -303,12 +287,14 @@ static void flow_cache_flush_tasklet(unsigned long data)
struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
fcp = this_cpu_ptr(fc->percpu);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
- if (flow_entry_valid(fle))
+ if (flow_entry_valid(fle, xfrm))
continue;
deleted++;
@@ -317,7 +303,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
}
}
- flow_cache_queue_garbage(fcp, deleted, &gc_list);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
@@ -351,10 +337,9 @@ static void flow_cache_flush_per_cpu(void *data)
tasklet_schedule(tasklet);
}
-void flow_cache_flush(void)
+void flow_cache_flush(struct net *net)
{
struct flow_flush_info info;
- static DEFINE_MUTEX(flow_flush_sem);
cpumask_var_t mask;
int i, self;
@@ -365,8 +350,8 @@ void flow_cache_flush(void)
/* Don't want cpus going down or up during this. */
get_online_cpus();
- mutex_lock(&flow_flush_sem);
- info.cache = &flow_cache_global;
+ mutex_lock(&net->xfrm.flow_flush_sem);
+ info.cache = &net->xfrm.flow_cache_global;
for_each_online_cpu(i)
if (!flow_cache_percpu_empty(info.cache, i))
cpumask_set_cpu(i, mask);
@@ -386,21 +371,23 @@ void flow_cache_flush(void)
wait_for_completion(&info.completion);
done:
- mutex_unlock(&flow_flush_sem);
+ mutex_unlock(&net->xfrm.flow_flush_sem);
put_online_cpus();
free_cpumask_var(mask);
}
+/*
+ * Deferred-flush work handler. flow_cache_init() binds this function to
+ * net->xfrm.flow_cache_flush_work and flow_cache_flush_deferred() schedules
+ * that same work item; here we map the work item back to its namespace and
+ * flush that namespace's flow cache.
+ */
static void flow_cache_flush_task(struct work_struct *work)
{
- flow_cache_flush();
-}
+ /* container_of() must name the member this handler is queued on
+ * (flow_cache_flush_work). Naming flow_cache_gc_work instead would
+ * produce a pointer offset from the real netns_xfrm.
+ */
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_flush_work);
+ struct net *net = container_of(xfrm, struct net, xfrm);
-static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+ flow_cache_flush(net);
+}
-void flow_cache_flush_deferred(void)
+void flow_cache_flush_deferred(struct net *net)
{
- schedule_work(&flow_cache_flush_work);
+ schedule_work(&net->xfrm.flow_cache_flush_work);
}
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
@@ -425,7 +412,8 @@ static int flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+ struct flow_cache *fc = container_of(nfb, struct flow_cache,
+ hotcpu_notifier);
int res, cpu = (unsigned long) hcpu;
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
@@ -444,9 +432,20 @@ static int flow_cache_cpu(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static int __init flow_cache_init(struct flow_cache *fc)
+int flow_cache_init(struct net *net)
{
int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+ if (!flow_cachep)
+ flow_cachep = kmem_cache_create("flow_cache",
+ sizeof(struct flow_cache_entry),
+ 0, SLAB_PANIC, NULL);
+ spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+ INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+ INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+ INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+ mutex_init(&net->xfrm.flow_flush_sem);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
@@ -456,6 +455,8 @@ static int __init flow_cache_init(struct flow_cache *fc)
if (!fc->percpu)
return -ENOMEM;
+ cpu_notifier_register_begin();
+
for_each_online_cpu(i) {
if (flow_cache_cpu_prepare(fc, i))
goto err;
@@ -463,7 +464,9 @@ static int __init flow_cache_init(struct flow_cache *fc)
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
};
- register_hotcpu_notifier(&fc->hotcpu_notifier);
+ __register_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpu_notifier_register_done();
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -479,19 +482,30 @@ err:
fcp->hash_table = NULL;
}
+ cpu_notifier_register_done();
+
free_percpu(fc->percpu);
fc->percpu = NULL;
return -ENOMEM;
}
+EXPORT_SYMBOL(flow_cache_init);
-static int __init flow_cache_init_global(void)
+void flow_cache_fini(struct net *net)
{
- flow_cachep = kmem_cache_create("flow_cache",
- sizeof(struct flow_cache_entry),
- 0, SLAB_PANIC, NULL);
+ int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
- return flow_cache_init(&flow_cache_global);
-}
+ del_timer_sync(&fc->rnd_timer);
+ unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
-module_init(flow_cache_init_global);
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+}
+EXPORT_SYMBOL(flow_cache_fini);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 87577d44755..107ed12a532 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -61,7 +61,7 @@ bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
again:
switch (proto) {
- case __constant_htons(ETH_P_IP): {
+ case htons(ETH_P_IP): {
const struct iphdr *iph;
struct iphdr _iph;
ip:
@@ -77,7 +77,7 @@ ip:
iph_to_flow_copy_addrs(flow, iph);
break;
}
- case __constant_htons(ETH_P_IPV6): {
+ case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
ipv6:
@@ -91,8 +91,8 @@ ipv6:
nhoff += sizeof(struct ipv6hdr);
break;
}
- case __constant_htons(ETH_P_8021AD):
- case __constant_htons(ETH_P_8021Q): {
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
@@ -104,7 +104,7 @@ ipv6:
nhoff += sizeof(*vlan);
goto again;
}
- case __constant_htons(ETH_P_PPP_SES): {
+ case htons(ETH_P_PPP_SES): {
struct {
struct pppoe_hdr hdr;
__be16 proto;
@@ -115,9 +115,9 @@ ipv6:
proto = hdr->proto;
nhoff += PPPOE_SES_HLEN;
switch (proto) {
- case __constant_htons(PPP_IP):
+ case htons(PPP_IP):
goto ip;
- case __constant_htons(PPP_IPV6):
+ case htons(PPP_IPV6):
goto ipv6;
default:
return false;
@@ -203,8 +203,8 @@ static __always_inline u32 __flow_hash_1word(u32 a)
/*
* __skb_get_hash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
- * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
+ * and src/dst port numbers. Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
*/
void __skb_get_hash(struct sk_buff *skb)
@@ -216,7 +216,7 @@ void __skb_get_hash(struct sk_buff *skb)
return;
if (keys.ports)
- skb->l4_rxhash = 1;
+ skb->l4_hash = 1;
/* get a consistent hash (same value on both flow directions) */
if (((__force u32)keys.dst < (__force u32)keys.src) ||
@@ -232,7 +232,7 @@ void __skb_get_hash(struct sk_buff *skb)
if (!hash)
hash = 1;
- skb->rxhash = hash;
+ skb->hash = hash;
}
EXPORT_SYMBOL(__skb_get_hash);
@@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb)
return poff;
}
-static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
-{
- if (unlikely(queue_index >= dev->real_num_tx_queues)) {
- net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
- dev->name, queue_index,
- dev->real_num_tx_queues);
- return 0;
- }
- return queue_index;
-}
-
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
@@ -355,7 +344,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
hash = skb->sk->sk_hash;
else
hash = (__force u16) skb->protocol ^
- skb->rxhash;
+ skb->hash;
hash = __flow_hash_1word(hash);
queue_index = map->queues[
((u64)hash * map->len) >> 32];
@@ -372,7 +361,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
#endif
}
-u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
@@ -392,7 +381,6 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
return queue_index;
}
-EXPORT_SYMBOL(__netdev_pick_tx);
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
@@ -403,13 +391,13 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
if (dev->real_num_tx_queues != 1) {
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb,
- accel_priv);
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
else
queue_index = __netdev_pick_tx(dev, skb);
if (!accel_priv)
- queue_index = dev_cap_txqueue(dev, queue_index);
+ queue_index = netdev_cap_txqueue(dev, queue_index);
}
skb_set_queue_mapping(skb, queue_index);
diff --git a/net/core/iovec.c b/net/core/iovec.c
index b61869429f4..e1ec45ab1e6 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -39,7 +39,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
{
int size, ct, err;
- if (m->msg_namelen) {
+ if (m->msg_name && m->msg_namelen) {
if (mode == VERIFY_READ) {
void __user *namep;
namep = (void __user __force *) m->msg_name;
@@ -48,10 +48,10 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
if (err < 0)
return err;
}
- if (m->msg_name)
- m->msg_name = address;
+ m->msg_name = address;
} else {
m->msg_name = NULL;
+ m->msg_namelen = 0;
}
size = m->msg_iovlen * sizeof(struct iovec);
@@ -75,61 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
}
/*
- * Copy kernel to iovec. Returns -EFAULT on error.
- */
-
-int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
- int offset, int len)
-{
- int copy;
- for (; len > 0; ++iov) {
- /* Skip over the finished iovecs */
- if (unlikely(offset >= iov->iov_len)) {
- offset -= iov->iov_len;
- continue;
- }
- copy = min_t(unsigned int, iov->iov_len - offset, len);
- if (copy_to_user(iov->iov_base + offset, kdata, copy))
- return -EFAULT;
- offset = 0;
- kdata += copy;
- len -= copy;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovecend);
-
-/*
- * Copy iovec to kernel. Returns -EFAULT on error.
- */
-
-int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
- int offset, int len)
-{
- /* Skip over the finished iovecs */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- }
-
- while (len > 0) {
- u8 __user *base = iov->iov_base + offset;
- int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
- offset = 0;
- if (copy_from_user(kdata, base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov++;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovecend);
-
-/*
* And now for the all-in-one: copy and checksum from a user iovec
* directly to a datagram
* Calls to csum_partial but the last must be in 32 bit chunks
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 9c3a839322b..bd0767e6b2b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev)
* Make sure the above read is complete since it can be
* rewritten as soon as we clear the bit below.
*/
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
/* We are about to handle this device,
* so new events can be accepted
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index b9e9e0d3867..ef31fef25e5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -766,9 +766,6 @@ static void neigh_periodic_work(struct work_struct *work)
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
- if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
- goto out;
-
/*
* periodically recompute ReachableTime from random function
*/
@@ -781,6 +778,9 @@ static void neigh_periodic_work(struct work_struct *work)
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
+ if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ goto out;
+
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
@@ -836,10 +836,10 @@ out:
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- return (n->nud_state & NUD_PROBE) ?
- NEIGH_VAR(p, UCAST_PROBES) :
- NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
- NEIGH_VAR(p, MCAST_PROBES);
+ int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES);
+ if (!(n->nud_state & NUD_PROBE))
+ max_probes += NEIGH_VAR(p, MCAST_PROBES);
+ return max_probes;
}
static void neigh_invalidate(struct neighbour *neigh)
@@ -945,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_FAILED;
notify = 1;
neigh_invalidate(neigh);
+ goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
@@ -1247,8 +1248,8 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->updated = jiffies;
if (!(neigh->nud_state & NUD_FAILED))
return;
- neigh->nud_state = NUD_PROBE;
- atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES));
+ neigh->nud_state = NUD_INCOMPLETE;
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
}
@@ -2248,7 +2249,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = pn->flags | NTF_PROXY;
- ndm->ndm_type = NDA_DST;
+ ndm->ndm_type = RTN_UNICAST;
ndm->ndm_ifindex = pn->dev->ifindex;
ndm->ndm_state = NUD_NONE;
@@ -3046,7 +3047,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
if (!t)
goto err;
- for (i = 0; i < ARRAY_SIZE(t->neigh_vars); i++) {
+ for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
t->neigh_vars[i].data += (long) p;
t->neigh_vars[i].extra1 = dev;
t->neigh_vars[i].extra2 = p;
@@ -3058,11 +3059,12 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
+ struct neigh_table *tbl = p->tbl;
dev_name_source = "default";
- t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
- t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
- t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
- t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
}
if (handler) {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 93886246a0b..1cac29ebb05 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -104,6 +104,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
}
NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
NETDEVICE_SHOW_RO(addr_len, fmt_dec);
NETDEVICE_SHOW_RO(iflink, fmt_dec);
@@ -252,6 +253,16 @@ static ssize_t operstate_show(struct device *dev,
}
static DEVICE_ATTR_RO(operstate);
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ return sprintf(buf, fmt_dec,
+ atomic_read(&netdev->carrier_changes));
+}
+static DEVICE_ATTR_RO(carrier_changes);
+
/* read-write attributes */
static int change_mtu(struct net_device *net, unsigned long new_mtu)
@@ -373,6 +384,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
&dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
&dev_attr_iflink.attr,
&dev_attr_ifindex.attr,
&dev_attr_addr_assign_type.attr,
@@ -384,6 +396,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_duplex.attr,
&dev_attr_dormant.attr,
&dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
&dev_attr_ifalias.attr,
&dev_attr_carrier.attr,
&dev_attr_mtu.attr,
@@ -789,7 +802,7 @@ exit:
kobject_put(kobj);
return error;
}
-#endif /* CONFIG_SYFS */
+#endif /* CONFIG_SYSFS */
int
net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
@@ -996,15 +1009,12 @@ static struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
- int i;
-
- for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
- break;
+ unsigned int i;
+ i = queue - dev->_tx;
BUG_ON(i >= dev->num_tx_queues);
return i;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 81d3a9a0845..85b62691f4f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -24,7 +24,7 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
-static DEFINE_MUTEX(net_mutex);
+DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -273,7 +273,7 @@ static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp;
- LIST_HEAD(net_kill_list);
+ struct list_head net_kill_list;
LIST_HEAD(net_exit_list);
/* Atomically snapshot the list of namespaces to cleanup */
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd54166..30d903b19c6 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return css_cls_state(task_css(p, net_cls_subsys_id));
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
}
EXPORT_SYMBOL_GPL(task_cls_state);
@@ -42,7 +42,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
struct cgroup_cls_state *cs = css_cls_state(css);
- struct cgroup_cls_state *parent = css_cls_state(css_parent(css));
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
if (parent)
cs->classid = parent->classid;
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
void *v = (void *)(unsigned long)cs->classid;
struct task_struct *p;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_classid, v);
task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
+struct cgroup_subsys net_cls_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = cgrp_attach,
- .subsys_id = net_cls_subsys_id,
.base_cftypes = ss_files,
- .module = THIS_MODULE,
};
-
-static int __init init_netclassid_cgroup(void)
-{
- return cgroup_load_subsys(&net_cls_subsys);
-}
-__initcall(init_netclassid_cgroup);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c03f3dec476..e33937fb32a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -46,13 +46,9 @@
static struct sk_buff_head skb_pool;
-static atomic_t trapped;
-
DEFINE_STATIC_SRCU(netpoll_srcu);
#define USEC_PER_POLL 50
-#define NETPOLL_RX_ENABLED 1
-#define NETPOLL_RX_DROP 2
#define MAX_SKB_SIZE \
(sizeof(struct ethhdr) + \
@@ -61,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
MAX_UDP_CHUNK)
static void zap_completion_queue(void);
-static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo);
static void netpoll_async_cleanup(struct work_struct *work);
static unsigned int carrier_timeout = 4;
@@ -74,6 +69,37 @@ module_param(carrier_timeout, uint, 0644);
#define np_notice(np, fmt, ...) \
pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
+static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ skb->vlan_tci = 0;
+ }
+
+ status = ops->ndo_start_xmit(skb, dev);
+ if (status == NETDEV_TX_OK)
+ txq_trans_update(txq);
+
+out:
+ return status;
+}
+
static void queue_process(struct work_struct *work)
{
struct netpoll_info *npinfo =
@@ -83,51 +109,31 @@ static void queue_process(struct work_struct *work)
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
- const struct net_device_ops *ops = dev->netdev_ops;
struct netdev_queue *txq;
if (!netif_device_present(dev) || !netif_running(dev)) {
- __kfree_skb(skb);
+ kfree_skb(skb);
continue;
}
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
local_irq_save(flags);
- __netif_tx_lock(txq, smp_processor_id());
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
- ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
- __netif_tx_unlock(txq);
+ HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
schedule_delayed_work(&npinfo->tx_work, HZ/10);
return;
}
- __netif_tx_unlock(txq);
+ HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
}
}
-static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
- unsigned short ulen, __be32 saddr, __be32 daddr)
-{
- __wsum psum;
-
- if (uh->check == 0 || skb_csum_unnecessary(skb))
- return 0;
-
- psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
-
- if (skb->ip_summed == CHECKSUM_COMPLETE &&
- !csum_fold(csum_add(psum, skb->csum)))
- return 0;
-
- skb->csum = psum;
-
- return __skb_checksum_complete(skb);
-}
-
/*
* Check whether delayed processing was scheduled for our NIC. If so,
* we attempt to grab the poll lock and use ->poll() to pump the card.
@@ -138,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
* trylock here and interrupts are already disabled in the softirq
* case. Further, we test the poll_owner to avoid recursion on UP
* systems where the lock doesn't exist.
- *
- * In cases where there is bi-directional communications, reading only
- * one message at a time can lead to packets being dropped by the
- * network adapter, forcing superfluous retries and possibly timeouts.
- * Thus, we set our budget to greater than 1.
*/
-static int poll_one_napi(struct netpoll_info *npinfo,
- struct napi_struct *napi, int budget)
+static int poll_one_napi(struct napi_struct *napi, int budget)
{
int work;
@@ -156,52 +156,35 @@ static int poll_one_napi(struct netpoll_info *npinfo,
if (!test_bit(NAPI_STATE_SCHED, &napi->state))
return budget;
- npinfo->rx_flags |= NETPOLL_RX_DROP;
- atomic_inc(&trapped);
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
- atomic_dec(&trapped);
- npinfo->rx_flags &= ~NETPOLL_RX_DROP;
return budget - work;
}
-static void poll_napi(struct net_device *dev)
+static void poll_napi(struct net_device *dev, int budget)
{
struct napi_struct *napi;
- int budget = 16;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner != smp_processor_id() &&
spin_trylock(&napi->poll_lock)) {
- budget = poll_one_napi(rcu_dereference_bh(dev->npinfo),
- napi, budget);
+ budget = poll_one_napi(napi, budget);
spin_unlock(&napi->poll_lock);
-
- if (!budget)
- break;
}
}
}
-static void service_neigh_queue(struct netpoll_info *npi)
-{
- if (npi) {
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&npi->neigh_tx)))
- netpoll_neigh_reply(skb, npi);
- }
-}
-
static void netpoll_poll_dev(struct net_device *dev)
{
const struct net_device_ops *ops;
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ int budget = 0;
/* Don't do any rx activity if the dev_lock mutex is held
* the dev_open/close paths use this to block netpoll activity
@@ -224,31 +207,14 @@ static void netpoll_poll_dev(struct net_device *dev)
/* Process pending work on NIC */
ops->ndo_poll_controller(dev);
- poll_napi(dev);
+ poll_napi(dev, budget);
up(&ni->dev_lock);
- if (dev->flags & IFF_SLAVE) {
- if (ni) {
- struct net_device *bond_dev;
- struct sk_buff *skb;
- struct netpoll_info *bond_ni;
-
- bond_dev = netdev_master_upper_dev_get_rcu(dev);
- bond_ni = rcu_dereference_bh(bond_dev->npinfo);
- while ((skb = skb_dequeue(&ni->neigh_tx))) {
- skb->dev = bond_dev;
- skb_queue_tail(&bond_ni->neigh_tx, skb);
- }
- }
- }
-
- service_neigh_queue(ni);
-
zap_completion_queue();
}
-void netpoll_rx_disable(struct net_device *dev)
+void netpoll_poll_disable(struct net_device *dev)
{
struct netpoll_info *ni;
int idx;
@@ -259,9 +225,9 @@ void netpoll_rx_disable(struct net_device *dev)
down(&ni->dev_lock);
srcu_read_unlock(&netpoll_srcu, idx);
}
-EXPORT_SYMBOL(netpoll_rx_disable);
+EXPORT_SYMBOL(netpoll_poll_disable);
-void netpoll_rx_enable(struct net_device *dev)
+void netpoll_poll_enable(struct net_device *dev)
{
struct netpoll_info *ni;
rcu_read_lock();
@@ -270,7 +236,7 @@ void netpoll_rx_enable(struct net_device *dev)
up(&ni->dev_lock);
rcu_read_unlock();
}
-EXPORT_SYMBOL(netpoll_rx_enable);
+EXPORT_SYMBOL(netpoll_poll_enable);
static void refill_skbs(void)
{
@@ -304,7 +270,7 @@ static void zap_completion_queue(void)
while (clist != NULL) {
struct sk_buff *skb = clist;
clist = clist->next;
- if (skb->destructor) {
+ if (!skb_irq_freeable(skb)) {
atomic_inc(&skb->users);
dev_kfree_skb_any(skb); /* put this one back */
} else {
@@ -359,7 +325,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
{
int status = NETDEV_TX_BUSY;
unsigned long tries;
- const struct net_device_ops *ops = dev->netdev_ops;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo;
@@ -367,7 +332,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
npinfo = rcu_dereference_bh(np->dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
- __kfree_skb(skb);
+ dev_kfree_skb_irq(skb);
return;
}
@@ -380,29 +345,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
- if (__netif_tx_trylock(txq)) {
- if (!netif_xmit_stopped(txq)) {
- if (vlan_tx_tag_present(skb) &&
- !vlan_hw_offload_capable(netif_skb_features(skb),
- skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
- if (unlikely(!skb)) {
- /* This is actually a packet drop, but we
- * don't want the code at the end of this
- * function to try and re-queue a NULL skb.
- */
- status = NETDEV_TX_OK;
- goto unlock_txq;
- }
- skb->vlan_tci = 0;
- }
-
- status = ops->ndo_start_xmit(skb, dev);
- if (status == NETDEV_TX_OK)
- txq_trans_update(txq);
- }
- unlock_txq:
- __netif_tx_unlock(txq);
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
+
+ HARD_TX_UNLOCK(dev, txq);
if (status == NETDEV_TX_OK)
break;
@@ -417,7 +364,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
WARN_ONCE(!irqs_disabled(),
"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
- dev->name, ops->ndo_start_xmit);
+ dev->name, dev->netdev_ops->ndo_start_xmit);
}
@@ -529,384 +476,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
}
EXPORT_SYMBOL(netpoll_send_udp);
-static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo)
-{
- int size, type = ARPOP_REPLY;
- __be32 sip, tip;
- unsigned char *sha;
- struct sk_buff *send_skb;
- struct netpoll *np, *tmp;
- unsigned long flags;
- int hlen, tlen;
- int hits = 0, proto;
-
- if (list_empty(&npinfo->rx_np))
- return;
-
- /* Before checking the packet, we do some early
- inspection whether this is interesting at all */
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (np->dev == skb->dev)
- hits++;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
- /* No netpoll struct is using this dev */
- if (!hits)
- return;
-
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto == ETH_P_ARP) {
- struct arphdr *arp;
- unsigned char *arp_ptr;
- /* No arp on this interface */
- if (skb->dev->flags & IFF_NOARP)
- return;
-
- if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
- return;
-
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- arp = arp_hdr(skb);
-
- if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
- arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
- arp->ar_pro != htons(ETH_P_IP) ||
- arp->ar_op != htons(ARPOP_REQUEST))
- return;
-
- arp_ptr = (unsigned char *)(arp+1);
- /* save the location of the src hw addr */
- sha = arp_ptr;
- arp_ptr += skb->dev->addr_len;
- memcpy(&sip, arp_ptr, 4);
- arp_ptr += 4;
- /* If we actually cared about dst hw addr,
- it would get copied here */
- arp_ptr += skb->dev->addr_len;
- memcpy(&tip, arp_ptr, 4);
-
- /* Should we ignore arp? */
- if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
- return;
-
- size = arp_hdr_len(skb->dev);
-
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (tip != np->local_ip.ip)
- continue;
-
- hlen = LL_RESERVED_SPACE(np->dev);
- tlen = np->dev->needed_tailroom;
- send_skb = find_skb(np, size + hlen + tlen, hlen);
- if (!send_skb)
- continue;
-
- skb_reset_network_header(send_skb);
- arp = (struct arphdr *) skb_put(send_skb, size);
- send_skb->dev = skb->dev;
- send_skb->protocol = htons(ETH_P_ARP);
-
- /* Fill the device header for the ARP frame */
- if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP,
- sha, np->dev->dev_addr,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- continue;
- }
-
- /*
- * Fill out the arp protocol part.
- *
- * we only support ethernet device type,
- * which (according to RFC 1390) should
- * always equal 1 (Ethernet).
- */
-
- arp->ar_hrd = htons(np->dev->type);
- arp->ar_pro = htons(ETH_P_IP);
- arp->ar_hln = np->dev->addr_len;
- arp->ar_pln = 4;
- arp->ar_op = htons(type);
-
- arp_ptr = (unsigned char *)(arp + 1);
- memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &tip, 4);
- arp_ptr += 4;
- memcpy(arp_ptr, sha, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &sip, 4);
-
- netpoll_send_skb(np, send_skb);
-
- /* If there are several rx_skb_hooks for the same
- * address we're fine by sending a single reply
- */
- break;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- } else if( proto == ETH_P_IPV6) {
-#if IS_ENABLED(CONFIG_IPV6)
- struct nd_msg *msg;
- u8 *lladdr = NULL;
- struct ipv6hdr *hdr;
- struct icmp6hdr *icmp6h;
- const struct in6_addr *saddr;
- const struct in6_addr *daddr;
- struct inet6_dev *in6_dev = NULL;
- struct in6_addr *target;
-
- in6_dev = in6_dev_get(skb->dev);
- if (!in6_dev || !in6_dev->cnf.accept_ra)
- return;
-
- if (!pskb_may_pull(skb, skb->len))
- return;
-
- msg = (struct nd_msg *)skb_transport_header(skb);
-
- __skb_push(skb, skb->data - skb_transport_header(skb));
-
- if (ipv6_hdr(skb)->hop_limit != 255)
- return;
- if (msg->icmph.icmp6_code != 0)
- return;
- if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
- return;
-
- saddr = &ipv6_hdr(skb)->saddr;
- daddr = &ipv6_hdr(skb)->daddr;
-
- size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
-
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (!ipv6_addr_equal(daddr, &np->local_ip.in6))
- continue;
-
- hlen = LL_RESERVED_SPACE(np->dev);
- tlen = np->dev->needed_tailroom;
- send_skb = find_skb(np, size + hlen + tlen, hlen);
- if (!send_skb)
- continue;
-
- send_skb->protocol = htons(ETH_P_IPV6);
- send_skb->dev = skb->dev;
-
- skb_reset_network_header(send_skb);
- hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr));
- *(__be32*)hdr = htonl(0x60000000);
- hdr->payload_len = htons(size);
- hdr->nexthdr = IPPROTO_ICMPV6;
- hdr->hop_limit = 255;
- hdr->saddr = *saddr;
- hdr->daddr = *daddr;
-
- icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr));
- icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
- icmp6h->icmp6_router = 0;
- icmp6h->icmp6_solicited = 1;
-
- target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr));
- *target = msg->target;
- icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size,
- IPPROTO_ICMPV6,
- csum_partial(icmp6h,
- size, 0));
-
- if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6,
- lladdr, np->dev->dev_addr,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- continue;
- }
-
- netpoll_send_skb(np, send_skb);
-
- /* If there are several rx_skb_hooks for the same
- * address, we're fine by sending a single reply
- */
- break;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-#endif
- }
-}
-
-static bool pkt_is_ns(struct sk_buff *skb)
-{
- struct nd_msg *msg;
- struct ipv6hdr *hdr;
-
- if (skb->protocol != htons(ETH_P_ARP))
- return false;
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
- return false;
-
- msg = (struct nd_msg *)skb_transport_header(skb);
- __skb_push(skb, skb->data - skb_transport_header(skb));
- hdr = ipv6_hdr(skb);
-
- if (hdr->nexthdr != IPPROTO_ICMPV6)
- return false;
- if (hdr->hop_limit != 255)
- return false;
- if (msg->icmph.icmp6_code != 0)
- return false;
- if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
- return false;
-
- return true;
-}
-
-int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
-{
- int proto, len, ulen, data_len;
- int hits = 0, offset;
- const struct iphdr *iph;
- struct udphdr *uh;
- struct netpoll *np, *tmp;
- uint16_t source;
-
- if (list_empty(&npinfo->rx_np))
- goto out;
-
- if (skb->dev->type != ARPHRD_ETHER)
- goto out;
-
- /* check if netpoll clients need ARP */
- if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) {
- skb_queue_tail(&npinfo->neigh_tx, skb);
- return 1;
- } else if (pkt_is_ns(skb) && atomic_read(&trapped)) {
- skb_queue_tail(&npinfo->neigh_tx, skb);
- return 1;
- }
-
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
- skb = vlan_untag(skb);
- if (unlikely(!skb))
- goto out;
- }
-
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto != ETH_P_IP && proto != ETH_P_IPV6)
- goto out;
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto out;
- if (skb_shared(skb))
- goto out;
-
- if (proto == ETH_P_IP) {
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
- iph = (struct iphdr *)skb->data;
- if (iph->ihl < 5 || iph->version != 4)
- goto out;
- if (!pskb_may_pull(skb, iph->ihl*4))
- goto out;
- iph = (struct iphdr *)skb->data;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto out;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < iph->ihl*4)
- goto out;
-
- /*
- * Our transport medium may have padded the buffer out.
- * Now We trim to the true length of the frame.
- */
- if (pskb_trim_rcsum(skb, len))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- if (iph->protocol != IPPROTO_UDP)
- goto out;
-
- len -= iph->ihl*4;
- uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
- offset = (unsigned char *)(uh + 1) - skb->data;
- ulen = ntohs(uh->len);
- data_len = skb->len - offset;
- source = ntohs(uh->source);
-
- if (ulen != len)
- goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
- goto out;
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (np->local_ip.ip && np->local_ip.ip != iph->daddr)
- continue;
- if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr)
- continue;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- continue;
-
- np->rx_skb_hook(np, source, skb, offset, data_len);
- hits++;
- }
- } else {
-#if IS_ENABLED(CONFIG_IPV6)
- const struct ipv6hdr *ip6h;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- goto out;
- ip6h = (struct ipv6hdr *)skb->data;
- if (ip6h->version != 6)
- goto out;
- len = ntohs(ip6h->payload_len);
- if (!len)
- goto out;
- if (len + sizeof(struct ipv6hdr) > skb->len)
- goto out;
- if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr)))
- goto out;
- ip6h = ipv6_hdr(skb);
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- goto out;
- uh = udp_hdr(skb);
- offset = (unsigned char *)(uh + 1) - skb->data;
- ulen = ntohs(uh->len);
- data_len = skb->len - offset;
- source = ntohs(uh->source);
- if (ulen != skb->len)
- goto out;
- if (udp6_csum_init(skb, uh, IPPROTO_UDP))
- goto out;
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr))
- continue;
- if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr))
- continue;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- continue;
-
- np->rx_skb_hook(np, source, skb, offset, data_len);
- hits++;
- }
-#endif
- }
-
- if (!hits)
- goto out;
-
- kfree_skb(skb);
- return 1;
-
-out:
- if (atomic_read(&trapped)) {
- kfree_skb(skb);
- return 1;
- }
-
- return 0;
-}
-
void netpoll_print_options(struct netpoll *np)
{
np_info(np, "local port %d\n", np->local_port);
@@ -948,6 +517,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
int ipv6;
+ bool ipversion_set = false;
if (*cur != '@') {
if ((delim = strchr(cur, '@')) == NULL)
@@ -960,6 +530,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
cur++;
if (*cur != '/') {
+ ipversion_set = true;
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
*delim = 0;
@@ -1002,7 +573,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
if (ipv6 < 0)
goto parse_failed;
- else if (np->ipv6 != (bool)ipv6)
+ else if (ipversion_set && np->ipv6 != (bool)ipv6)
goto parse_failed;
else
np->ipv6 = (bool)ipv6;
@@ -1024,11 +595,10 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
}
EXPORT_SYMBOL(netpoll_parse_options);
-int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
struct netpoll_info *npinfo;
const struct net_device_ops *ops;
- unsigned long flags;
int err;
np->dev = ndev;
@@ -1044,18 +614,13 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
}
if (!ndev->npinfo) {
- npinfo = kmalloc(sizeof(*npinfo), gfp);
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
goto out;
}
- npinfo->rx_flags = 0;
- INIT_LIST_HEAD(&npinfo->rx_np);
-
- spin_lock_init(&npinfo->rx_lock);
sema_init(&npinfo->dev_lock, 1);
- skb_queue_head_init(&npinfo->neigh_tx);
skb_queue_head_init(&npinfo->txq);
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
@@ -1063,7 +628,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_setup) {
- err = ops->ndo_netpoll_setup(ndev, npinfo, gfp);
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
if (err)
goto free_npinfo;
}
@@ -1074,13 +639,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
npinfo->netpoll = np;
- if (np->rx_skb_hook) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_flags |= NETPOLL_RX_ENABLED;
- list_add_tail(&np->rx, &npinfo->rx_np);
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
-
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -1202,7 +760,7 @@ int netpoll_setup(struct netpoll *np)
/* fill up the skb queue */
refill_skbs();
- err = __netpoll_setup(np, ndev, GFP_KERNEL);
+ err = __netpoll_setup(np, ndev);
if (err)
goto put;
@@ -1229,7 +787,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
struct netpoll_info *npinfo =
container_of(rcu_head, struct netpoll_info, rcu);
- skb_queue_purge(&npinfo->neigh_tx);
skb_queue_purge(&npinfo->txq);
/* we can't call cancel_delayed_work_sync here, as we are in softirq */
@@ -1245,7 +802,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- unsigned long flags;
/* rtnl_dereference would be preferable here but
* rcu_cleanup_netpoll path can put us in here safely without
@@ -1255,14 +811,6 @@ void __netpoll_cleanup(struct netpoll *np)
if (!npinfo)
return;
- if (!list_empty(&npinfo->rx_np)) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_del(&np->rx);
- if (list_empty(&npinfo->rx_np))
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
-
synchronize_srcu(&netpoll_srcu);
if (atomic_dec_and_test(&npinfo->refcnt)) {
@@ -1272,7 +820,7 @@ void __netpoll_cleanup(struct netpoll *np)
if (ops->ndo_netpoll_cleanup)
ops->ndo_netpoll_cleanup(np->dev);
- rcu_assign_pointer(np->dev->npinfo, NULL);
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
}
}
@@ -1306,18 +854,3 @@ out:
rtnl_unlock();
}
EXPORT_SYMBOL(netpoll_cleanup);
-
-int netpoll_trap(void)
-{
- return atomic_read(&trapped);
-}
-EXPORT_SYMBOL(netpoll_trap);
-
-void netpoll_set_trap(int trap)
-{
- if (trap)
- atomic_inc(&trapped);
- else
- atomic_dec(&trapped);
-}
-EXPORT_SYMBOL(netpoll_set_trap);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd0..2f385b9bccc 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
- struct cgroup_subsys_state *parent_css = css_parent(css);
+ struct cgroup_subsys_state *parent_css = css->parent;
struct net_device *dev;
int ret = 0;
@@ -185,15 +185,15 @@ static int read_priomap(struct seq_file *sf, void *v)
return 0;
}
-static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
char devname[IFNAMSIZ + 1];
struct net_device *dev;
u32 prio;
int ret;
- if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
return -EINVAL;
dev = dev_get_by_name(&init_net, devname);
@@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
rtnl_lock();
- ret = netprio_set_prio(css, dev, prio);
+ ret = netprio_set_prio(of_css(of), dev, prio);
rtnl_unlock();
dev_put(dev);
- return ret;
+ return ret ?: nbytes;
}
static int update_netprio(const void *v, struct file *file, unsigned n)
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
struct task_struct *p;
void *v = (void *)(unsigned long)css->cgroup->id;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
@@ -239,20 +239,17 @@ static struct cftype ss_files[] = {
{
.name = "ifpriomap",
.seq_show = read_priomap,
- .write_string = write_priomap,
+ .write = write_priomap,
},
{ } /* terminate */
};
-struct cgroup_subsys net_prio_subsys = {
- .name = "net_prio",
+struct cgroup_subsys net_prio_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = net_prio_attach,
- .subsys_id = net_prio_subsys_id,
.base_cftypes = ss_files,
- .module = THIS_MODULE,
};
static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
static int __init init_cgroup_netprio(void)
{
- int ret;
-
- ret = cgroup_load_subsys(&net_prio_subsys);
- if (ret)
- goto out;
-
register_netdevice_notifier(&netprio_device_notifier);
-
-out:
- return ret;
-}
-
-static void __exit exit_cgroup_netprio(void)
-{
- struct netprio_map *old;
- struct net_device *dev;
-
- unregister_netdevice_notifier(&netprio_device_notifier);
-
- cgroup_unload_subsys(&net_prio_subsys);
-
- rtnl_lock();
- for_each_netdev(&init_net, dev) {
- old = rtnl_dereference(dev->priomap);
- RCU_INIT_POINTER(dev->priomap, NULL);
- if (old)
- kfree_rcu(old, rcu);
- }
- rtnl_unlock();
+ return 0;
}
-module_init(init_cgroup_netprio);
-module_exit(exit_cgroup_netprio);
+subsys_initcall(init_cgroup_netprio);
MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index fdac61cac1b..fc17a9d309a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -476,23 +476,22 @@ static int pgctrl_show(struct seq_file *seq, void *v)
static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- int err = 0;
char data[128];
struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
- if (!capable(CAP_NET_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (count == 0)
+ return -EINVAL;
if (count > sizeof(data))
count = sizeof(data);
- if (copy_from_user(data, buf, count)) {
- err = -EFAULT;
- goto out;
- }
- data[count - 1] = 0; /* Make string */
+ if (copy_from_user(data, buf, count))
+ return -EFAULT;
+
+ data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
if (!strcmp(data, "stop"))
pktgen_stop_all_threads_ifs(pn);
@@ -506,10 +505,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
else
pr_warning("Unknown command: %s\n", data);
- err = count;
-
-out:
- return err;
+ return count;
}
static int pgctrl_open(struct inode *inode, struct file *file)
@@ -577,7 +573,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
is_zero_ether_addr(pkt_dev->src_mac) ?
pkt_dev->odev->dev_addr : pkt_dev->src_mac);
- seq_printf(seq, "dst_mac: ");
+ seq_puts(seq, "dst_mac: ");
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
@@ -592,7 +588,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->nr_labels) {
unsigned int i;
- seq_printf(seq, " mpls: ");
+ seq_puts(seq, " mpls: ");
for (i = 0; i < pkt_dev->nr_labels; i++)
seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
i == pkt_dev->nr_labels-1 ? "\n" : ", ");
@@ -617,67 +613,67 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
- seq_printf(seq, " Flags: ");
+ seq_puts(seq, " Flags: ");
if (pkt_dev->flags & F_IPV6)
- seq_printf(seq, "IPV6 ");
+ seq_puts(seq, "IPV6 ");
if (pkt_dev->flags & F_IPSRC_RND)
- seq_printf(seq, "IPSRC_RND ");
+ seq_puts(seq, "IPSRC_RND ");
if (pkt_dev->flags & F_IPDST_RND)
- seq_printf(seq, "IPDST_RND ");
+ seq_puts(seq, "IPDST_RND ");
if (pkt_dev->flags & F_TXSIZE_RND)
- seq_printf(seq, "TXSIZE_RND ");
+ seq_puts(seq, "TXSIZE_RND ");
if (pkt_dev->flags & F_UDPSRC_RND)
- seq_printf(seq, "UDPSRC_RND ");
+ seq_puts(seq, "UDPSRC_RND ");
if (pkt_dev->flags & F_UDPDST_RND)
- seq_printf(seq, "UDPDST_RND ");
+ seq_puts(seq, "UDPDST_RND ");
if (pkt_dev->flags & F_UDPCSUM)
- seq_printf(seq, "UDPCSUM ");
+ seq_puts(seq, "UDPCSUM ");
if (pkt_dev->flags & F_MPLS_RND)
- seq_printf(seq, "MPLS_RND ");
+ seq_puts(seq, "MPLS_RND ");
if (pkt_dev->flags & F_QUEUE_MAP_RND)
- seq_printf(seq, "QUEUE_MAP_RND ");
+ seq_puts(seq, "QUEUE_MAP_RND ");
if (pkt_dev->flags & F_QUEUE_MAP_CPU)
- seq_printf(seq, "QUEUE_MAP_CPU ");
+ seq_puts(seq, "QUEUE_MAP_CPU ");
if (pkt_dev->cflows) {
if (pkt_dev->flags & F_FLOW_SEQ)
- seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/
+ seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/
else
- seq_printf(seq, "FLOW_RND ");
+ seq_puts(seq, "FLOW_RND ");
}
#ifdef CONFIG_XFRM
if (pkt_dev->flags & F_IPSEC_ON) {
- seq_printf(seq, "IPSEC ");
+ seq_puts(seq, "IPSEC ");
if (pkt_dev->spi)
seq_printf(seq, "spi:%u", pkt_dev->spi);
}
#endif
if (pkt_dev->flags & F_MACSRC_RND)
- seq_printf(seq, "MACSRC_RND ");
+ seq_puts(seq, "MACSRC_RND ");
if (pkt_dev->flags & F_MACDST_RND)
- seq_printf(seq, "MACDST_RND ");
+ seq_puts(seq, "MACDST_RND ");
if (pkt_dev->flags & F_VID_RND)
- seq_printf(seq, "VID_RND ");
+ seq_puts(seq, "VID_RND ");
if (pkt_dev->flags & F_SVID_RND)
- seq_printf(seq, "SVID_RND ");
+ seq_puts(seq, "SVID_RND ");
if (pkt_dev->flags & F_NODE)
- seq_printf(seq, "NODE_ALLOC ");
+ seq_puts(seq, "NODE_ALLOC ");
seq_puts(seq, "\n");
@@ -720,7 +716,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->result[0])
seq_printf(seq, "Result: %s\n", pkt_dev->result);
else
- seq_printf(seq, "Result: Idle\n");
+ seq_puts(seq, "Result: Idle\n");
return 0;
}
@@ -1251,7 +1247,13 @@ static ssize_t pktgen_if_write(struct file *file,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
f,
"IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n");
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
+ "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
+ "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+#ifdef CONFIG_XFRM
+ "IPSEC, "
+#endif
+ "NODE_ALLOC\n");
return count;
}
sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
@@ -1733,14 +1735,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
BUG_ON(!t);
- seq_printf(seq, "Running: ");
+ seq_puts(seq, "Running: ");
if_lock(t);
list_for_each_entry(pkt_dev, &t->if_list, list)
if (pkt_dev->running)
seq_printf(seq, "%s ", pkt_dev->odevname);
- seq_printf(seq, "\nStopped: ");
+ seq_puts(seq, "\nStopped: ");
list_for_each_entry(pkt_dev, &t->if_list, list)
if (!pkt_dev->running)
@@ -1749,7 +1751,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
if (t->result[0])
seq_printf(seq, "\nResult: %s\n", t->result);
else
- seq_printf(seq, "\nResult: NA\n");
+ seq_puts(seq, "\nResult: NA\n");
if_unlock(t);
@@ -3336,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
queue_map = skb_get_queue_mapping(pkt_dev->skb);
txq = netdev_get_tx_queue(odev, queue_map);
- __netif_tx_lock_bh(txq);
+ local_bh_disable();
- if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
+
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
@@ -3372,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
}
unlock:
- __netif_tx_unlock_bh(txq);
+ HARD_TX_UNLOCK(odev, txq);
+
+ local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 00000000000..d3027a73fd4
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,141 @@
+/* PTP classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x40 ; PTP_CLASS_V2_VLAN
+ * ret a ; return PTP class
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x30 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct sk_filter *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return SK_RUN_FILTER(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+void __init ptp_classifier_init(void)
+{
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 15, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 12, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000040 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000030 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog = {
+ .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
+ };
+
+ BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog));
+}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 4425148d2b5..467f326126e 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -221,5 +221,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
out:
spin_unlock_bh(&fastopenq->lock);
sock_put(lsk);
- return;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 393b1bc9a61..1063996f831 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -353,15 +353,46 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)
}
EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ for_each_net(net) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
/**
* rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
* @ops: struct rtnl_link_ops * to unregister
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
- rtnl_lock();
+ /* Close the race with cleanup_net() */
+ mutex_lock(&net_mutex);
+ rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
+ mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
@@ -374,7 +405,7 @@ static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
if (!master_dev)
return 0;
ops = master_dev->rtnl_link_ops;
- if (!ops->get_slave_size)
+ if (!ops || !ops->get_slave_size)
return 0;
/* IFLA_INFO_SLAVE_DATA + nested data */
return nla_total_size(sizeof(struct nlattr)) +
@@ -767,14 +798,15 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += num_vfs *
(nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
- nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
- nla_total_size(sizeof(struct ifla_vf_spoofchk)));
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)));
return size;
} else
return 0;
}
-static size_t rtnl_port_size(const struct net_device *dev)
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
size_t port_size = nla_total_size(4) /* PORT_VF */
+ nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
@@ -790,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev)
size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ port_size;
- if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
return 0;
if (dev_num_vf(dev->dev.parent))
return port_self_size + vf_ports_size +
@@ -822,10 +855,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
- + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
@@ -887,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
-static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
{
int err;
- if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
return 0;
err = rtnl_port_self_fill(skb, dev);
@@ -970,7 +1006,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
(dev->qdisc &&
nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
(dev->ifalias &&
- nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)))
+ nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_changes)))
goto nla_put_failure;
if (1) {
@@ -1027,6 +1065,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct ifla_vf_info ivi;
struct ifla_vf_mac vf_mac;
struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_link_state vf_linkstate;
@@ -1047,6 +1086,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
break;
vf_mac.vf =
vf_vlan.vf =
+ vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
vf_linkstate.vf = ivi.vf;
@@ -1054,7 +1094,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
- vf_tx_rate.rate = ivi.tx_rate;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
vf_spoofchk.setting = ivi.spoofchk;
vf_linkstate.link_state = ivi.linkstate;
vf = nla_nest_start(skb, IFLA_VF_INFO);
@@ -1064,6 +1106,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
}
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
&vf_tx_rate) ||
nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
@@ -1076,7 +1120,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_nest_end(skb, vfinfo);
}
- if (rtnl_port_fill(skb, dev))
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
goto nla_put_failure;
if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
@@ -1121,56 +1165,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx = 0, s_idx;
- struct net_device *dev;
- struct hlist_head *head;
- struct nlattr *tb[IFLA_MAX+1];
- u32 ext_filter_mask = 0;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- rcu_read_lock();
- cb->seq = net->dev_base_seq;
-
- if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
- ifla_policy) >= 0) {
-
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
- }
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI,
- ext_filter_mask) <= 0)
- goto out;
-
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
- }
- }
-out:
- rcu_read_unlock();
- cb->args[1] = idx;
- cb->args[0] = h;
-
- return skb->len;
-}
-
-const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
@@ -1196,8 +1191,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
};
-EXPORT_SYMBOL(ifla_policy);
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
[IFLA_INFO_KIND] = { .type = NLA_STRING },
@@ -1219,6 +1214,10 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
.len = sizeof(struct ifla_vf_tx_rate) },
[IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY,
.len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_link_state) },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1235,6 +1234,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx = 0, s_idx;
+ struct net_device *dev;
+ struct hlist_head *head;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ int err;
+ int hdrlen;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ rcu_read_lock();
+ cb->seq = net->dev_base_seq;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ NLM_F_MULTI,
+ ext_filter_mask);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err <= 0)
+ goto out;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len)
+{
+ return nla_parse(tb, IFLA_MAX, head, len, ifla_policy);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
struct net *net;
@@ -1316,11 +1387,29 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
}
case IFLA_VF_TX_RATE: {
struct ifla_vf_tx_rate *ivt;
+ struct ifla_vf_info ivf;
ivt = nla_data(vf);
err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_tx_rate)
- err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
- ivt->rate);
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf,
+ &ivf);
+ if (err)
+ break;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ break;
+ }
+ case IFLA_VF_RATE: {
+ struct ifla_vf_rate *ivt;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
break;
}
case IFLA_VF_SPOOFCHK: {
@@ -1386,7 +1475,8 @@ static int do_set_master(struct net_device *dev, int ifindex)
return 0;
}
-static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+static int do_setlink(const struct sk_buff *skb,
+ struct net_device *dev, struct ifinfomsg *ifm,
struct nlattr **tb, char *ifname, int modified)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1398,7 +1488,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
err = PTR_ERR(net);
goto errout;
}
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
goto errout;
}
@@ -1652,7 +1742,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
- err = do_setlink(dev, ifm, tb, ifname, 0);
+ err = do_setlink(skb, dev, ifm, tb, ifname, 0);
errout:
return err;
}
@@ -1692,7 +1782,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
ops->dellink(dev, &list_kill);
unregister_netdevice_many(&list_kill);
- list_del(&list_kill);
return 0;
}
@@ -1769,7 +1858,8 @@ err:
}
EXPORT_SYMBOL(rtnl_create_link);
-static int rtnl_group_changelink(struct net *net, int group,
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, int group,
struct ifinfomsg *ifm,
struct nlattr **tb)
{
@@ -1778,7 +1868,7 @@ static int rtnl_group_changelink(struct net *net, int group,
for_each_netdev(net, dev) {
if (dev->group == group) {
- err = do_setlink(dev, ifm, tb, NULL, 0);
+ err = do_setlink(skb, dev, ifm, tb, NULL, 0);
if (err < 0)
return err;
}
@@ -1920,12 +2010,12 @@ replay:
modified = 1;
}
- return do_setlink(dev, ifm, tb, ifname, modified);
+ return do_setlink(skb, dev, ifm, tb, ifname, modified);
}
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
- return rtnl_group_changelink(net,
+ return rtnl_group_changelink(skb, net,
nla_get_u32(tb[IFLA_GROUP]),
ifm, tb);
return -ENODEV;
@@ -1963,16 +2053,25 @@ replay:
dev->ifindex = ifm->ifi_index;
- if (ops->newlink)
+ if (ops->newlink) {
err = ops->newlink(net, dev, tb, data);
- else
+ /* Drivers should call free_netdev() in ->destructor
+ * and unregister it on failure after registration
+ * so that device could be finally freed in rtnl_unlock.
+ */
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(dev);
+ goto out;
+ }
+ } else {
err = register_netdevice(dev);
-
- if (err < 0) {
- free_netdev(dev);
- goto out;
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
}
-
err = rtnl_configure_link(dev, ifm);
if (err < 0)
unregister_netdevice(dev);
@@ -2037,9 +2136,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
u16 min_ifinfo_dump_size = 0;
+ int hdrlen;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
- if (nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
- ifla_policy) >= 0) {
+ if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
}
@@ -2116,12 +2219,13 @@ EXPORT_SYMBOL(rtmsg_ifinfo);
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
struct net_device *dev,
u8 *addr, u32 pid, u32 seq,
- int type, unsigned int flags)
+ int type, unsigned int flags,
+ int nlflags)
{
struct nlmsghdr *nlh;
struct ndmsg *ndm;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
if (!nlh)
return -EMSGSIZE;
@@ -2159,7 +2263,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)
if (!skb)
goto errout;
- err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF);
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -2306,7 +2410,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
int err = -EINVAL;
__u8 *addr;
- if (!capable(CAP_NET_ADMIN))
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2384,7 +2488,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
portid, seq,
- RTM_NEWNEIGH, NTF_SELF);
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI);
if (err < 0)
return err;
skip:
@@ -2757,7 +2862,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
sz_idx = type>>2;
kind = type&3;
- if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 897da56f3af..ba71212f025 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -85,31 +85,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-__u32 secure_ip_id(__be32 daddr)
-{
- u32 hash[MD5_DIGEST_WORDS];
-
- net_secret_init();
- hash[0] = (__force __u32) daddr;
- hash[1] = net_secret[13];
- hash[2] = net_secret[14];
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- return hash[0];
-}
-
-__u32 secure_ipv6_id(const __be32 daddr[4])
-{
- __u32 hash[4];
-
- net_secret_init();
- memcpy(hash, daddr, 16);
- md5_transform(hash, net_secret);
-
- return hash[0];
-}
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5976ef0846b..c1a33033cbe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -689,12 +689,15 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->ooo_okay = old->ooo_okay;
new->no_fcs = old->no_fcs;
new->encapsulation = old->encapsulation;
+ new->encap_hdr_csum = old->encap_hdr_csum;
+ new->csum_valid = old->csum_valid;
+ new->csum_complete_sw = old->csum_complete_sw;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
memcpy(new->cb, old->cb, sizeof(old->cb));
new->csum = old->csum;
- new->local_df = old->local_df;
+ new->ignore_df = old->ignore_df;
new->pkt_type = old->pkt_type;
new->ip_summed = old->ip_summed;
skb_copy_queue_mapping(new, old);
@@ -707,9 +710,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->mark = old->mark;
new->skb_iif = old->skb_iif;
__nf_copy(new, old);
-#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
- new->nf_trace = old->nf_trace;
-#endif
#ifdef CONFIG_NET_SCHED
new->tc_index = old->tc_index;
#ifdef CONFIG_NET_CLS_ACT
@@ -954,10 +954,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
EXPORT_SYMBOL(skb_copy);
/**
- * __pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
* @skb: buffer to copy
* @headroom: headroom of new skb
* @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
*
* Make a copy of both an &sk_buff and part of its data, located
* in header. Fragmented data remain shared. This is used when
@@ -967,11 +970,12 @@ EXPORT_SYMBOL(skb_copy);
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
{
unsigned int size = skb_headlen(skb) + headroom;
- struct sk_buff *n = __alloc_skb(size, gfp_mask,
- skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
if (!n)
goto out;
@@ -1011,7 +1015,7 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
out:
return n;
}
-EXPORT_SYMBOL(__pskb_copy);
+EXPORT_SYMBOL(__pskb_copy_fclone);
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -2130,25 +2134,31 @@ EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
*
* The `hlen` as calculated by skb_zerocopy_headlen() specifies the
* headroom in the `to` buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
*/
-void
-skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
int i, j = 0;
int plen = 0; /* length of skb->head fragment */
+ int ret;
struct page *page;
unsigned int offset;
BUG_ON(!from->head_frag && !hlen);
/* dont bother with small payloads */
- if (len <= skb_tailroom(to)) {
- skb_copy_bits(from, 0, skb_put(to, len), len);
- return;
- }
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
if (hlen) {
- skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
len -= hlen;
} else {
plen = min_t(int, skb_headlen(from), len);
@@ -2166,6 +2176,11 @@ skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
to->len += len + plen;
to->data_len += len + plen;
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
if (!len)
break;
@@ -2176,6 +2191,8 @@ skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
j++;
}
skb_shinfo(to)->nr_frags = j;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);
@@ -2841,81 +2858,87 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
/**
* skb_segment - Perform protocol segmentation on skb.
- * @skb: buffer to segment
+ * @head_skb: buffer to segment
* @features: features for the output path (see dev->features)
*
* This function performs segmentation on the given skb. It returns
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+ netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
- struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
- skb_frag_t *skb_frag = skb_shinfo(skb)->frags;
- unsigned int mss = skb_shinfo(skb)->gso_size;
- unsigned int doffset = skb->data - skb_mac_header(skb);
+ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+ skb_frag_t *frag = skb_shinfo(head_skb)->frags;
+ unsigned int mss = skb_shinfo(head_skb)->gso_size;
+ unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
+ struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
- unsigned int tnl_hlen = skb_tnl_header_len(skb);
+ unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int headroom;
unsigned int len;
__be16 proto;
bool csum;
int sg = !!(features & NETIF_F_SG);
- int nfrags = skb_shinfo(skb)->nr_frags;
+ int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
+ int dummy;
- proto = skb_network_protocol(skb);
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, &dummy);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
- csum = !!can_checksum_protocol(features, proto);
- __skb_push(skb, doffset);
- headroom = skb_headroom(skb);
- pos = skb_headlen(skb);
+ csum = !head_skb->encap_hdr_csum &&
+ !!can_checksum_protocol(features, proto);
+
+ headroom = skb_headroom(head_skb);
+ pos = skb_headlen(head_skb);
do {
struct sk_buff *nskb;
- skb_frag_t *frag;
+ skb_frag_t *nskb_frag;
int hsize;
int size;
- len = skb->len - offset;
+ len = head_skb->len - offset;
if (len > mss)
len = mss;
- hsize = skb_headlen(skb) - offset;
+ hsize = skb_headlen(head_skb) - offset;
if (hsize < 0)
hsize = 0;
if (hsize > len || !sg)
hsize = len;
- if (!hsize && i >= nfrags && skb_headlen(fskb) &&
- (skb_headlen(fskb) == len || sg)) {
- BUG_ON(skb_headlen(fskb) > len);
+ if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ (skb_headlen(list_skb) == len || sg)) {
+ BUG_ON(skb_headlen(list_skb) > len);
i = 0;
- nfrags = skb_shinfo(fskb)->nr_frags;
- skb_frag = skb_shinfo(fskb)->frags;
- pos += skb_headlen(fskb);
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ pos += skb_headlen(list_skb);
while (pos < offset + len) {
BUG_ON(i >= nfrags);
- size = skb_frag_size(skb_frag);
+ size = skb_frag_size(frag);
if (pos + size > offset + len)
break;
i++;
pos += size;
- skb_frag++;
+ frag++;
}
- nskb = skb_clone(fskb, GFP_ATOMIC);
- fskb = fskb->next;
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ list_skb = list_skb->next;
if (unlikely(!nskb))
goto err;
@@ -2936,7 +2959,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
__skb_push(nskb, doffset);
} else {
nskb = __alloc_skb(hsize + doffset + headroom,
- GFP_ATOMIC, skb_alloc_rx_flag(skb),
+ GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
if (unlikely(!nskb))
@@ -2952,12 +2975,12 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
segs = nskb;
tail = nskb;
- __copy_skb_header(nskb, skb);
- nskb->mac_len = skb->mac_len;
+ __copy_skb_header(nskb, head_skb);
+ nskb->mac_len = head_skb->mac_len;
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
- skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen,
doffset + tnl_hlen);
@@ -2966,30 +2989,34 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
if (!sg) {
nskb->ip_summed = CHECKSUM_NONE;
- nskb->csum = skb_copy_and_csum_bits(skb, offset,
+ nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb, len),
len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
continue;
}
- frag = skb_shinfo(nskb)->frags;
+ nskb_frag = skb_shinfo(nskb)->frags;
- skb_copy_from_linear_data_offset(skb, offset,
+ skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
+ skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
while (pos < offset + len) {
if (i >= nfrags) {
- BUG_ON(skb_headlen(fskb));
+ BUG_ON(skb_headlen(list_skb));
i = 0;
- nfrags = skb_shinfo(fskb)->nr_frags;
- skb_frag = skb_shinfo(fskb)->frags;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
BUG_ON(!nfrags);
- fskb = fskb->next;
+ list_skb = list_skb->next;
}
if (unlikely(skb_shinfo(nskb)->nr_frags >=
@@ -3000,27 +3027,30 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
goto err;
}
- *frag = *skb_frag;
- __skb_frag_ref(frag);
- size = skb_frag_size(frag);
+ if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
+ goto err;
+
+ *nskb_frag = *frag;
+ __skb_frag_ref(nskb_frag);
+ size = skb_frag_size(nskb_frag);
if (pos < offset) {
- frag->page_offset += offset - pos;
- skb_frag_size_sub(frag, offset - pos);
+ nskb_frag->page_offset += offset - pos;
+ skb_frag_size_sub(nskb_frag, offset - pos);
}
skb_shinfo(nskb)->nr_frags++;
if (pos + size <= offset + len) {
i++;
- skb_frag++;
+ frag++;
pos += size;
} else {
- skb_frag_size_sub(frag, pos + size - (offset + len));
+ skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
goto skip_fraglist;
}
- frag++;
+ nskb_frag++;
}
skip_fraglist:
@@ -3033,8 +3063,10 @@ perform_csum_check:
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
}
- } while ((offset += len) < skb->len);
+ } while ((offset += len) < head_skb->len);
return segs;
@@ -3057,7 +3089,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
if (unlikely(p->len + len >= 65536))
return -E2BIG;
- lp = NAPI_GRO_CB(p)->last ?: p;
+ lp = NAPI_GRO_CB(p)->last;
pinfo = skb_shinfo(lp);
if (headlen <= offset) {
@@ -3173,7 +3205,7 @@ merge:
__skb_pull(skb, offset);
- if (!NAPI_GRO_CB(p)->last)
+ if (NAPI_GRO_CB(p)->last == p)
skb_shinfo(p)->frag_list = skb;
else
NAPI_GRO_CB(p)->last->next = skb;
@@ -3281,6 +3313,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
return elt;
}
+/* Compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
+ * given sglist without marking the sg which contains the last skb data as
+ * the end, so the caller can manipulate the sg list at will when adding new
+ * data after the first call without calling sg_unmark_end to extend the list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
+ * is preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
+
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
int nsg = __skb_to_sgvec(skb, sg, offset, len);
@@ -3413,8 +3471,6 @@ static void sock_rmem_free(struct sk_buff *skb)
*/
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
- int len = skb->len;
-
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned int)sk->sk_rcvbuf)
return -ENOMEM;
@@ -3429,7 +3485,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -3543,15 +3599,47 @@ static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
return 0;
}
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ switch (proto) {
+ int err;
+
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
/* This value should be large enough to cover a tagged ethernet header plus
* maximally sized IP and TCP or UDP headers.
*/
#define MAX_IP_HDR_LEN 128
-static int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate)
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
unsigned int off;
bool fragment;
+ __sum16 *csum;
int err;
fragment = false;
@@ -3572,51 +3660,15 @@ static int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate)
if (fragment)
goto out;
- switch (ip_hdr(skb)->protocol) {
- case IPPROTO_TCP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct tcphdr),
- MAX_IP_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct tcphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- tcp_hdr(skb)->check =
- ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_TCP, 0);
- break;
- case IPPROTO_UDP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct udphdr),
- MAX_IP_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct udphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- udp_hdr(skb)->check =
- ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_UDP, 0);
- break;
- default:
- goto out;
- }
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
err = 0;
out:
@@ -3639,6 +3691,7 @@ static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
unsigned int len;
bool fragment;
bool done;
+ __sum16 *csum;
fragment = false;
done = false;
@@ -3716,51 +3769,14 @@ static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
if (!done || fragment)
goto out;
- switch (nexthdr) {
- case IPPROTO_TCP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct tcphdr),
- MAX_IPV6_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct tcphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- tcp_hdr(skb)->check =
- ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_TCP, 0);
- break;
- case IPPROTO_UDP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct udphdr),
- MAX_IPV6_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct udphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- udp_hdr(skb)->check =
- ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_UDP, 0);
- break;
- default:
- goto out;
- }
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
err = 0;
out:
@@ -3778,7 +3794,7 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
switch (skb->protocol) {
case htons(ETH_P_IP):
- err = skb_checksum_setup_ip(skb, recalculate);
+ err = skb_checksum_setup_ipv4(skb, recalculate);
break;
case htons(ETH_P_IPV6):
@@ -3910,7 +3926,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
- skb->local_df = 0;
+ skb->ignore_df = 0;
skb_dst_drop(skb);
skb->mark = 0;
secpath_reset(skb);
@@ -3932,12 +3948,14 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
- unsigned int hdr_len;
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- hdr_len = tcp_hdrlen(skb);
- else
- hdr_len = sizeof(struct udphdr);
- return hdr_len + shinfo->gso_size;
+ return tcp_hdrlen(skb) + shinfo->gso_size;
+
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return shinfo->gso_size;
}
EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
diff --git a/net/core/sock.c b/net/core/sock.c
index 0c127dcdf6a..026e01f7027 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,55 @@
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap in
+ * the user namespace @user_ns when the socket was created, and if the
+ * current process has it as well.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap in
+ * all user namespaces when the socket was created, and if the current
+ * process has it as well.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap over
+ * the network namespace the socket is a member of when the socket was
+ * created, and if the current process has it as well.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
+
+
#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
@@ -428,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
@@ -735,7 +784,7 @@ set_rcvbuf:
break;
case SO_NO_CHECK:
- sk->sk_no_check = valbool;
+ sk->sk_no_check_tx = valbool;
break;
case SO_PRIORITY:
@@ -1015,7 +1064,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_NO_CHECK:
- v.val = sk->sk_no_check;
+ v.val = sk->sk_no_check_tx;
break;
case SO_PRIORITY:
@@ -1775,7 +1824,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
while (order) {
if (npages >= 1 << order) {
page = alloc_pages(sk->sk_allocation |
- __GFP_COMP | __GFP_NOWARN,
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
order);
if (page)
goto fill_page;
@@ -1845,7 +1896,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
gfp_t gfp = prio;
if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN;
+ gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
pfrag->page = alloc_pages(gfp, order);
if (likely(pfrag->page)) {
pfrag->offset = 0;
@@ -2194,7 +2245,7 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
@@ -2355,10 +2406,13 @@ void release_sock(struct sock *sk)
if (sk->sk_backlog.tail)
__release_sock(sk);
+ /* Warning: release_cb() might need to release sk ownership,
+ * i.e. call sock_release_ownership(sk) before us.
+ */
if (sk->sk_prot->release_cb)
sk->sk_prot->release_cb(sk);
- sk->sk_lock.owned = 0;
+ sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
wake_up(&sk->sk_lock.wq);
spin_unlock_bh(&sk->sk_lock.slock);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a0e9cf6379d..a4216a4c957 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -49,38 +49,35 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
}
EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
-int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk,
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
struct sk_buff *skb, int attrtype)
{
- struct nlattr *attr;
+ struct sock_fprog_kern *fprog;
struct sk_filter *filter;
- unsigned int len;
+ struct nlattr *attr;
+ unsigned int flen;
int err = 0;
- if (!ns_capable(user_ns, CAP_NET_ADMIN)) {
+ if (!may_report_filterinfo) {
nla_reserve(skb, attrtype, 0);
return 0;
}
rcu_read_lock();
-
filter = rcu_dereference(sk->sk_filter);
- len = filter ? filter->len * sizeof(struct sock_filter) : 0;
+ if (!filter)
+ goto out;
- attr = nla_reserve(skb, attrtype, len);
+ fprog = filter->orig_prog;
+ flen = sk_filter_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
if (attr == NULL) {
err = -EMSGSIZE;
goto out;
}
- if (filter) {
- struct sock_filter *fb = (struct sock_filter *)nla_data(attr);
- int i;
-
- for (i = 0; i < filter->len; i++, fb++)
- sk_decode_filter(&filter->insns[i], fb);
- }
-
+ memcpy(nla_data(attr), fprog->filter, flen);
out:
rcu_read_unlock();
return err;
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 661b5a40ec1..6521dfd8b7c 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -23,16 +23,11 @@
#include <linux/skbuff.h>
#include <linux/export.h>
-static struct sock_filter ptp_filter[] = {
- PTP_FILTER
-};
-
static unsigned int classify(const struct sk_buff *skb)
{
- if (likely(skb->dev &&
- skb->dev->phydev &&
+ if (likely(skb->dev && skb->dev->phydev &&
skb->dev->phydev->drv))
- return sk_run_filter(skb, ptp_filter);
+ return ptp_classify_raw(skb);
else
return PTP_CLASS_NONE;
}
@@ -60,11 +55,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
if (likely(phydev->drv->txtstamp)) {
if (!atomic_inc_not_zero(&sk->sk_refcnt))
return;
+
clone = skb_clone(skb, GFP_ATOMIC);
if (!clone) {
sock_put(sk);
return;
}
+
clone->sk = sk;
phydev->drv->txtstamp(phydev, clone, type);
}
@@ -89,12 +86,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
*skb_hwtstamps(skb) = *hwtstamps;
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
skb->sk = NULL;
+
err = sock_queue_err_skb(sk, skb);
+
sock_put(sk);
if (err)
kfree_skb(skb);
@@ -132,8 +132,3 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
return false;
}
EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
-
-void __init skb_timestamping_init(void)
-{
- BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));
-}
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 00000000000..8c3203c585b
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,77 @@
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/tso.h>
+
+/* Calculate expected number of TX descriptors */
+int tso_count_descs(struct sk_buff *skb)
+{
+ /* The Marvell Way: two per GSO segment plus one per page fragment */
+ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+EXPORT_SYMBOL(tso_count_descs);
+
+void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ iph = (struct iphdr *)(hdr + mac_hdr_len);
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
+ tcph->seq = htonl(tso->tcp_seq);
+ tso->ip_id++;
+
+ if (!is_last) {
+ /* Clear all special flags on every packet but the last */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size;
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next fragment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+void tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->next_frag_idx = 0;
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next fragment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_start);
diff --git a/net/core/utils.c b/net/core/utils.c
index 2f737bf90b3..eed34338736 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -348,8 +348,8 @@ static void __net_random_once_deferred(struct work_struct *w)
{
struct __net_random_once_work *work =
container_of(w, struct __net_random_once_work, work);
- if (!static_key_enabled(work->key))
- static_key_slow_inc(work->key);
+ BUG_ON(!static_key_enabled(work->key));
+ static_key_slow_dec(work->key);
kfree(work);
}
@@ -367,7 +367,7 @@ static void __net_random_once_disable_jump(struct static_key *key)
}
bool __net_get_random_once(void *buf, int nbytes, bool *done,
- struct static_key *done_key)
+ struct static_key *once_key)
{
static DEFINE_SPINLOCK(lock);
unsigned long flags;
@@ -382,7 +382,7 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done,
*done = true;
spin_unlock_irqrestore(&lock, flags);
- __net_random_once_disable_jump(done_key);
+ __net_random_once_disable_jump(once_key);
return true;
}