Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile  9
-rw-r--r--  net/core/datagram.c  181
-rw-r--r--  net/core/dev.c  4819
-rw-r--r--  net/core/dev_addr_lists.c  586
-rw-r--r--  net/core/dev_ioctl.c  567
-rw-r--r--  net/core/drop_monitor.c  200
-rw-r--r--  net/core/dst.c  174
-rw-r--r--  net/core/ethtool.c  1553
-rw-r--r--  net/core/fib_rules.c  112
-rw-r--r--  net/core/filter.c  1872
-rw-r--r--  net/core/flow.c  267
-rw-r--r--  net/core/flow_dissector.c  405
-rw-r--r--  net/core/gen_estimator.c  22
-rw-r--r--  net/core/gen_stats.c  25
-rw-r--r--  net/core/iovec.c  134
-rw-r--r--  net/core/kmap_skb.h  19
-rw-r--r--  net/core/link_watch.c  43
-rw-r--r--  net/core/neighbour.c  1161
-rw-r--r--  net/core/net-procfs.c  423
-rw-r--r--  net/core/net-sysfs.c  867
-rw-r--r--  net/core/net-sysfs.h  2
-rw-r--r--  net/core/net-traces.c  3
-rw-r--r--  net/core/net_namespace.c  213
-rw-r--r--  net/core/netclassid_cgroup.c  111
-rw-r--r--  net/core/netevent.c  1
-rw-r--r--  net/core/netpoll.c  825
-rw-r--r--  net/core/netprio_cgroup.c  288
-rw-r--r--  net/core/pktgen.c  1252
-rw-r--r--  net/core/ptp_classifier.c  141
-rw-r--r--  net/core/request_sock.c  103
-rw-r--r--  net/core/rtnetlink.c  1659
-rw-r--r--  net/core/scm.c  85
-rw-r--r--  net/core/secure_seq.c  173
-rw-r--r--  net/core/skbuff.c  1966
-rw-r--r--  net/core/sock.c  976
-rw-r--r--  net/core/sock_diag.c  231
-rw-r--r--  net/core/stream.c  4
-rw-r--r--  net/core/sysctl_net_core.c  227
-rw-r--r--  net/core/timestamping.c  40
-rw-r--r--  net/core/tso.c  77
-rw-r--r--  net/core/user_dma.c  7
-rw-r--r--  net/core/utils.c  105
42 files changed, 14825 insertions, 7103 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 8a04dd22cf7..71093d94ad2 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,15 +3,17 @@
#
obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o net_namespace.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
- neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o dev_ioctl.o tso.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
+obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_NET_DMA) += user_dma.o
@@ -19,3 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
+obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
+obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 18ac112ea7a..488dd1a825c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -37,7 +37,6 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
@@ -49,6 +48,7 @@
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
+#include <linux/pagemap.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -57,6 +57,7 @@
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
+#include <net/busy_poll.h>
/*
* Is a socket 'connection oriented' ?
@@ -66,7 +67,7 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
-static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
+static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
void *key)
{
unsigned long bits = (unsigned long)key;
@@ -79,9 +80,10 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
return autoremove_wake_function(wait, mode, sync, key);
}
/*
- * Wait for a packet..
+ * Wait for the last received packet to be different from skb
*/
-static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
+static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+ const struct sk_buff *skb)
{
int error;
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
@@ -93,7 +95,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
if (error)
goto out_err;
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (sk->sk_receive_queue.prev != skb)
goto out;
/* Socket shut down? */
@@ -133,6 +135,8 @@ out_noerr:
* @sk: socket
* @flags: MSG_ flags
* @peeked: returns non-zero if this packet has been seen before
+ * @off: an offset in bytes to peek skb from. Returns an offset
+ * within an skb where data actually starts
* @err: error code returned
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
@@ -157,10 +161,10 @@ out_noerr:
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
- int *peeked, int *err)
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
@@ -180,28 +184,41 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
* However, this function was correct in any case. 8)
*/
unsigned long cpu_flags;
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
+ int _off = *off;
- spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb) {
+ last = (struct sk_buff *)queue;
+ spin_lock_irqsave(&queue->lock, cpu_flags);
+ skb_queue_walk(queue, skb) {
+ last = skb;
*peeked = skb->peeked;
if (flags & MSG_PEEK) {
+ if (_off >= skb->len && (skb->len || _off ||
+ skb->peeked)) {
+ _off -= skb->len;
+ continue;
+ }
skb->peeked = 1;
atomic_inc(&skb->users);
} else
- __skb_unlink(skb, &sk->sk_receive_queue);
- }
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+ __skb_unlink(skb, queue);
- if (skb)
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ *off = _off;
return skb;
+ }
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+
+ if (sk_can_busy_loop(sk) &&
+ sk_busy_loop(sk, flags & MSG_DONTWAIT))
+ continue;
/* User doesn't want to wait */
error = -EAGAIN;
if (!timeo)
goto no_packet;
- } while (!wait_for_packet(sk, err, &timeo));
+ } while (!wait_for_more_packets(sk, err, &timeo, last));
return NULL;
@@ -211,13 +228,13 @@ no_packet:
}
EXPORT_SYMBOL(__skb_recv_datagram);
-struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int noblock, int *err)
{
- int peeked;
+ int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, err);
+ &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
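A minimal sketch of how a protocol's recvmsg path might drive the new peek-with-offset interface above; my_proto_recvmsg() is hypothetical and the sk_peek_offset() bookkeeping is simplified (the in-tree caller is udp_recvmsg()):

	/* Hedged sketch, not part of this patch. */
	static int my_proto_recvmsg(struct sock *sk, struct msghdr *msg,
				    size_t len, int noblock, int flags)
	{
		int peeked, off = sk_peek_offset(sk, flags);
		struct sk_buff *skb;
		int err, copied;

		skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
					  &peeked, &off, &err);
		if (!skb)
			return err;

		/* 'off' now points at the first byte to read inside 'skb' */
		copied = min_t(int, len, skb->len - off);
		err = skb_copy_datagram_iovec(skb, off, msg->msg_iov, copied);
		skb_free_datagram(sk, skb);
		return err ? err : copied;
	}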
@@ -243,7 +260,6 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
unlock_sock_fast(sk, slow);
/* skb is now orphaned, can be freed outside of locked section */
- trace_kfree_skb(skb, skb_free_datagram_locked);
__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);
@@ -324,15 +340,15 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -410,15 +426,15 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -500,15 +516,15 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -558,6 +574,77 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+/**
+ * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
+ * @skb: buffer to copy
+ * @from: io vector to copy from
+ * @offset: offset in the io vector to start copying from
+ * @count: amount of vectors to copy to buffer from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ * Note: the iovec is not modified during the copy
+ */
+int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+ int offset, size_t count)
+{
+ int len = iov_length(from, count) - offset;
+ int copy = min_t(int, skb_headlen(skb), len);
+ int size;
+ int i = 0;
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
+ return -EFAULT;
+
+ if (len == copy)
+ return 0;
+
+ offset += copy;
+ while (count--) {
+ struct page *page[MAX_SKB_FRAGS];
+ int num_pages;
+ unsigned long base;
+ unsigned long truesize;
+
+ /* Skip over from offset and copied */
+ if (offset >= from->iov_len) {
+ offset -= from->iov_len;
+ ++from;
+ continue;
+ }
+ len = from->iov_len - offset;
+ base = (unsigned long)from->iov_base + offset;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (i + size > MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+ num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+ if (num_pages != size) {
+ release_pages(&page[i], num_pages, 0);
+ return -EFAULT;
+ }
+ truesize = size * PAGE_SIZE;
+ skb->data_len += len;
+ skb->len += len;
+ skb->truesize += truesize;
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+ while (len) {
+ int off = base & ~PAGE_MASK;
+ int size = min_t(int, len, PAGE_SIZE - off);
+ skb_fill_page_desc(skb, i, page[i], off, size);
+ base += size;
+ len -= size;
+ i++;
+ }
+ offset = 0;
+ ++from;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iovec);
+
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
u8 __user *to, int len,
__wsum *csump)
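For context, a sketch of how a transmit path (in the spirit of tun/macvtap) might use the new zerocopy_sg_from_iovec() helper once it owns an skb with a suitably sized linear area; my_fill_zerocopy() and hdr_skip are illustrative, not taken from this patch:

	/* Hedged sketch: hdr_skip is the part of the iovec already consumed
	 * (e.g. a virtio-net style header read separately).
	 */
	static int my_fill_zerocopy(struct sock *sk, struct sk_buff *skb,
				    const struct iovec *iov, size_t count,
				    size_t hdr_skip)
	{
		int err;

		skb->sk = sk;	/* the helper charges sk->sk_wmem_alloc */

		/* copy into the linear area, then pin user pages as frags */
		err = zerocopy_sg_from_iovec(skb, iov, hdr_skip, count);
		if (err)
			return err;	/* -EFAULT or -EMSGSIZE */

		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
		return 0;
	}

Real users additionally attach a ubuf_info completion callback so the pinned pages can be released once the device is done transmitting.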
@@ -585,16 +672,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
__wsum csum2;
int err = 0;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -653,17 +740,37 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
+ skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
- return __skb_checksum_complete_head(skb, skb->len);
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+
+ return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
@@ -745,7 +852,9 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
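The datagram_poll() change above reports POLLPRI for a pending error-queue entry when the socket has opted in via SOCK_SELECT_ERR_QUEUE. A user-space sketch of the intended pattern, assuming the corresponding SO_SELECT_ERR_QUEUE socket option is available (useful e.g. for hardware TX timestamps):

	/* Hedged user-space sketch, not part of this patch. */
	#include <poll.h>
	#include <sys/socket.h>

	static int wait_errqueue(int fd)
	{
		int one = 1;
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };
		char buf[2048];

		if (setsockopt(fd, SOL_SOCKET, SO_SELECT_ERR_QUEUE,
			       &one, sizeof(one)))
			return -1;

		if (poll(&pfd, 1, -1) < 0)
			return -1;

		if (pfd.revents & POLLPRI)	/* error queue is non-empty */
			return recv(fd, buf, sizeof(buf),
				    MSG_ERRQUEUE | MSG_DONTWAIT);
		return 0;
	}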
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a023bb..367a586d0c8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -73,7 +73,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -98,8 +97,6 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
@@ -107,12 +104,10 @@
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
-#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
-#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
@@ -132,6 +127,11 @@
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
+#include <linux/static_key.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/if_macvlan.h>
#include "net-sysfs.h"
@@ -141,40 +141,16 @@
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
-/*
- * The list of packet types we will receive (as opposed to discard)
- * and the routines to invoke.
- *
- * Why 16. Because with 16 the only overlap we get on a hash of the
- * low nibble of the protocol value is RARP/SNAP/X.25.
- *
- * NOTE: That is no longer true with the addition of VLAN tags. Not
- * sure which should go first, but I bet it won't make much
- * difference if we are running VLANs. The good news is that
- * this protocol won't be in the list unless compiled in, so
- * the average user (w/out VLANs) will not be adversely affected.
- * --BLG
- *
- * 0800 IP
- * 8100 802.1Q VLAN
- * 0001 802.3
- * 0002 AX.25
- * 0004 802.2
- * 8035 RARP
- * 0005 SNAP
- * 0805 X.25
- * 0806 ARP
- * 8137 IPX
- * 0009 Localtalk
- * 86DD IPv6
- */
-
-#define PTYPE_HASH_SIZE (16)
-#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
-
static DEFINE_SPINLOCK(ptype_lock);
-static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-static struct list_head ptype_all __read_mostly; /* Taps */
+static DEFINE_SPINLOCK(offload_lock);
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+struct list_head ptype_all __read_mostly; /* Taps */
+static struct list_head offload_base __read_mostly;
+
+static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -198,9 +174,23 @@ static struct list_head ptype_all __read_mostly; /* Taps */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
+
+static unsigned int napi_gen_id;
+static DEFINE_HASHTABLE(napi_hash, 8);
+
+static seqcount_t devnet_rename_seq;
+
+static inline void dev_base_seq_inc(struct net *net)
+{
+ while (++net->dev_base_seq == 0);
+}
+
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+ unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}
@@ -224,7 +214,7 @@ static inline void rps_unlock(struct softnet_data *sd)
}
/* Device list insertion */
-static int list_netdevice(struct net_device *dev)
+static void list_netdevice(struct net_device *dev)
{
struct net *net = dev_net(dev);
@@ -236,7 +226,8 @@ static int list_netdevice(struct net_device *dev)
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock_bh(&dev_base_lock);
- return 0;
+
+ dev_base_seq_inc(net);
}
/* Device list removal
@@ -252,6 +243,8 @@ static void unlist_netdevice(struct net_device *dev)
hlist_del_rcu(&dev->name_hlist);
hlist_del_rcu(&dev->index_hlist);
write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(dev_net(dev));
}
/*
@@ -286,10 +279,9 @@ static const unsigned short netdev_lock_type[] =
ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
- ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
- ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
- ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
- ARPHRD_VOID, ARPHRD_NONE};
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
static const char *const netdev_lock_name[] =
{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
@@ -304,10 +296,9 @@ static const char *const netdev_lock_name[] =
"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
- "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
- "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
- "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
- "_xmit_VOID", "_xmit_NONE"};
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
@@ -432,7 +423,7 @@ void __dev_remove_pack(struct packet_type *pt)
}
}
- printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
+ pr_warn("dev_remove_pack: %p not found\n", pt);
out:
spin_unlock(&ptype_lock);
}
@@ -458,6 +449,81 @@ void dev_remove_pack(struct packet_type *pt)
}
EXPORT_SYMBOL(dev_remove_pack);
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+
+ spin_lock(&offload_lock);
+ list_add_rcu(&po->list, head);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
/******************************************************************************
Device Boot-time Settings Routines
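The dev_add_offload()/dev_remove_offload() pair added above splits GRO/GSO handling out of struct packet_type. A sketch of how a protocol module might register an offload; the myproto_* callbacks are hypothetical and the exact offload_callbacks members can differ between kernel versions:

	/* Hedged sketch, not part of this patch. */
	static struct packet_offload myproto_offload __read_mostly = {
		.type = cpu_to_be16(0x88B5),	/* experimental EtherType */
		.callbacks = {
			.gso_segment  = myproto_gso_segment,
			.gro_receive  = myproto_gro_receive,
			.gro_complete = myproto_gro_complete,
		},
	};

	static int __init myproto_offload_init(void)
	{
		dev_add_offload(&myproto_offload);
		return 0;
	}

	static void __exit myproto_offload_exit(void)
	{
		/* waits for a grace period, so the struct may go away after */
		dev_remove_offload(&myproto_offload);
	}

	module_init(myproto_offload_init);
	module_exit(myproto_offload_exit);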
@@ -603,11 +669,10 @@ __setup("netdev=", netdev_boot_setup);
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
- struct hlist_node *p;
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name);
- hlist_for_each_entry(dev, p, head, name_hlist)
+ hlist_for_each_entry(dev, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
@@ -629,11 +694,10 @@ EXPORT_SYMBOL(__dev_get_by_name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
- struct hlist_node *p;
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name);
- hlist_for_each_entry_rcu(dev, p, head, name_hlist)
+ hlist_for_each_entry_rcu(dev, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
@@ -680,11 +744,10 @@ EXPORT_SYMBOL(dev_get_by_name);
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
- struct hlist_node *p;
struct net_device *dev;
struct hlist_head *head = dev_index_hash(net, ifindex);
- hlist_for_each_entry(dev, p, head, index_hlist)
+ hlist_for_each_entry(dev, head, index_hlist)
if (dev->ifindex == ifindex)
return dev;
@@ -705,11 +768,10 @@ EXPORT_SYMBOL(__dev_get_by_index);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
- struct hlist_node *p;
struct net_device *dev;
struct hlist_head *head = dev_index_hash(net, ifindex);
- hlist_for_each_entry_rcu(dev, p, head, index_hlist)
+ hlist_for_each_entry_rcu(dev, head, index_hlist)
if (dev->ifindex == ifindex)
return dev;
@@ -743,13 +805,48 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
EXPORT_SYMBOL(dev_get_by_index);
/**
+ * netdev_get_name - get a netdevice name, knowing its ifindex.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @ifindex: the ifindex of the interface to get the name from.
+ *
+ * The use of raw_seqcount_begin() and cond_resched() before
+ * retrying is required as we want to give the writers a chance
+ * to complete when CONFIG_PREEMPT is not set.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+ struct net_device *dev;
+ unsigned int seq;
+
+retry:
+ seq = raw_seqcount_begin(&devnet_rename_seq);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ strcpy(name, dev->name);
+ rcu_read_unlock();
+ if (read_seqcount_retry(&devnet_rename_seq, seq)) {
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
* @type: media type of device
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
- * is not found or a pointer to the device. The caller must hold RCU
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -833,21 +930,21 @@ EXPORT_SYMBOL(dev_get_by_flags_rcu);
* to allow sysfs to work. We also disallow any kind of
* whitespace.
*/
-int dev_valid_name(const char *name)
+bool dev_valid_name(const char *name)
{
if (*name == '\0')
- return 0;
+ return false;
if (strlen(name) >= IFNAMSIZ)
- return 0;
+ return false;
if (!strcmp(name, ".") || !strcmp(name, ".."))
- return 0;
+ return false;
while (*name) {
if (*name == '/' || isspace(*name))
- return 0;
+ return false;
name++;
}
- return 1;
+ return true;
}
EXPORT_SYMBOL(dev_valid_name);
@@ -946,18 +1043,30 @@ int dev_alloc_name(struct net_device *dev, const char *name)
}
EXPORT_SYMBOL(dev_alloc_name);
-static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
+static int dev_alloc_name_ns(struct net *net,
+ struct net_device *dev,
+ const char *name)
{
- struct net *net;
+ char buf[IFNAMSIZ];
+ int ret;
- BUG_ON(!dev_net(dev));
- net = dev_net(dev);
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+
+static int dev_get_valid_name(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ BUG_ON(!net);
if (!dev_valid_name(name))
return -EINVAL;
- if (fmt && strchr(name, '%'))
- return dev_alloc_name(dev, name);
+ if (strchr(name, '%'))
+ return dev_alloc_name_ns(net, dev, name);
else if (__dev_get_by_name(net, name))
return -EEXIST;
else if (dev->name != name)
@@ -988,24 +1097,35 @@ int dev_change_name(struct net_device *dev, const char *newname)
if (dev->flags & IFF_UP)
return -EBUSY;
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+ write_seqcount_begin(&devnet_rename_seq);
+
+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
+ write_seqcount_end(&devnet_rename_seq);
return 0;
+ }
memcpy(oldname, dev->name, IFNAMSIZ);
- err = dev_get_valid_name(dev, newname, 1);
- if (err < 0)
+ err = dev_get_valid_name(net, dev, newname);
+ if (err < 0) {
+ write_seqcount_end(&devnet_rename_seq);
return err;
+ }
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_seqcount_end(&devnet_rename_seq);
return ret;
}
+ write_seqcount_end(&devnet_rename_seq);
+
+ netdev_adjacent_rename_links(dev, oldname);
+
write_lock_bh(&dev_base_lock);
- hlist_del(&dev->name_hlist);
+ hlist_del_rcu(&dev->name_hlist);
write_unlock_bh(&dev_base_lock);
synchronize_rcu();
@@ -1021,11 +1141,12 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
+ write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
+ memcpy(oldname, newname, IFNAMSIZ);
goto rollback;
} else {
- printk(KERN_ERR
- "%s: name change rollback failed: %d.\n",
+ pr_err("%s: name change rollback failed: %d\n",
dev->name, ret);
}
}
@@ -1043,22 +1164,23 @@ rollback:
*/
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
+ char *new_ifalias;
+
ASSERT_RTNL();
if (len >= IFALIASZ)
return -EINVAL;
if (!len) {
- if (dev->ifalias) {
- kfree(dev->ifalias);
- dev->ifalias = NULL;
- }
+ kfree(dev->ifalias);
+ dev->ifalias = NULL;
return 0;
}
- dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
- if (!dev->ifalias)
+ new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
+ if (!new_ifalias)
return -ENOMEM;
+ dev->ifalias = new_ifalias;
strlcpy(dev->ifalias, alias, len+1);
return len;
@@ -1088,40 +1210,33 @@ EXPORT_SYMBOL(netdev_features_change);
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
- call_netdevice_notifiers(NETDEV_CHANGE, dev);
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = 0;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
EXPORT_SYMBOL(netdev_state_change);
-int netdev_bonding_change(struct net_device *dev, unsigned long event)
-{
- return call_netdevice_notifiers(event, dev);
-}
-EXPORT_SYMBOL(netdev_bonding_change);
-
/**
- * dev_load - load a network module
- * @net: the applicable net namespace
- * @name: name of interface
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
*
- * If a network interface is not present and the process has suitable
- * privileges this function loads the module. If module loading is not
- * available in this kernel then it becomes a nop.
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
*/
-
-void dev_load(struct net *net, const char *name)
+void netdev_notify_peers(struct net_device *dev)
{
- struct net_device *dev;
-
- rcu_read_lock();
- dev = dev_get_by_name_rcu(net, name);
- rcu_read_unlock();
-
- if (!dev && capable(CAP_NET_ADMIN))
- request_module("%s", name);
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ rtnl_unlock();
}
-EXPORT_SYMBOL(dev_load);
+EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev)
{
@@ -1130,20 +1245,20 @@ static int __dev_open(struct net_device *dev)
ASSERT_RTNL();
- /*
- * Is it even present?
- */
if (!netif_device_present(dev))
return -ENODEV;
+ /* Block netpoll from trying to do any rx path servicing.
+ * If we don't do this there is a chance ndo_poll_controller
+ * or ndo_poll may be running while we open the device
+ */
+ netpoll_poll_disable(dev);
+
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
if (ret)
return ret;
- /*
- * Call device private open method
- */
set_bit(__LINK_STATE_START, &dev->state);
if (ops->ndo_validate_addr)
@@ -1152,32 +1267,16 @@ static int __dev_open(struct net_device *dev)
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
- /*
- * If it went open OK then:
- */
+ netpoll_poll_enable(dev);
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- /*
- * Set the flags.
- */
dev->flags |= IFF_UP;
-
- /*
- * Enable NET_DMA
- */
net_dmaengine_get();
-
- /*
- * Initialize multicasting status
- */
dev_set_rx_mode(dev);
-
- /*
- * Wakeup transmit queue engine
- */
dev_activate(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
}
return ret;
@@ -1199,74 +1298,95 @@ int dev_open(struct net_device *dev)
{
int ret;
- /*
- * Is it already up?
- */
if (dev->flags & IFF_UP)
return 0;
- /*
- * Open device
- */
ret = __dev_open(dev);
if (ret < 0)
return ret;
- /*
- * ... and announce new interface.
- */
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_UP, dev);
return ret;
}
EXPORT_SYMBOL(dev_open);
-static int __dev_close(struct net_device *dev)
+static int __dev_close_many(struct list_head *head)
{
- const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *dev;
ASSERT_RTNL();
might_sleep();
- /*
- * Tell people we are going down, so that they can
- * prepare to death, when device is still operating.
- */
- call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+ list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
- clear_bit(__LINK_STATE_START, &dev->state);
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
- /* Synchronize to scheduled poll. We cannot touch poll list,
- * it can be even on different cpu. So just clear netif_running().
- *
- * dev->stop() will invoke napi_disable() on all of it's
- * napi_struct instances on this device.
- */
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ clear_bit(__LINK_STATE_START, &dev->state);
- dev_deactivate(dev);
+ /* Synchronize to scheduled poll. We cannot touch poll list, it
+ * can be even on different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of it's
+ * napi_struct instances on this device.
+ */
+ smp_mb__after_atomic(); /* Commit netif_running(). */
+ }
- /*
- * Call the device specific close. This cannot fail.
- * Only if device is UP
- *
- * We allow it to be called even after a DETACH hot-plug
- * event.
- */
- if (ops->ndo_stop)
- ops->ndo_stop(dev);
+ dev_deactivate_many(head);
- /*
- * Device is now down.
- */
+ list_for_each_entry(dev, head, close_list) {
+ const struct net_device_ops *ops = dev->netdev_ops;
- dev->flags &= ~IFF_UP;
+ /*
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
+ */
+ if (ops->ndo_stop)
+ ops->ndo_stop(dev);
- /*
- * Shutdown NET_DMA
- */
- net_dmaengine_put();
+ dev->flags &= ~IFF_UP;
+ net_dmaengine_put();
+ netpoll_poll_enable(dev);
+ }
+
+ return 0;
+}
+
+static int __dev_close(struct net_device *dev)
+{
+ int retval;
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ retval = __dev_close_many(&single);
+ list_del(&single);
+
+ return retval;
+}
+
+static int dev_close_many(struct list_head *head)
+{
+ struct net_device *dev, *tmp;
+
+ /* Remove the devices that don't need to be closed */
+ list_for_each_entry_safe(dev, tmp, head, close_list)
+ if (!(dev->flags & IFF_UP))
+ list_del_init(&dev->close_list);
+
+ __dev_close_many(head);
+
+ list_for_each_entry_safe(dev, tmp, head, close_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ list_del_init(&dev->close_list);
+ }
return 0;
}
@@ -1282,17 +1402,13 @@ static int __dev_close(struct net_device *dev)
*/
int dev_close(struct net_device *dev)
{
- if (!(dev->flags & IFF_UP))
- return 0;
-
- __dev_close(dev);
-
- /*
- * Tell people we are down
- */
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
- call_netdevice_notifiers(NETDEV_DOWN, dev);
+ if (dev->flags & IFF_UP) {
+ LIST_HEAD(single);
+ list_add(&dev->close_list, &single);
+ dev_close_many(&single);
+ list_del(&single);
+ }
return 0;
}
EXPORT_SYMBOL(dev_close);
@@ -1308,25 +1424,35 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
- dev->ethtool_ops->set_flags) {
- u32 flags = dev->ethtool_ops->get_flags(dev);
- if (flags & ETH_FLAG_LRO) {
- flags &= ~ETH_FLAG_LRO;
- dev->ethtool_ops->set_flags(dev, flags);
- }
- }
- WARN_ON(dev->features & NETIF_F_LRO);
+ /*
+ * If we're trying to disable lro on a vlan device
+ * use the underlying physical device instead
+ */
+ if (is_vlan_dev(dev))
+ dev = vlan_dev_real_dev(dev);
+
+ /* the same for macvlan devices */
+ if (netif_is_macvlan(dev))
+ dev = macvlan_dev_real_dev(dev);
+
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
+
+ if (unlikely(dev->features & NETIF_F_LRO))
+ netdev_WARN(dev, "failed to disable LRO!\n");
}
EXPORT_SYMBOL(dev_disable_lro);
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+ struct net_device *dev)
+{
+ struct netdev_notifier_info info;
-static int dev_boot_phase = 1;
+ netdev_notifier_info_init(&info, dev);
+ return nb->notifier_call(nb, val, &info);
+}
-/*
- * Device change register/unregister. These are not inline or static
- * as we export them to the world.
- */
+static int dev_boot_phase = 1;
/**
* register_netdevice_notifier - register a network notifier block
@@ -1357,7 +1483,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
goto unlock;
for_each_net(net) {
for_each_netdev(net, dev) {
- err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
err = notifier_to_errno(err);
if (err)
goto rollback;
@@ -1365,7 +1491,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
if (!(dev->flags & IFF_UP))
continue;
- nb->notifier_call(nb, NETDEV_UP, dev);
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
}
}
@@ -1378,17 +1504,18 @@ rollback:
for_each_net(net) {
for_each_netdev(net, dev) {
if (dev == last)
- break;
+ goto outroll;
if (dev->flags & IFF_UP) {
- nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
- nb->notifier_call(nb, NETDEV_DOWN, dev);
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
}
- nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
- nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
}
+outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
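With the netdev_notifier_info wrapper above in place, a typical consumer of the netdevice notifier chain reduces to the sketch below; the mymod_* names are illustrative:

	/* Hedged sketch, not part of this patch. */
	static int mymod_netdev_event(struct notifier_block *nb,
				      unsigned long event, void *ptr)
	{
		struct net_device *dev = netdev_notifier_info_to_dev(ptr);

		switch (event) {
		case NETDEV_UP:
			pr_info("mymod: %s is up\n", dev->name);
			break;
		case NETDEV_GOING_DOWN:
			pr_info("mymod: %s is going down\n", dev->name);
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block mymod_netdev_nb = {
		.notifier_call = mymod_netdev_event,
	};

register_netdevice_notifier() replays NETDEV_REGISTER/NETDEV_UP for devices that already exist; the next hunk makes unregister_netdevice_notifier() synthesize the matching DOWN/UNREGISTER events, so consumers need no special cleanup walk of their own.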
@@ -1402,20 +1529,59 @@ EXPORT_SYMBOL(register_netdevice_notifier);
* register_netdevice_notifier(). The notifier is unlinked into the
* kernel structures and may then be reused. A negative errno code
* is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
{
+ struct net_device *dev;
+ struct net *net;
int err;
rtnl_lock();
err = raw_notifier_chain_unregister(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+unlock:
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
+ * call_netdevice_notifiers_info - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info)
+{
+ ASSERT_RTNL();
+ netdev_notifier_info_init(info, dev);
+ return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+
+/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
@@ -1426,38 +1592,104 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- ASSERT_RTNL();
- return raw_notifier_call_chain(&netdev_chain, val, dev);
+ struct netdev_notifier_info info;
+
+ return call_netdevice_notifiers_info(val, dev, &info);
}
+EXPORT_SYMBOL(call_netdevice_notifiers);
-/* When > 0 there are consumers of rx skb time stamps */
-static atomic_t netstamp_needed = ATOMIC_INIT(0);
+static struct static_key netstamp_needed __read_mostly;
+#ifdef HAVE_JUMP_LABEL
+/* We are not allowed to call static_key_slow_dec() from irq context
+ * If net_disable_timestamp() is called from irq context, defer the
+ * static_key_slow_dec() calls.
+ */
+static atomic_t netstamp_needed_deferred;
+#endif
void net_enable_timestamp(void)
{
- atomic_inc(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+
+ if (deferred) {
+ while (--deferred)
+ static_key_slow_dec(&netstamp_needed);
+ return;
+ }
+#endif
+ static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
- atomic_dec(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_inc(&netstamp_needed_deferred);
+ return;
+ }
+#endif
+ static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
- if (atomic_read(&netstamp_needed))
+ skb->tstamp.tv64 = 0;
+ if (static_key_false(&netstamp_needed))
__net_timestamp(skb);
- else
- skb->tstamp.tv64 = 0;
}
-static inline void net_timestamp_check(struct sk_buff *skb)
+#define net_timestamp_check(COND, SKB) \
+ if (static_key_false(&netstamp_needed)) { \
+ if ((COND) && !(SKB)->tstamp.tv64) \
+ __net_timestamp(SKB); \
+ } \
+
+bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
- if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
- __net_timestamp(skb);
+ unsigned int len;
+
+ if (!(dev->flags & IFF_UP))
+ return false;
+
+ len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+ if (skb->len <= len)
+ return true;
+
+ /* if TSO is enabled, we don't care about the length as the packet
+ * could be forwarded without being segmented before
+ */
+ if (skb_is_gso(skb))
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ }
+
+ if (unlikely(!is_skb_forwardable(dev, skb))) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_scrub_packet(skb, true);
+ skb->protocol = eth_type_trans(skb, dev);
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
* dev_forward_skb - loopback an skb to another netif
@@ -1479,23 +1711,33 @@ static inline void net_timestamp_check(struct sk_buff *skb)
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- skb_orphan(skb);
- nf_reset(skb);
-
- if (unlikely(!(dev->flags & IFF_UP) ||
- (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- skb_set_dev(skb, dev);
- skb->tstamp.tv64 = 0;
- skb->pkt_type = PACKET_HOST;
- skb->protocol = eth_type_trans(skb, dev);
- return netif_rx(skb);
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
+static inline int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ return -ENOMEM;
+ atomic_inc(&skb->users);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
+{
+ if (!ptype->af_packet_priv || !skb->sk)
+ return false;
+
+ if (ptype->id_match)
+ return ptype->id_match(ptype, skb->sk);
+ else if ((struct sock *)ptype->af_packet_priv == skb->sk)
+ return true;
+
+ return false;
+}
+
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
@@ -1504,13 +1746,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb);
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
-
-#ifdef CONFIG_NET_CLS_ACT
- if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
- net_timestamp_set(skb);
-#else
- net_timestamp_set(skb);
-#endif
+ struct sk_buff *skb2 = NULL;
+ struct packet_type *pt_prev = NULL;
rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1518,12 +1755,19 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
* they originated from - MvS (miquels@drinkel.ow.org)
*/
if ((ptype->dev == dev || !ptype->dev) &&
- (ptype->af_packet_priv == NULL ||
- (struct sock *)ptype->af_packet_priv != skb->sk)) {
- struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ (!skb_loop_sk(ptype, skb))) {
+ if (pt_prev) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
break;
+ net_timestamp_set(skb2);
+
/* skb->nh should be correctly
set by sender, so that the second statement is
just protection against buggy protocols.
@@ -1531,23 +1775,286 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb2);
if (skb_network_header(skb2) < skb2->data ||
- skb2->network_header > skb2->tail) {
- if (net_ratelimit())
- printk(KERN_CRIT "protocol %04x is "
- "buggy, dev %s\n",
- ntohs(skb2->protocol),
- dev->name);
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+ net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+ ntohs(skb2->protocol),
+ dev->name);
skb_reset_network_header(skb2);
}
skb2->transport_header = skb2->network_header;
skb2->pkt_type = PACKET_OUTGOING;
- ptype->func(skb2, skb->dev, ptype, skb->dev);
+ pt_prev = ptype;
}
}
+ if (pt_prev)
+ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
+/**
+ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
+#ifdef CONFIG_XPS
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P) \
+ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int cpu, u16 index)
+{
+ struct xps_map *map = NULL;
+ int pos;
+
+ if (dev_maps)
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] == index) {
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
+ } else {
+ RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
+ kfree_rcu(map, rcu);
+ map = NULL;
+ }
+ break;
+ }
+ }
+
+ return map;
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ struct xps_dev_maps *dev_maps;
+ int cpu, i;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = index; i < dev->num_tx_queues; i++) {
+ if (!remove_xps_queue(dev_maps, cpu, i))
+ break;
+ }
+ if (i == dev->num_tx_queues)
+ active = true;
+ }
+
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ for (i = index; i < dev->num_tx_queues; i++)
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map,
+ int cpu, u16 index)
+{
+ struct xps_map *new_map;
+ int alloc_len = XPS_MIN_MAP_ALLOC;
+ int i, pos;
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] != index)
+ continue;
+ return map;
+ }
+
+ /* Need to add queue to this CPU's existing map */
+ if (map) {
+ if (pos < map->alloc_len)
+ return map;
+
+ alloc_len = map->alloc_len * 2;
+ }
+
+ /* Need to allocate new map to store queue on this CPU's map */
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!new_map)
+ return NULL;
+
+ for (i = 0; i < pos; i++)
+ new_map->queues[i] = map->queues[i];
+ new_map->alloc_len = alloc_len;
+ new_map->len = pos;
+
+ return new_map;
+}
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_map *map, *new_map;
+ int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
+ int cpu, numa_node_id = -2;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ /* allocate memory for queue storage */
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, mask))
+ continue;
+
+ if (!new_dev_maps)
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+
+ map = expand_xps_map(map, cpu, index);
+ if (!map)
+ goto error;
+
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ if (!new_dev_maps)
+ goto out_no_new_maps;
+
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
+ /* add queue to CPU maps */
+ int pos = 0;
+
+ map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ while ((pos < map->len) && (map->queues[pos] != index))
+ pos++;
+
+ if (pos == map->len)
+ map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
+#endif
+ } else if (dev_maps) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ }
+
+ rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+
+ /* Cleanup old maps */
+ if (dev_maps) {
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (map && map != new_map)
+ kfree_rcu(map, rcu);
+ }
+
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ dev_maps = new_dev_maps;
+ active = true;
+
+out_no_new_maps:
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ? numa_node_id :
+ NUMA_NO_NODE);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ /* removes queue from unused CPUs */
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
+ continue;
+
+ if (remove_xps_queue(dev_maps, cpu, index))
+ active = true;
+ }
+
+ /* free map if not active */
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+
+ return 0;
+error:
+ /* remove any maps that we added */
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
+
+ mutex_unlock(&xps_map_mutex);
+
+ kfree(new_dev_maps);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
+
+#endif
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
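netif_set_xps_queue() gives drivers a programmatic way to set the transmit-queue-to-CPU (XPS) affinity that previously had to go through sysfs. A sketch of a driver spreading its TX queues over the online CPUs; mydrv_setup_xps() is illustrative:

	/* Hedged sketch, not part of this patch. */
	static void mydrv_setup_xps(struct net_device *dev)
	{
		int cpu = cpumask_first(cpu_online_mask);
		u16 qid;

		for (qid = 0; qid < dev->real_num_tx_queues; qid++) {
			/* steer traffic generated on 'cpu' to TX queue 'qid' */
			netif_set_xps_queue(dev, cpumask_of(cpu), qid);

			cpu = cpumask_next(cpu, cpu_online_mask);
			if (cpu >= nr_cpu_ids)
				cpu = cpumask_first(cpu_online_mask);
		}
	}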
@@ -1559,7 +2066,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
- if (dev->reg_state == NETREG_REGISTERED) {
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
@@ -1567,8 +2075,15 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (rc)
return rc;
- if (txq < dev->real_num_tx_queues)
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
+ if (txq < dev->real_num_tx_queues) {
qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, txq);
+#endif
+ }
}
dev->real_num_tx_queues = txq;
@@ -1576,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -1609,6 +2124,18 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * This routine should set an upper limit on the number of RSS queues
+ * used by default by multiqueue devices.
+ */
+int netif_get_num_default_rss_queues(void)
+{
+ return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
static inline void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
@@ -1630,30 +2157,42 @@ void __netif_schedule(struct Qdisc *q)
}
EXPORT_SYMBOL(__netif_schedule);
-void dev_kfree_skb_irq(struct sk_buff *skb)
+struct dev_kfree_skb_cb {
+ enum skb_free_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
- if (atomic_dec_and_test(&skb->users)) {
- struct softnet_data *sd;
- unsigned long flags;
+ return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ unsigned long flags;
- local_irq_save(flags);
- sd = &__get_cpu_var(softnet_data);
- skb->next = sd->completion_queue;
- sd->completion_queue = skb;
- raise_softirq_irqoff(NET_TX_SOFTIRQ);
- local_irq_restore(flags);
+ if (likely(atomic_read(&skb->users) == 1)) {
+ smp_rmb();
+ atomic_set(&skb->users, 0);
+ } else if (likely(!atomic_dec_and_test(&skb->users))) {
+ return;
}
+ get_kfree_skb_cb(skb)->reason = reason;
+ local_irq_save(flags);
+ skb->next = __this_cpu_read(softnet_data.completion_queue);
+ __this_cpu_write(softnet_data.completion_queue, skb);
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL(dev_kfree_skb_irq);
+EXPORT_SYMBOL(__dev_kfree_skb_irq);
-void dev_kfree_skb_any(struct sk_buff *skb)
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
if (in_irq() || irqs_disabled())
- dev_kfree_skb_irq(skb);
+ __dev_kfree_skb_irq(skb, reason);
else
dev_kfree_skb(skb);
}
-EXPORT_SYMBOL(dev_kfree_skb_any);
+EXPORT_SYMBOL(__dev_kfree_skb_any);
/**
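The reworked __dev_kfree_skb_irq()/__dev_kfree_skb_any() above carry a free reason, so the wrappers built on them can tell consumed packets apart from dropped ones for drop tracing. Assuming the usual dev_consume_skb_any() and dev_kfree_skb_any() wrappers on top, a driver TX-completion path might look like this (mydrv_tx_complete() and tx_ok are hypothetical):

	/* Hedged sketch, not part of this patch. */
	static void mydrv_tx_complete(struct sk_buff *skb, bool tx_ok)
	{
		if (tx_ok)
			dev_consume_skb_any(skb);	/* delivered, not a drop */
		else
			dev_kfree_skb_any(skb);		/* counted as a drop */
	}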
@@ -1687,62 +2226,25 @@ void netif_device_attach(struct net_device *dev)
}
EXPORT_SYMBOL(netif_device_attach);
-static bool can_checksum_protocol(unsigned long features, __be16 protocol)
-{
- return ((features & NETIF_F_NO_CSUM) ||
- ((features & NETIF_F_V4_CSUM) &&
- protocol == htons(ETH_P_IP)) ||
- ((features & NETIF_F_V6_CSUM) &&
- protocol == htons(ETH_P_IPV6)) ||
- ((features & NETIF_F_FCOE_CRC) &&
- protocol == htons(ETH_P_FCOE)));
-}
-
-static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
+static void skb_warn_bad_offload(const struct sk_buff *skb)
{
- __be16 protocol = skb->protocol;
- int features = dev->features;
+ static const netdev_features_t null_features = 0;
+ struct net_device *dev = skb->dev;
+ const char *driver = "";
- if (vlan_tx_tag_present(skb)) {
- features &= dev->vlan_features;
- } else if (protocol == htons(ETH_P_8021Q)) {
- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
- protocol = veh->h_vlan_encapsulated_proto;
- features &= dev->vlan_features;
- }
+ if (!net_ratelimit())
+ return;
- return can_checksum_protocol(features, protocol);
-}
+ if (dev && dev->dev.parent)
+ driver = dev_driver_string(dev->dev.parent);
-/**
- * skb_dev_set -- assign a new device to a buffer
- * @skb: buffer for the new device
- * @dev: network device
- *
- * If an skb is owned by a device already, we have to reset
- * all data private to the namespace a device belongs to
- * before assigning it a new device.
- */
-#ifdef CONFIG_NET_NS
-void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
-{
- skb_dst_drop(skb);
- if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
- secpath_reset(skb);
- nf_reset(skb);
- skb_init_secmark(skb);
- skb->mark = 0;
- skb->priority = 0;
- skb->nf_trace = 0;
- skb->ipvs_property = 0;
-#ifdef CONFIG_NET_SCHED
- skb->tc_index = 0;
-#endif
- }
- skb->dev = dev;
+ WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
+ "gso_type=%d ip_summed=%d\n",
+ driver, dev ? &dev->features : &null_features,
+ skb->sk ? &skb->sk->sk_route_caps : &null_features,
+ skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
+ skb_shinfo(skb)->gso_type, skb->ip_summed);
}
-EXPORT_SYMBOL(skb_set_dev);
-#endif /* CONFIG_NET_NS */
/*
* Invalidate hardware checksum when packet is to be mangled, and
@@ -1757,11 +2259,20 @@ int skb_checksum_help(struct sk_buff *skb)
goto out_set_summed;
if (unlikely(skb_shinfo(skb)->gso_size)) {
- /* Let GSO fix up the checksum. */
- goto out_set_summed;
+ skb_warn_bad_offload(skb);
+ return -EINVAL;
}
- offset = skb->csum_start - skb_headroom(skb);
+ /* Before computing a checksum, make sure no frag can be modified
+ * by an external entity: the checksum could otherwise be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+
+ offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));
csum = skb_checksum(skb, offset, skb->len - offset, 0);
@@ -1783,69 +2294,85 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
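
Purely for illustration, a sketch of the software fallback a transmit path can take when a device cannot checksum a given packet; everything except skb_checksum_help() and the NETIF_F_ALL_CSUM test is hypothetical:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Resolve CHECKSUM_PARTIAL in software when hardware offload is unavailable. */
static int example_tx_checksum(struct sk_buff *skb, netdev_features_t features)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;			/* nothing to do */
	if (features & NETIF_F_ALL_CSUM)
		return 0;			/* hardware fills in the checksum */

	/* writes the checksum and downgrades ip_summed */
	return skb_checksum_help(skb);
}
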
-/**
- * skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_type *ptype;
+ unsigned int vlan_depth = skb->mac_len;
__be16 type = skb->protocol;
- int vlan_depth = ETH_HLEN;
- int err;
- while (type == htons(ETH_P_8021Q)) {
- struct vlan_hdr *vh;
+ /* Tunnel gso handlers can set protocol to ethernet. */
+ if (type == htons(ETH_P_TEB)) {
+ struct ethhdr *eth;
- if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
- return ERR_PTR(-EINVAL);
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+ return 0;
- vh = (struct vlan_hdr *)(skb->data + vlan_depth);
- type = vh->h_vlan_encapsulated_proto;
- vlan_depth += VLAN_HLEN;
+ eth = (struct ethhdr *)skb_mac_header(skb);
+ type = eth->h_proto;
}
- skb_reset_mac_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
- __skb_pull(skb, skb->mac_len);
+ /* if skb->protocol is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+ if (vlan_depth) {
+ if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
+ return 0;
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
+ }
+ do {
+ struct vlan_hdr *vh;
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- struct net_device *dev = skb->dev;
- struct ethtool_drvinfo info = {};
+ if (unlikely(!pskb_may_pull(skb,
+ vlan_depth + VLAN_HLEN)))
+ return 0;
- if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
- dev->ethtool_ops->get_drvinfo(dev, &info);
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (type == htons(ETH_P_8021Q) ||
+ type == htons(ETH_P_8021AD));
+ }
- WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
- info.driver, dev ? dev->features : 0L,
- skb->sk ? skb->sk->sk_route_caps : 0L,
- skb->len, skb->data_len, skb->ip_summed);
+ *depth = vlan_depth;
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
- return ERR_PTR(err);
- }
+ return type;
+}
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
rcu_read_lock();
- list_for_each_entry_rcu(ptype,
- &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- err = ptype->gso_send_check(skb);
+ int err;
+
+ err = ptype->callbacks.gso_send_check(skb);
segs = ERR_PTR(err);
if (err || skb_gso_ok(skb, features))
break;
__skb_push(skb, (skb->data -
skb_network_header(skb)));
}
- segs = ptype->gso_segment(skb, features);
+ segs = ptype->callbacks.gso_segment(skb, features);
break;
}
}
@@ -1855,15 +2382,59 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
return segs;
}
-EXPORT_SYMBOL(skb_gso_segment);
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL;
+ else
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ skb_warn_bad_offload(skb);
+
+ if (skb_header_cloned(skb) &&
+ (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return ERR_PTR(err);
+ }
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb_mac_gso_segment(skb, features);
+}
+EXPORT_SYMBOL(__skb_gso_segment);
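
A hedged sketch of the typical consumer of this API: a transmit path that segments in software when the device lacks the required GSO support. skb_gso_segment() is assumed to be the tx-path wrapper around __skb_gso_segment(); the xmit callback and loop are illustrative only:

#include <linux/err.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Segment a GSO skb in software and hand each piece to a driver xmit hook. */
static int example_gso_fallback(struct sk_buff *skb, struct net_device *dev,
				netdev_features_t features,
				int (*xmit)(struct sk_buff *, struct net_device *))
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return xmit(skb, dev);	/* no segmentation was needed */

	consume_skb(skb);		/* the original is replaced by the list */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		xmit(nskb, dev);
	}
	return 0;
}
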
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
if (net_ratelimit()) {
- printk(KERN_ERR "%s: hw csum failure.\n",
- dev ? dev->name : "<unknown>");
+ pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
dump_stack();
}
}
@@ -1880,9 +2451,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
#ifdef CONFIG_HIGHMEM
int i;
if (!(dev->features & NETIF_F_HIGHDMA)) {
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- if (PageHighMem(skb_shinfo(skb)->frags[i].page))
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (PageHighMem(skb_frag_page(frag)))
return 1;
+ }
}
if (PCI_DMA_BUS_IS_PHYS) {
@@ -1891,7 +2464,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!pdev)
return 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ dma_addr_t addr = page_to_phys(skb_frag_page(frag));
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
return 1;
}
@@ -1910,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
{
struct dev_gso_cb *cb;
- do {
- struct sk_buff *nskb = skb->next;
-
- skb->next = nskb->next;
- nskb->next = NULL;
- kfree_skb(nskb);
- } while (skb->next);
+ kfree_skb_list(skb->next);
+ skb->next = NULL;
cb = DEV_GSO_CB(skb);
if (cb->destructor)
@@ -1926,16 +2495,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
/**
* dev_gso_segment - Perform emulated hardware segmentation on skb.
* @skb: buffer to segment
+ * @features: device features as applicable to this skb
*
* This function segments the given skb and stores the list of segments
* in skb->next.
*/
-static int dev_gso_segment(struct sk_buff *skb)
+static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
- struct net_device *dev = skb->dev;
struct sk_buff *segs;
- int features = dev->features & ~(illegal_highdma(dev, skb) ?
- NETIF_F_SG : 0);
segs = skb_gso_segment(skb, features);
@@ -1953,103 +2520,117 @@ static int dev_gso_segment(struct sk_buff *skb)
return 0;
}
-/*
- * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested or the sk-reference
- * is needed on driver level for other reasons, e.g. see net/can/raw.c
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
*/
-static inline void skb_orphan_try(struct sk_buff *skb)
+#ifdef CONFIG_NET_MPLS_GSO
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
{
- struct sock *sk = skb->sk;
+ if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ features &= skb->dev->mpls_features;
- if (sk && !skb_shinfo(skb)->tx_flags) {
- /* skb_tx_hash() wont be able to get sk.
- * We copy sk_hash into skb->rxhash
- */
- if (!skb->rxhash)
- skb->rxhash = sk->sk_hash;
- skb_orphan(skb);
- }
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
}
+#endif
-int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev)
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ netdev_features_t features)
{
- __be16 protocol = skb->protocol;
+ int tmp;
+ __be16 type;
- if (protocol == htons(ETH_P_8021Q)) {
- struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
- protocol = veh->h_vlan_encapsulated_proto;
- } else if (!skb->vlan_tci)
- return dev->features;
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
- if (protocol != htons(ETH_P_8021Q))
- return dev->features & dev->vlan_features;
- else
- return 0;
+ if (skb->ip_summed != CHECKSUM_NONE &&
+ !can_checksum_protocol(features, type)) {
+ features &= ~NETIF_F_ALL_CSUM;
+ } else if (illegal_highdma(skb->dev, skb)) {
+ features &= ~NETIF_F_SG;
+ }
+
+ return features;
}
-EXPORT_SYMBOL(netif_get_vlan_features);
-/*
- * Returns true if either:
- * 1. skb has frag_list and the device doesn't support FRAGLIST, or
- * 2. skb is fragmented and the device does not support SG, or if
- * at least one of fragments is in highmem and device does not
- * support DMA from it.
- */
-static inline int skb_needs_linearize(struct sk_buff *skb,
- struct net_device *dev)
+netdev_features_t netif_skb_features(struct sk_buff *skb)
{
- if (skb_is_nonlinear(skb)) {
- int features = dev->features;
+ __be16 protocol = skb->protocol;
+ netdev_features_t features = skb->dev->features;
- if (vlan_tx_tag_present(skb))
- features &= dev->vlan_features;
+ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ features &= ~NETIF_F_GSO_MASK;
- return (skb_has_frag_list(skb) &&
- !(features & NETIF_F_FRAGLIST)) ||
- (skb_shinfo(skb)->nr_frags &&
- (!(features & NETIF_F_SG) ||
- illegal_highdma(dev, skb)));
+ if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+ } else if (!vlan_tx_tag_present(skb)) {
+ return harmonize_features(skb, features);
}
- return 0;
+ features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
+
+ if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
+ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
+ NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+
+ return harmonize_features(skb, features);
}
+EXPORT_SYMBOL(netif_skb_features);
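
A sketch of how a caller might combine netif_skb_features() with the per-skb GSO decision; the helper name is hypothetical, while skb_is_gso(), skb_gso_ok() and hw_enc_features are taken from the surrounding code:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Decide whether this skb needs a software GSO fallback. */
static bool example_needs_sw_gso(struct sk_buff *skb)
{
	netdev_features_t features = netif_skb_features(skb);

	/* encapsulated traffic is judged against the tunnel feature set */
	if (skb->encapsulation)
		features &= skb->dev->hw_enc_features;

	return skb_is_gso(skb) && !skb_gso_ok(skb, features);
}
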
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
const struct net_device_ops *ops = dev->netdev_ops;
int rc = NETDEV_TX_OK;
+ unsigned int skb_len;
if (likely(!skb->next)) {
+ netdev_features_t features;
+
/*
- * If device doesnt need skb->dst, release it right now while
+ * If device doesn't need skb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
- if (!list_empty(&ptype_all))
- dev_queue_xmit_nit(skb, dev);
-
- skb_orphan_try(skb);
+ features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
- !(dev->features & NETIF_F_HW_VLAN_TX)) {
- skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
if (unlikely(!skb))
goto out;
skb->vlan_tci = 0;
}
- if (netif_needs_gso(dev, skb)) {
- if (unlikely(dev_gso_segment(skb)))
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (netif_needs_gso(skb, features)) {
+ if (unlikely(dev_gso_segment(skb, features)))
goto out_kfree_skb;
if (skb->next)
goto gso;
} else {
- if (skb_needs_linearize(skb, dev) &&
+ if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
@@ -2058,16 +2639,25 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- skb_set_transport_header(skb, skb->csum_start -
- skb_headroom(skb));
- if (!dev_can_checksum(dev, skb) &&
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (!(features & NETIF_F_ALL_CSUM) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
}
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
+ skb_len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
rc = ops->ndo_start_xmit(skb, dev);
- trace_net_dev_xmit(skb, rc);
+ trace_net_dev_xmit(skb, rc, dev, skb_len);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
@@ -2080,15 +2670,13 @@ gso:
skb->next = nskb->next;
nskb->next = NULL;
- /*
- * If device doesnt need nskb->dst, release it right now while
- * its hot in this cpu cache
- */
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
- skb_dst_drop(nskb);
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(nskb, dev);
+ skb_len = nskb->len;
+ trace_net_dev_start_xmit(nskb, dev);
rc = ops->ndo_start_xmit(nskb, dev);
- trace_net_dev_xmit(nskb, rc);
+ trace_net_dev_xmit(nskb, rc, dev, skb_len);
if (unlikely(rc != NETDEV_TX_OK)) {
if (rc & ~NETDEV_TX_MASK)
goto out_kfree_gso_skb;
@@ -2097,128 +2685,51 @@ gso:
return rc;
}
txq_trans_update(txq);
- if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
+ if (unlikely(netif_xmit_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
out_kfree_gso_skb:
- if (likely(skb->next == NULL))
+ if (likely(skb->next == NULL)) {
skb->destructor = DEV_GSO_CB(skb)->destructor;
+ consume_skb(skb);
+ return rc;
+ }
out_kfree_skb:
kfree_skb(skb);
out:
return rc;
}
+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
-static u32 hashrnd __read_mostly;
-
-u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
+static void qdisc_pkt_len_init(struct sk_buff *skb)
{
- u32 hash;
-
- if (skb_rx_queue_recorded(skb)) {
- hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= dev->real_num_tx_queues))
- hash -= dev->real_num_tx_queues;
- return hash;
- }
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol ^ skb->rxhash;
- hash = jhash_1word(hash, hashrnd);
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
- return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
-}
-EXPORT_SYMBOL(skb_tx_hash);
-
-static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
-{
- if (unlikely(queue_index >= dev->real_num_tx_queues)) {
- if (net_ratelimit()) {
- pr_warning("%s selects TX queue %d, but "
- "real number of TX queues is %d\n",
- dev->name, queue_index, dev->real_num_tx_queues);
- }
- return 0;
- }
- return queue_index;
-}
-
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
-#ifdef CONFIG_XPS
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- int queue_index = -1;
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
- if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[raw_smp_processor_id()]);
- if (map) {
- if (map->len == 1)
- queue_index = map->queues[0];
- else {
- u32 hash;
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol ^
- skb->rxhash;
- hash = jhash_1word(hash, hashrnd);
- queue_index = map->queues[
- ((u64)hash * map->len) >> 32];
- }
- if (unlikely(queue_index >= dev->real_num_tx_queues))
- queue_index = -1;
- }
- }
- rcu_read_unlock();
-
- return queue_index;
-#else
- return -1;
-#endif
-}
-
-static struct netdev_queue *dev_pick_tx(struct net_device *dev,
- struct sk_buff *skb)
-{
- int queue_index;
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (dev->real_num_tx_queues == 1)
- queue_index = 0;
- else if (ops->ndo_select_queue) {
- queue_index = ops->ndo_select_queue(dev, skb);
- queue_index = dev_cap_txqueue(dev, queue_index);
- } else {
- struct sock *sk = skb->sk;
- queue_index = sk_tx_queue_get(sk);
+ /* To get a more precise estimate of bytes sent on the wire,
+ * we add the header size of every segment to pkt_len
+ */
+ if (shinfo->gso_size) {
+ unsigned int hdr_len;
+ u16 gso_segs = shinfo->gso_segs;
- if (queue_index < 0 || skb->ooo_okay ||
- queue_index >= dev->real_num_tx_queues) {
- int old_index = queue_index;
+ /* mac layer + network layer */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
- queue_index = get_xps_queue(dev, skb);
- if (queue_index < 0)
- queue_index = skb_tx_hash(dev, skb);
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ hdr_len += tcp_hdrlen(skb);
+ else
+ hdr_len += sizeof(struct udphdr);
- if (queue_index != old_index && sk) {
- struct dst_entry *dst =
- rcu_dereference_check(sk->sk_dst_cache, 1);
+ if (shinfo->gso_type & SKB_GSO_DODGY)
+ gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
- if (dst && skb_dst(skb) == dst)
- sk_tx_queue_set(sk, queue_index);
- }
- }
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
-
- skb_set_queue_mapping(skb, queue_index);
- return netdev_get_tx_queue(dev, queue_index);
}
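
A worked example (illustrative numbers, not taken from the patch) of the pkt_len adjustment computed above:

/*
 * A TSO skb carrying 3000 bytes of TCP payload with gso_size 1448 and
 * 54 bytes of Ethernet+IPv4+TCP headers has skb->len = 3054, gso_segs = 3
 * and hdr_len = 54.  On the wire this becomes frames of 1502, 1502 and
 * 158 bytes, i.e. 3162 bytes in total, which matches
 * skb->len + (gso_segs - 1) * hdr_len = 3054 + 2 * 54.
 */
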
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
@@ -2226,15 +2737,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
- bool contended = qdisc_is_running(q);
+ bool contended;
int rc;
+ qdisc_pkt_len_init(skb);
+ qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
* and dequeue packets faster.
*/
+ contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -2251,7 +2765,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
*/
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
- __qdisc_update_bstats(q, skb->len);
+
+ qdisc_bstats_update(q, skb);
+
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -2264,7 +2780,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
skb_dst_force(skb);
- rc = qdisc_enqueue_root(skb, q);
+ rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -2279,12 +2795,46 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return rc;
}
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+static void skb_update_prio(struct sk_buff *skb)
+{
+ struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+
+ if (!skb->priority && skb->sk && map) {
+ unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+
+ if (prioidx < map->priomap_len)
+ skb->priority = map->priomap[prioidx];
+ }
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
/**
- * dev_queue_xmit - transmit a buffer
+ * dev_loopback_xmit - loop back @skb
* @skb: buffer to transmit
+ */
+int dev_loopback_xmit(struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ __skb_pull(skb, skb_network_offset(skb));
+ skb->pkt_type = PACKET_LOOPBACK;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ WARN_ON(!skb_dst(skb));
+ skb_dst_force(skb);
+ netif_rx_ni(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dev_loopback_xmit);
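
A hedged sketch of how a local-output path that already holds a dst might use this helper to deliver a copy of an outgoing packet back through the receive stack; the surrounding function is hypothetical:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Loop a copy of an outgoing, dst-attached packet back to the local stack. */
static void example_loop_back_copy(struct sk_buff *skb)
{
	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

	if (!copy)
		return;
	/* dev_loopback_xmit() warns if no dst is attached */
	dev_loopback_xmit(copy);
}
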
+
+/**
+ * __dev_queue_xmit - transmit a buffer
+ * @skb: buffer to transmit
+ * @accel_priv: private data used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
@@ -2307,19 +2857,23 @@ static DEFINE_PER_CPU(int, xmit_recursion);
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
-int dev_queue_xmit(struct sk_buff *skb)
+static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
+ skb_reset_mac_header(skb);
+
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
- txq = dev_pick_tx(dev, skb);
+ skb_update_prio(skb);
+
+ txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
#ifdef CONFIG_NET_CLS_ACT
@@ -2353,7 +2907,7 @@ int dev_queue_xmit(struct sk_buff *skb)
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
__this_cpu_dec(xmit_recursion);
@@ -2363,37 +2917,49 @@ int dev_queue_xmit(struct sk_buff *skb)
}
}
HARD_TX_UNLOCK(dev, txq);
- if (net_ratelimit())
- printk(KERN_CRIT "Virtual device %s asks to "
- "queue packet!\n", dev->name);
+ net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
+ dev->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately
*/
recursion_alert:
- if (net_ratelimit())
- printk(KERN_CRIT "Dead loop on virtual device "
- "%s, fix it urgently!\n", dev->name);
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
+ atomic_long_inc(&dev->tx_dropped);
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
+
+int dev_queue_xmit(struct sk_buff *skb)
+{
+ return __dev_queue_xmit(skb, NULL);
+}
EXPORT_SYMBOL(dev_queue_xmit);
+int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+{
+ return __dev_queue_xmit(skb, accel_priv);
+}
+EXPORT_SYMBOL(dev_queue_xmit_accel);
+
/*=======================================================================
Receiver routines
=======================================================================*/
int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
@@ -2406,82 +2972,58 @@ static inline void ____napi_schedule(struct softnet_data *sd,
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
-/*
- * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Returns a non-zero hash number on success
- * and 0 on failure.
- */
-__u32 __skb_get_rxhash(struct sk_buff *skb)
-{
- int nhoff, hash = 0, poff;
- struct ipv6hdr *ip6;
- struct iphdr *ip;
- u8 ip_proto;
- u32 addr1, addr2, ihl;
- union {
- u32 v32;
- u16 v16[2];
- } ports;
+#ifdef CONFIG_RPS
- nhoff = skb_network_offset(skb);
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
- switch (skb->protocol) {
- case __constant_htons(ETH_P_IP):
- if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
- goto done;
+struct static_key rps_needed __read_mostly;
- ip = (struct iphdr *) (skb->data + nhoff);
- if (ip->frag_off & htons(IP_MF | IP_OFFSET))
- ip_proto = 0;
- else
- ip_proto = ip->protocol;
- addr1 = (__force u32) ip->saddr;
- addr2 = (__force u32) ip->daddr;
- ihl = ip->ihl;
- break;
- case __constant_htons(ETH_P_IPV6):
- if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
- goto done;
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ if (next_cpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
- ip6 = (struct ipv6hdr *) (skb->data + nhoff);
- ip_proto = ip6->nexthdr;
- addr1 = (__force u32) ip6->saddr.s6_addr32[3];
- addr2 = (__force u32) ip6->daddr.s6_addr32[3];
- ihl = (40 >> 2);
- break;
- default:
- goto done;
- }
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
- ports.v32 = 0;
- poff = proto_ports_offset(ip_proto);
- if (poff >= 0) {
- nhoff += ihl * 4 + poff;
- if (pskb_may_pull(skb, nhoff + 4)) {
- ports.v32 = * (__force u32 *) (skb->data + nhoff);
- if (ports.v16[1] < ports.v16[0])
- swap(ports.v16[0], ports.v16[1]);
- }
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, next_cpu).input_queue_head;
}
- /* get a consistent hash (same value on both flow directions) */
- if (addr2 < addr1)
- swap(addr1, addr2);
-
- hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
- if (!hash)
- hash = 1;
-
-done:
- return hash;
+ rflow->cpu = next_cpu;
+ return rflow;
}
-EXPORT_SYMBOL(__skb_get_rxhash);
-
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
@@ -2497,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u16 tcpu;
+ u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
@@ -2513,18 +3056,20 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
map = rcu_dereference(rxqueue->rps_map);
if (map) {
- if (map->len == 1) {
+ if (map->len == 1 &&
+ !rcu_access_pointer(rxqueue->rps_flow_table)) {
tcpu = map->cpus[0];
if (cpu_online(tcpu))
cpu = tcpu;
goto done;
}
- } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+ } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
goto done;
}
skb_reset_network_header(skb);
- if (!skb_get_rxhash(skb))
+ hash = skb_get_hash(skb);
+ if (!hash)
goto done;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
@@ -2533,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
u16 next_cpu;
struct rps_dev_flow *rflow;
- rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+ rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
- next_cpu = sock_flow_table->ents[skb->rxhash &
- sock_flow_table->mask];
+ next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
/*
* If the desired CPU (where last recvmsg was done) is
@@ -2554,11 +3098,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
- tcpu = rflow->cpu = next_cpu;
- if (tcpu != RPS_NO_CPU)
- rflow->last_qtail = per_cpu(softnet_data,
- tcpu).input_queue_head;
+ tcpu = next_cpu;
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
+
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
@@ -2567,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
if (map) {
- tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+ tcpu = map->cpus[((u64) hash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
@@ -2579,6 +3122,46 @@ done:
return cpu;
}
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
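
A sketch of the expiry pattern the comment above asks drivers for: periodically polling rps_may_expire_flow(), which is only built with CONFIG_RFS_ACCEL. The filter bookkeeping structure and the function around it are hypothetical:

#include <linux/netdevice.h>

/* Hypothetical per-filter state a driver keeps for each ndo_rx_flow_steer() call. */
struct example_arfs_filter {
	u16 rxq_index;		/* RX queue the filter steers to */
	u32 flow_id;		/* flow_id passed to ndo_rx_flow_steer() */
	u16 filter_id;		/* value returned by ndo_rx_flow_steer() */
};

/* Periodic scan: returns true when the hardware filter should be removed. */
static bool example_arfs_try_expire(struct net_device *dev,
				    struct example_arfs_filter *f)
{
	if (!rps_may_expire_flow(dev, f->rxq_index, f->flow_id, f->filter_id))
		return false;		/* flow still active, keep the filter */

	/* driver-specific removal of the hardware steering entry goes here */
	return true;
}
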
+
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
@@ -2611,6 +3194,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
return 0;
}
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+ struct softnet_data *sd;
+ unsigned int old_flow, new_flow;
+
+ if (qlen < (netdev_max_backlog >> 1))
+ return false;
+
+ sd = &__get_cpu_var(softnet_data);
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl) {
+ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ old_flow = fl->history[fl->history_head];
+ fl->history[fl->history_head] = new_flow;
+
+ fl->history_head++;
+ fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+ if (likely(fl->buckets[old_flow]))
+ fl->buckets[old_flow]--;
+
+ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+ fl->count++;
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+#endif
+ return false;
+}
+
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
@@ -2620,13 +3243,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
{
struct softnet_data *sd;
unsigned long flags;
+ unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
- if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -2656,35 +3281,15 @@ enqueue:
return NET_RX_DROP;
}
-/**
- * netif_rx - post buffer to the network code
- * @skb: buffer to post
- *
- * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
- *
- * return values:
- * NET_RX_SUCCESS (no congestion)
- * NET_RX_DROP (packet was dropped)
- *
- */
-
-int netif_rx(struct sk_buff *skb)
+static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- /* if netpoll wants it, pretend we never saw it */
- if (netpoll_rx(skb))
- return NET_RX_DROP;
-
- if (netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
- {
+ if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
@@ -2699,24 +3304,47 @@ int netif_rx(struct sk_buff *skb)
rcu_read_unlock();
preempt_enable();
- }
-#else
+ } else
+#endif
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
-#endif
return ret;
}
+
+/**
+ * netif_rx - post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process. It always succeeds. The buffer
+ * may be dropped during processing for congestion control or by the
+ * protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+ trace_netif_rx_entry(skb);
+
+ return netif_rx_internal(skb);
+}
EXPORT_SYMBOL(netif_rx);
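
For context, a minimal non-NAPI receive sketch showing the usual caller of netif_rx(); the frame copy and stats update are illustrative, not taken from this file:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Copy a received frame into an skb and queue it to the per-CPU backlog. */
static void example_isr_rx(struct net_device *dev, const void *frame, int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);		/* safe from hard-IRQ context */
}
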
int netif_rx_ni(struct sk_buff *skb)
{
int err;
+ trace_netif_rx_ni_entry(skb);
+
preempt_disable();
- err = netif_rx(skb);
+ err = netif_rx_internal(skb);
if (local_softirq_pending())
do_softirq();
preempt_enable();
@@ -2742,7 +3370,10 @@ static void net_tx_action(struct softirq_action *h)
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
- trace_kfree_skb(skb, net_tx_action);
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+ trace_consume_skb(skb);
+ else
+ trace_kfree_skb(skb, net_tx_action);
__kfree_skb(skb);
}
}
@@ -2764,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h)
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
qdisc_run(q);
@@ -2774,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h)
&q->state)) {
__netif_reschedule(q);
} else {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
}
@@ -2783,14 +3414,6 @@ static void net_tx_action(struct softirq_action *h)
}
}
-static inline int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
-{
- atomic_inc(&skb->users);
- return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-}
-
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
@@ -2804,8 +3427,8 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
* when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
* a compare and 2 stores extra right now if we dont have it on
* but have CONFIG_NET_CLS_ACT
- * NOTE: This doesnt stop any functionality; if you dont have
- * the ingress scheduler, you just cant add policies on ingress.
+ * NOTE: This doesn't stop any functionality; if you don't have
+ * the ingress scheduler, you just can't add policies on ingress.
*
*/
static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
@@ -2816,9 +3439,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
struct Qdisc *q;
if (unlikely(MAX_RED_LOOP < ttl++)) {
- if (net_ratelimit())
- pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
- skb->skb_iif, dev->ifindex);
+ net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
+ skb->skb_iif, dev->ifindex);
return TC_ACT_SHOT;
}
@@ -2869,11 +3491,13 @@ out:
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
- * Register a receive hander for a device. This handler will then be
+ * Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
* The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
*/
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
@@ -2884,6 +3508,7 @@ int netdev_rx_handler_register(struct net_device *dev,
if (dev->rx_handler)
return -EBUSY;
+ /* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
@@ -2895,7 +3520,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
* netdev_rx_handler_unregister - unregister receive handler
* @dev: device to unregister a handler from
*
- * Unregister a receive hander from a device.
+ * Unregister a receive handler from a device.
*
* The caller must hold the rtnl_mutex.
*/
@@ -2903,113 +3528,71 @@ void netdev_rx_handler_unregister(struct net_device *dev)
{
ASSERT_RTNL();
- rcu_assign_pointer(dev->rx_handler, NULL);
- rcu_assign_pointer(dev->rx_handler_data, NULL);
+ RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
+ * section is guaranteed to see a non-NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
+ RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
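
A sketch of the rx_handler contract documented above: a handler may consume the packet, re-run the receive path on another device, or pass it through. The handler body and attach helper here are hypothetical; the result codes and the RTNL/registration rules come from the functions above:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>

/* Redirect every frame received on the lower device to an upper device. */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper = rcu_dereference(skb->dev->rx_handler_data);

	skb->dev = upper;		/* deliver again, now on the upper dev */
	*pskb = skb;
	return RX_HANDLER_ANOTHER;
}

static int example_attach(struct net_device *lower, struct net_device *upper)
{
	int err;

	rtnl_lock();
	/* rx_handler_data is published before rx_handler, as noted above */
	err = netdev_rx_handler_register(lower, example_rx_handler, upper);
	rtnl_unlock();
	return err;
}
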
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
- struct net_device *master)
-{
- if (skb->pkt_type == PACKET_HOST) {
- u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
- memcpy(dest, master->dev_addr, ETH_ALEN);
- }
-}
-
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
*/
-int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
- struct net_device *dev = skb->dev;
-
- if (master->priv_flags & IFF_MASTER_ARPMON)
- dev->last_rx = jiffies;
-
- if ((master->priv_flags & IFF_MASTER_ALB) &&
- (master->priv_flags & IFF_BRIDGE_PORT)) {
- /* Do address unmangle. The local destination address
- * will be always the one master has. Provides the right
- * functionality in a bridge.
- */
- skb_bond_set_mac_by_master(skb, master);
- }
-
- if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
- if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
- skb->protocol == __cpu_to_be16(ETH_P_ARP))
- return 0;
-
- if (master->priv_flags & IFF_MASTER_ALB) {
- if (skb->pkt_type != PACKET_BROADCAST &&
- skb->pkt_type != PACKET_MULTICAST)
- return 0;
- }
- if (master->priv_flags & IFF_MASTER_8023AD &&
- skb->protocol == __cpu_to_be16(ETH_P_SLOW))
- return 0;
-
- return 1;
+ switch (skb->protocol) {
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ return true;
+ default:
+ return false;
}
- return 0;
}
-EXPORT_SYMBOL(__skb_bond_should_drop);
-static int __netif_receive_skb(struct sk_buff *skb)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
- struct net_device *master;
- struct net_device *null_or_orig;
- struct net_device *orig_or_bond;
+ struct net_device *null_or_dev;
+ bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
- if (!netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
- /* if we've gotten here through NAPI, check netpoll */
- if (netpoll_receive_skb(skb))
- return NET_RX_DROP;
-
- if (!skb->skb_iif)
- skb->skb_iif = skb->dev->ifindex;
-
- /*
- * bonding note: skbs received on inactive slaves should only
- * be delivered to pkt handlers that are exact matches. Also
- * the deliver_no_wcard flag will be set. If packet handlers
- * are sensitive to duplicate packets these skbs will need to
- * be dropped at the handler.
- */
- null_or_orig = NULL;
orig_dev = skb->dev;
- master = ACCESS_ONCE(orig_dev->master);
- if (skb->deliver_no_wcard)
- null_or_orig = orig_dev;
- else if (master) {
- if (skb_bond_should_drop(skb, master)) {
- skb->deliver_no_wcard = 1;
- null_or_orig = orig_dev; /* deliver only exact match */
- } else
- skb->dev = master;
- }
- __this_cpu_inc(softnet_data.processed);
skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
pt_prev = NULL;
rcu_read_lock();
+another_round:
+ skb->skb_iif = skb->dev->ifindex;
+
+ __this_cpu_inc(softnet_data.processed);
+
+ if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ skb = vlan_untag(skb);
+ if (unlikely(!skb))
+ goto unlock;
+ }
+
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -3017,64 +3600,79 @@ static int __netif_receive_skb(struct sk_buff *skb)
}
#endif
+ if (pfmemalloc)
+ goto skip_taps;
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
- ptype->dev == orig_dev) {
+ if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
+skip_taps:
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
ncls:
#endif
- /* Handle special case of bridge or macvlan */
- rx_handler = rcu_dereference(skb->dev->rx_handler);
- if (rx_handler) {
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+ goto drop;
+
+ if (vlan_tx_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
- skb = rx_handler(skb);
- if (!skb)
- goto out;
+ if (vlan_do_receive(&skb))
+ goto another_round;
+ else if (unlikely(!skb))
+ goto unlock;
}
- if (vlan_tx_tag_present(skb)) {
+ rx_handler = rcu_dereference(skb->dev->rx_handler);
+ if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
- if (vlan_hwaccel_do_receive(&skb)) {
- ret = __netif_receive_skb(skb);
- goto out;
- } else if (unlikely(!skb))
- goto out;
+ switch (rx_handler(&skb)) {
+ case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
+ goto unlock;
+ case RX_HANDLER_ANOTHER:
+ goto another_round;
+ case RX_HANDLER_EXACT:
+ deliver_exact = true;
+ case RX_HANDLER_PASS:
+ break;
+ default:
+ BUG();
+ }
}
- /*
- * Make sure frames received on VLAN interfaces stacked on
- * bonding interfaces still make their way to any base bonding
- * device that may have registered for a specific ptype. The
- * handler may have to adjust skb->dev and orig_dev.
- */
- orig_or_bond = orig_dev;
- if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
- (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
- orig_or_bond = vlan_dev_real_dev(skb->dev);
+ if (unlikely(vlan_tx_tag_present(skb))) {
+ if (vlan_tx_tag_get_id(skb))
+ skb->pkt_type = PACKET_OTHERHOST;
+ /* Note: we might in the future use prio bits
+ * and set skb->priority like in vlan_do_receive().
+ * For the time being, just ignore the Priority Code Point
+ */
+ skb->vlan_tci = 0;
}
+ /* deliver only exact match when indicated */
+ null_or_dev = deliver_exact ? skb->dev : NULL;
+
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type && (ptype->dev == null_or_orig ||
- ptype->dev == skb->dev || ptype->dev == orig_dev ||
- ptype->dev == orig_or_bond)) {
+ if (ptype->type == type &&
+ (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
+ ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -3082,8 +3680,12 @@ ncls:
}
if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ goto drop;
+ else
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
+drop:
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
@@ -3092,36 +3694,45 @@ ncls:
ret = NET_RX_DROP;
}
-out:
+unlock:
rcu_read_unlock();
return ret;
}
-/**
- * netif_receive_skb - process receive buffer from network
- * @skb: buffer to process
- *
- * netif_receive_skb() is the main receive data processing function.
- * It always succeeds. The buffer may be dropped during processing
- * for congestion control or by the protocol layers.
- *
- * This function may only be called from softirq context and interrupts
- * should be enabled.
- *
- * Return values (usually ignored):
- * NET_RX_SUCCESS: no congestion
- * NET_RX_DROP: packet was dropped
- */
-int netif_receive_skb(struct sk_buff *skb)
+static int __netif_receive_skb(struct sk_buff *skb)
{
- if (netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ int ret;
+
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+ unsigned long pflags = current->flags;
+
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ current->flags |= PF_MEMALLOC;
+ ret = __netif_receive_skb_core(skb, true);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ } else
+ ret = __netif_receive_skb_core(skb, false);
+
+ return ret;
+}
+
+static int netif_receive_skb_internal(struct sk_buff *skb)
+{
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
#ifdef CONFIG_RPS
- {
+ if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu, ret;
@@ -3132,16 +3743,34 @@ int netif_receive_skb(struct sk_buff *skb)
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- } else {
- rcu_read_unlock();
- ret = __netif_receive_skb(skb);
+ return ret;
}
-
- return ret;
+ rcu_read_unlock();
}
-#else
- return __netif_receive_skb(skb);
#endif
+ return __netif_receive_skb(skb);
+}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+ trace_netif_receive_skb_entry(skb);
+
+ return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -3175,11 +3804,13 @@ static void flush_backlog(void *arg)
static int napi_gro_complete(struct sk_buff *skb)
{
- struct packet_type *ptype;
+ struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+ struct list_head *head = &offload_base;
int err = -ENOENT;
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
if (NAPI_GRO_CB(skb)->count == 1) {
skb_shinfo(skb)->gso_size = 0;
goto out;
@@ -3187,10 +3818,10 @@ static int napi_gro_complete(struct sk_buff *skb)
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || ptype->dev || !ptype->gro_complete)
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
- err = ptype->gro_complete(skb);
+ err = ptype->callbacks.gro_complete(skb, 0);
break;
}
rcu_read_unlock();
@@ -3202,53 +3833,137 @@ static int napi_gro_complete(struct sk_buff *skb)
}
out:
- return netif_receive_skb(skb);
+ return netif_receive_skb_internal(skb);
}
-inline void napi_gro_flush(struct napi_struct *napi)
+/* napi->gro_list contains packets ordered by age;
+ * the youngest packets are at the head of the list.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
- struct sk_buff *skb, *next;
+ struct sk_buff *skb, *prev = NULL;
- for (skb = napi->gro_list; skb; skb = next) {
- next = skb->next;
+ /* scan list and build reverse chain */
+ for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
+ skb->prev = prev;
+ prev = skb;
+ }
+
+ for (skb = prev; skb; skb = prev) {
skb->next = NULL;
+
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+
+ prev = skb->prev;
napi_gro_complete(skb);
+ napi->gro_count--;
}
- napi->gro_count = 0;
napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
-enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct sk_buff *p;
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+
+ for (p = napi->gro_list; p; p = p->next) {
+ unsigned long diffs;
+
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
+ pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
- struct packet_type *ptype;
+ struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+ struct list_head *head = &offload_base;
int same_flow;
- int mac_len;
enum gro_result ret;
+ int grow;
- if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
+ if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
if (skb_is_gso(skb) || skb_has_frag_list(skb))
goto normal;
+ gro_list_prepare(napi, skb);
+ NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || ptype->dev || !ptype->gro_receive)
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
continue;
skb_set_network_header(skb, skb_gro_offset(skb));
- mac_len = skb->network_header - skb->mac_header;
- skb->mac_len = mac_len;
+ skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
+ NAPI_GRO_CB(skb)->udp_mark = 0;
- pp = ptype->gro_receive(&napi->gro_list, skb);
+ pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
}
rcu_read_unlock();
@@ -3271,38 +3986,35 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
if (same_flow)
goto ok;
- if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
+ if (NAPI_GRO_CB(skb)->flush)
goto normal;
- napi->gro_count++;
+ if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
+ struct sk_buff *nskb = napi->gro_list;
+
+ /* locate the end of the list to select the 'oldest' flow */
+ while (nskb->next) {
+ pp = &nskb->next;
+ nskb = *pp;
+ }
+ *pp = NULL;
+ nskb->next = NULL;
+ napi_gro_complete(nskb);
+ } else {
+ napi->gro_count++;
+ }
NAPI_GRO_CB(skb)->count = 1;
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
ret = GRO_HELD;
pull:
- if (skb_headlen(skb) < skb_gro_offset(skb)) {
- int grow = skb_gro_offset(skb) - skb_headlen(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->tail += grow;
- skb->data_len -= grow;
-
- skb_shinfo(skb)->frags[0].page_offset += grow;
- skb_shinfo(skb)->frags[0].size -= grow;
-
- if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
- put_page(skb_shinfo(skb)->frags[0].page);
- memmove(skb_shinfo(skb)->frags,
- skb_shinfo(skb)->frags + 1,
- --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
- }
- }
-
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
ok:
return ret;
@@ -3310,40 +4022,54 @@ normal:
ret = GRO_NORMAL;
goto pull;
}
-EXPORT_SYMBOL(dev_gro_receive);
-static inline gro_result_t
-__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+struct packet_offload *gro_find_receive_by_type(__be16 type)
{
- struct sk_buff *p;
-
- for (p = napi->gro_list; p; p = p->next) {
- unsigned long diffs;
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
- diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= p->vlan_tci ^ skb->vlan_tci;
- diffs |= compare_ether_header(skb_mac_header(p),
- skb_gro_mac_header(skb));
- NAPI_GRO_CB(p)->same_flow = !diffs;
- NAPI_GRO_CB(p)->flush = 0;
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
}
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
- return dev_gro_receive(napi, skb);
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
}
+EXPORT_SYMBOL(gro_find_complete_by_type);
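
A hedged sketch of how an encapsulation offload might use this lookup to hand an inner packet to the GRO callback registered for its protocol; the wrapper function and inner_proto parameter are hypothetical:

#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>

/* Pass an inner packet to the GRO receive callback registered for its type. */
static struct sk_buff **example_encap_gro_receive(struct sk_buff **head,
						  struct sk_buff *skb,
						  __be16 inner_proto)
{
	struct packet_offload *ptype;
	struct sk_buff **pp = NULL;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(inner_proto);
	if (ptype)
		pp = ptype->callbacks.gro_receive(head, skb);
	rcu_read_unlock();
	return pp;
}
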
-gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb(skb))
+ if (netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
case GRO_DROP:
- case GRO_MERGED_FREE:
kfree_skb(skb);
break;
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ kmem_cache_free(skbuff_head_cache, skb);
+ else
+ __kfree_skb(skb);
+ break;
+
case GRO_HELD:
case GRO_MERGED:
break;
@@ -3351,37 +4077,28 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
return ret;
}
-EXPORT_SYMBOL(napi_skb_finish);
-
-void skb_gro_reset_offset(struct sk_buff *skb)
-{
- NAPI_GRO_CB(skb)->data_offset = 0;
- NAPI_GRO_CB(skb)->frag0 = NULL;
- NAPI_GRO_CB(skb)->frag0_len = 0;
-
- if (skb->mac_header == skb->tail &&
- !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
- NAPI_GRO_CB(skb)->frag0 =
- page_address(skb_shinfo(skb)->frags[0].page) +
- skb_shinfo(skb)->frags[0].page_offset;
- NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
- }
-}
-EXPORT_SYMBOL(skb_gro_reset_offset);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
+ trace_napi_gro_receive_entry(skb);
+
skb_gro_reset_offset(skb);
- return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
+ return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
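
A skeleton NAPI poll loop showing the usual napi_gro_receive() caller; example_next_rx_skb() stands in for a driver's RX ring dequeue and is stubbed out so the sketch is self-contained:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Driver-specific RX ring dequeue; stubbed out for this sketch. */
static struct sk_buff *example_next_rx_skb(struct napi_struct *napi)
{
	return NULL;
}

/* Poll callback: feed completed RX buffers into GRO, then re-arm IRQs. */
static int example_napi_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int done = 0;

	while (done < budget && (skb = example_next_rx_skb(napi)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);
		done++;
	}

	if (done < budget)
		napi_complete(napi);	/* re-enable RX interrupts here */
	return done;
}
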
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
__skb_pull(skb, skb_headlen(skb));
- skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
napi->skb = skb;
}
@@ -3392,24 +4109,22 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
if (!skb) {
skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
- if (skb)
- napi->skb = skb;
+ napi->skb = skb;
}
return skb;
}
EXPORT_SYMBOL(napi_get_frags);
-gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
- gro_result_t ret)
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
-
- if (ret == GRO_HELD)
- skb_gro_pull(skb, -ETH_HLEN);
- else if (netif_receive_skb(skb))
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
@@ -3424,44 +4139,45 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
return ret;
}
-EXPORT_SYMBOL(napi_frags_finish);
-struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+/* The upper GRO stack assumes the network header starts at gro_offset=0.
+ * Drivers could call both napi_gro_frags() and napi_gro_receive().
+ * We copy the ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
- struct ethhdr *eth;
- unsigned int hlen;
- unsigned int off;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
napi->skb = NULL;
skb_reset_mac_header(skb);
skb_gro_reset_offset(skb);
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*eth);
- eth = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- eth = skb_gro_header_slow(skb, hlen, off);
+ eth = skb_gro_header_fast(skb, 0);
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
napi_reuse_skb(napi, skb);
- skb = NULL;
- goto out;
+ return NULL;
}
+ } else {
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
-
- skb_gro_pull(skb, sizeof(*eth));
+ __skb_pull(skb, hlen);
/*
* This works because the only protocols we care about don't require
- * special handling. We'll fix it up properly at the end.
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
*/
skb->protocol = eth->h_proto;
-out:
return skb;
}
-EXPORT_SYMBOL(napi_frags_skb);
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
@@ -3470,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
if (!skb)
return GRO_DROP;
- return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
+ trace_napi_gro_frags_entry(skb);
+
+ return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
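/* Editorial sketch, not part of this patch: the napi_gro_frags() usage
 * pattern that the code above supports.  A driver asks for a skb with
 * napi_get_frags(), attaches its receive page fragment, and hands the
 * skb back through napi_gro_frags(), which pulls the ethernet header in
 * napi_frags_skb() as shown above.  The page/offset/len arguments stand
 * in for hypothetical rx descriptor fields.
 */
static void my_rx_one_frag(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;		/* allocation failed, drop this buffer */

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	napi_gro_frags(napi);
}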
/*
- * net_rps_action sends any pending IPI's for rps.
+ * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
* Note: called with local irq disabled, but exits with local irq enabled.
*/
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
@@ -3493,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
- __smp_call_function_single(remsd->cpu,
- &remsd->csd, 0);
+ smp_call_function_single_async(remsd->cpu,
+ &remsd->csd);
remsd = next;
}
} else
@@ -3518,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
#endif
napi->weight = weight_p;
local_irq_disable();
- while (work < quota) {
+ while (1) {
struct sk_buff *skb;
- unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
@@ -3534,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
rps_lock(sd);
- qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen)
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
-
- if (qlen < quota - work) {
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
- * and NAPI_STATE_SCHED is the only possible flag set on backlog.
- * we can use a plain write instead of clear_bit(),
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&napi->poll_list);
napi->state = 0;
+ rps_unlock(sd);
- quota = work + qlen;
+ break;
}
+
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
rps_unlock(sd);
}
local_irq_enable();
@@ -3581,7 +4298,7 @@ void __napi_complete(struct napi_struct *n)
BUG_ON(n->gro_list);
list_del(&n->poll_list);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
@@ -3597,13 +4314,65 @@ void napi_complete(struct napi_struct *n)
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
return;
- napi_gro_flush(n);
+ napi_gro_flush(n, false);
local_irq_save(flags);
__napi_complete(n);
local_irq_restore(flags);
}
EXPORT_SYMBOL(napi_complete);
+/* must be called under rcu_read_lock(), as we dont take a reference */
+struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(napi_by_id);
+
+void napi_hash_add(struct napi_struct *napi)
+{
+ if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0 is not a valid id; we also skip an id that is already taken.
+ * We expect both events to be extremely rare.
+ */
+ napi->napi_id = 0;
+ while (!napi->napi_id) {
+ napi->napi_id = ++napi_gen_id;
+ if (napi_by_id(napi->napi_id))
+ napi->napi_id = 0;
+ }
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(napi_hash_add);
+
+/* Warning: the caller is responsible for making sure an RCU grace period
+ * has elapsed before freeing the memory containing @napi
+ */
+void napi_hash_del(struct napi_struct *napi)
+{
+ spin_lock(&napi_hash_lock);
+
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+ hlist_del_rcu(&napi->napi_hash_node);
+
+ spin_unlock(&napi_hash_lock);
+}
+EXPORT_SYMBOL_GPL(napi_hash_del);
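/* Editorial sketch, not part of this patch: how a driver wires a NAPI
 * context into the busy-poll hash introduced above.  In-tree users do
 * this next to netif_napi_add()/netif_napi_del(); the my_* names here
 * are hypothetical.
 */
static void my_setup_rx_napi(struct net_device *dev, struct napi_struct *napi,
			     int (*poll)(struct napi_struct *, int))
{
	netif_napi_add(dev, napi, poll, NAPI_POLL_WEIGHT);
	napi_hash_add(napi);
}

static void my_teardown_rx_napi(struct napi_struct *napi)
{
	napi_hash_del(napi);
	synchronize_net();	/* respect the RCU grace period noted above */
	netif_napi_del(napi);
}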
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -3612,6 +4381,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->gro_list = NULL;
napi->skb = NULL;
napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ pr_err_once("netif_napi_add() called with weight %d on device %s\n",
+ weight, dev->name);
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
@@ -3625,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add);
void netif_napi_del(struct napi_struct *napi)
{
- struct sk_buff *skb, *next;
-
list_del_init(&napi->dev_list);
napi_free_frags(napi);
- for (skb = napi->gro_list; skb; skb = next) {
- next = skb->next;
- skb->next = NULL;
- kfree_skb(skb);
- }
-
+ kfree_skb_list(napi->gro_list);
napi->gro_list = NULL;
napi->gro_count = 0;
}
@@ -3658,7 +4423,7 @@ static void net_rx_action(struct softirq_action *h)
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
- if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
+ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
@@ -3678,7 +4443,7 @@ static void net_rx_action(struct softirq_action *h)
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
- * accidently calling ->poll() when NAPI is not scheduled.
+ * accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
@@ -3702,8 +4467,17 @@ static void net_rx_action(struct softirq_action *h)
local_irq_enable();
napi_complete(n);
local_irq_disable();
- } else
+ } else {
+ if (n->gro_list) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ local_irq_enable();
+ napi_gro_flush(n, HZ >= 1000);
+ local_irq_disable();
+ }
list_move_tail(&n->poll_list, &sd->poll_list);
+ }
}
netpoll_poll_unlock(have);
@@ -3727,488 +4501,721 @@ softnet_break:
goto out;
}
-static gifconf_func_t *gifconf_list[NPROTO];
+struct netdev_adjacent {
+ struct net_device *dev;
+
+ /* upper master flag, there can only be one master device per list */
+ bool master;
+
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
+
+ /* private field for the users */
+ void *private;
+
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ list_for_each_entry(adj, adj_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
+ }
+ return NULL;
+}
/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
*
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
+ * Find out if a device is linked to the specified upper device and return true
+ * if it is. Note that this checks only the immediate upper device,
+ * not through a complete stack of devices. The caller must hold the RTNL lock.
*/
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
+bool netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
+ ASSERT_RTNL();
+ return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev);
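/* Editorial sketch, not part of this patch: a caller-side view of
 * netdev_has_upper_dev().  The helper below is hypothetical; it only
 * shows that the check must run under RTNL, as the ASSERT_RTNL() above
 * requires.
 */
static bool my_dev_is_under(struct net_device *dev, struct net_device *upper)
{
	bool linked;

	rtnl_lock();
	linked = netdev_has_upper_dev(dev, upper);
	rtnl_unlock();

	return linked;
}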
-/*
- * Map an interface index to its name (SIOCGIFNAME)
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
*/
+static bool netdev_has_any_upper_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
-/*
- * We need this ioctl for efficient implementation of the
- * if_indextoname() function required by the IPv6 API. Without
- * it, we would have to search all the interfaces to find a
- * match. --pb
- */
+ return !list_empty(&dev->all_adj_list.upper);
+}
-static int dev_ifname(struct net *net, struct ifreq __user *arg)
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
+ */
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
- struct net_device *dev;
- struct ifreq ifr;
+ struct netdev_adjacent *upper;
- /*
- * Fetch the caller's info block.
- */
+ ASSERT_RTNL();
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
- rcu_read_lock();
- dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
- if (!dev) {
- rcu_read_unlock();
- return -ENODEV;
- }
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
- strcpy(ifr.ifr_name, dev->name);
- rcu_read_unlock();
+void *netdev_adjacent_get_private(struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
- if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
- return -EFAULT;
- return 0;
+ adj = list_entry(adj_list, struct netdev_adjacent, list);
+
+ return adj->private;
}
+EXPORT_SYMBOL(netdev_adjacent_get_private);
-/*
- * Perform a SIOCGIFCONF call. This structure will change
- * size eventually, and there is nothing I can do about it.
- * Thus we will need a 'compatibility mode'.
+/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
*/
-
-static int dev_ifconf(struct net *net, char __user *arg)
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
- struct ifconf ifc;
- struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
-
- /*
- * Fetch the caller's info block.
- */
+ struct netdev_adjacent *upper;
- if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
- return -EFAULT;
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
- pos = ifc.ifc_buf;
- len = ifc.ifc_len;
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
- /*
- * Loop over the interfaces, and write an info block for each.
- */
-
- total = 0;
- for_each_netdev(net, dev) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total);
- if (done < 0)
- return -EFAULT;
- total += done;
- }
- }
- }
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
- /*
- * All done. Write the updated control block back to the caller.
- */
- ifc.ifc_len = total;
+ *iter = &upper->list;
- /*
- * Both BSD and Solaris return 0 here, so we do too.
- */
- return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+ return upper->dev;
}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
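/* Editorial sketch, not part of this patch: walking the immediate upper
 * devices with the iterator above.  The iterator starts at the list head
 * and is advanced by the helper; the whole walk must stay inside
 * rcu_read_lock().  my_dump_uppers() is a hypothetical caller.
 */
static void my_dump_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_debug("%s has upper device %s\n", dev->name, upper->name);
	rcu_read_unlock();
}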
-#ifdef CONFIG_PROC_FS
-/*
- * This is invoked by the /proc filesystem handler to display a device
- * in detail.
+/**
+ * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
*/
-void *dev_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
+struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
- struct net *net = seq_file_net(seq);
- loff_t off;
- struct net_device *dev;
+ struct netdev_adjacent *upper;
- rcu_read_lock();
- if (!*pos)
- return SEQ_START_TOKEN;
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
- off = 1;
- for_each_netdev_rcu(net, dev)
- if (off++ == *pos)
- return dev;
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
- return NULL;
-}
+ if (&upper->list == &dev->all_adj_list.upper)
+ return NULL;
-void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct net_device *dev = (v == SEQ_START_TOKEN) ?
- first_net_device(seq_file_net(seq)) :
- next_net_device((struct net_device *)v);
+ *iter = &upper->list;
- ++*pos;
- return rcu_dereference(dev);
+ return upper->dev;
}
+EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
-void dev_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
+/**
+ * netdev_lower_get_next_private - Get the next ->private from the
+ * lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must either hold the
+ * RTNL lock or its own locking that guarantees that the neighbour lower
+ * list will remain unchanged.
+ */
+void *netdev_lower_get_next_private(struct net_device *dev,
+ struct list_head **iter)
{
- rcu_read_unlock();
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->private;
}
+EXPORT_SYMBOL(netdev_lower_get_next_private);
-static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+/**
+ * netdev_lower_get_next_private_rcu - Get the next ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_next_private_rcu(struct net_device *dev,
+ struct list_head **iter)
{
- struct rtnl_link_stats64 temp;
- const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+ struct netdev_adjacent *lower;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
- seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
- "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
- dev->name, stats->rx_bytes, stats->rx_packets,
- stats->rx_errors,
- stats->rx_dropped + stats->rx_missed_errors,
- stats->rx_fifo_errors,
- stats->rx_length_errors + stats->rx_over_errors +
- stats->rx_crc_errors + stats->rx_frame_errors,
- stats->rx_compressed, stats->multicast,
- stats->tx_bytes, stats->tx_packets,
- stats->tx_errors, stats->tx_dropped,
- stats->tx_fifo_errors, stats->collisions,
- stats->tx_carrier_errors +
- stats->tx_aborted_errors +
- stats->tx_window_errors +
- stats->tx_heartbeat_errors,
- stats->tx_compressed);
+ *iter = &lower->list;
+
+ return lower->private;
}
+EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
-/*
- * Called from the PROCfs module. This now uses the new arbitrary sized
- * /proc/net interface to create /proc/net/dev
+/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
*/
-static int dev_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Inter-| Receive "
- " | Transmit\n"
- " face |bytes packets errs drop fifo frame "
- "compressed multicast|bytes packets errs "
- "drop fifo colls carrier compressed\n");
- else
- dev_seq_printf_stats(seq, v);
- return 0;
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
}
+EXPORT_SYMBOL(netdev_lower_get_next);
-static struct softnet_data *softnet_get_online(loff_t *pos)
+/**
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ *
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
- struct softnet_data *sd = NULL;
+ struct netdev_adjacent *lower;
- while (*pos < nr_cpu_ids)
- if (cpu_online(*pos)) {
- sd = &per_cpu(softnet_data, *pos);
- break;
- } else
- ++*pos;
- return sd;
+ lower = list_first_or_null_rcu(&dev->adj_list.lower,
+ struct netdev_adjacent, list);
+ if (lower)
+ return lower->private;
+ return NULL;
}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
-static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
- return softnet_get_online(pos);
+ struct netdev_adjacent *upper;
+
+ upper = list_first_or_null_rcu(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (upper && likely(upper->master))
+ return upper->dev;
+ return NULL;
}
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
-static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
{
- ++*pos;
- return softnet_get_online(pos);
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", adj_dev->name);
+ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
+ linkname);
}
-
-static void softnet_seq_stop(struct seq_file *seq, void *v)
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
+ char *name,
+ struct list_head *dev_list)
{
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", name);
+ sysfs_remove_link(&(dev->dev.kobj), linkname);
}
-static int softnet_seq_show(struct seq_file *seq, void *v)
+#define netdev_adjacent_is_neigh_list(dev, dev_list) \
+ (dev_list == &dev->adj_list.upper || \
+ dev_list == &dev->adj_list.lower)
+
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list,
+ void *private, bool master)
{
- struct softnet_data *sd = v;
+ struct netdev_adjacent *adj;
+ int ret;
+
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (adj) {
+ adj->ref_nr++;
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->ref_nr = 1;
+ adj->private = private;
+ dev_hold(adj_dev);
+
+ pr_debug("dev_hold for %s, because of link added from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+
+ if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
+ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
+ if (ret)
+ goto free_adj;
+ }
+
+ /* Ensure that master link is always the first item in list. */
+ if (master) {
+ ret = sysfs_create_link(&(dev->dev.kobj),
+ &(adj_dev->dev.kobj), "master");
+ if (ret)
+ goto remove_symlinks;
+
+ list_add_rcu(&adj->list, dev_list);
+ } else {
+ list_add_tail_rcu(&adj->list, dev_list);
+ }
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- sd->processed, sd->dropped, sd->time_squeeze, 0,
- 0, 0, 0, 0, /* was fastroute */
- sd->cpu_collision, sd->received_rps);
return 0;
-}
-static const struct seq_operations dev_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = dev_seq_show,
-};
+remove_symlinks:
+ if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+free_adj:
+ kfree(adj);
+ dev_put(adj_dev);
-static int dev_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &dev_seq_ops,
- sizeof(struct seq_net_private));
+ return ret;
}
-static const struct file_operations dev_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ struct netdev_adjacent *adj;
-static const struct seq_operations softnet_seq_ops = {
- .start = softnet_seq_start,
- .next = softnet_seq_next,
- .stop = softnet_seq_stop,
- .show = softnet_seq_show,
-};
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
-static int softnet_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &softnet_seq_ops);
-}
+ if (!adj) {
+ pr_err("tried to remove device %s from %s\n",
+ dev->name, adj_dev->name);
+ BUG();
+ }
-static const struct file_operations softnet_seq_fops = {
- .owner = THIS_MODULE,
- .open = softnet_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+ if (adj->ref_nr > 1) {
+ pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
+ adj->ref_nr-1);
+ adj->ref_nr--;
+ return;
+ }
+
+ if (adj->master)
+ sysfs_remove_link(&(dev->dev.kobj), "master");
-static void *ptype_get_idx(loff_t pos)
+ if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+
+ list_del_rcu(&adj->list);
+ pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+ dev_put(adj_dev);
+ kfree_rcu(adj, rcu);
+}
+
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list,
+ void *private, bool master)
{
- struct packet_type *pt = NULL;
- loff_t i = 0;
- int t;
+ int ret;
- list_for_each_entry_rcu(pt, &ptype_all, list) {
- if (i == pos)
- return pt;
- ++i;
- }
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
+ master);
+ if (ret)
+ return ret;
- for (t = 0; t < PTYPE_HASH_SIZE; t++) {
- list_for_each_entry_rcu(pt, &ptype_base[t], list) {
- if (i == pos)
- return pt;
- ++i;
- }
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
+ false);
+ if (ret) {
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ return ret;
}
- return NULL;
+
+ return 0;
}
-static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
+static int __netdev_adjacent_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
{
- rcu_read_lock();
- return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower,
+ NULL, false);
}
-static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list)
{
- struct packet_type *pt;
- struct list_head *nxt;
- int hash;
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
+}
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ptype_get_idx(0);
+static void __netdev_adjacent_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower);
+}
- pt = v;
- nxt = pt->list.next;
- if (pt->type == htons(ETH_P_ALL)) {
- if (nxt != &ptype_all)
- goto found;
- hash = 0;
- nxt = ptype_base[0].next;
- } else
- hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private, bool master)
+{
+ int ret = __netdev_adjacent_dev_link(dev, upper_dev);
- while (nxt == &ptype_base[hash]) {
- if (++hash >= PTYPE_HASH_SIZE)
- return NULL;
- nxt = ptype_base[hash].next;
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
+ if (ret) {
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ return ret;
}
-found:
- return list_entry(nxt, struct packet_type, list);
+
+ return 0;
}
-static void ptype_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+ struct net_device *upper_dev)
{
- rcu_read_unlock();
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower);
}
-static int ptype_seq_show(struct seq_file *seq, void *v)
+static int __netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev, bool master,
+ void *private)
{
- struct packet_type *pt = v;
+ struct netdev_adjacent *i, *j, *to_i, *to_j;
+ int ret = 0;
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Type Device Function\n");
- else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
- if (pt->type == htons(ETH_P_ALL))
- seq_puts(seq, "ALL ");
- else
- seq_printf(seq, "%04x", ntohs(pt->type));
+ ASSERT_RTNL();
- seq_printf(seq, " %-8s %pF\n",
- pt->dev ? pt->dev->name : "", pt->func);
+ if (dev == upper_dev)
+ return -EBUSY;
+
+ /* To prevent loops, check that dev is not an upper device of upper_dev. */
+ if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
+ return -EBUSY;
+
+ if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
+ return -EEXIST;
+
+ if (master && netdev_master_upper_dev_get(dev))
+ return -EBUSY;
+
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+ master);
+ if (ret)
+ return ret;
+
+ /* Now that we linked these devs, make all the upper_dev's
+ * all_adj_list.upper visible to every dev's all_adj_list.lower and vice
+ * versa, and don't forget the devices themselves. All of these
+ * links are non-neighbours.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("Interlinking %s with %s, non-neighbour\n",
+ i->dev->name, j->dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ if (ret)
+ goto rollback_mesh;
+ }
+ }
+
+ /* add dev to every upper_dev's upper device */
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("linking %s's upper device %s with %s\n",
+ upper_dev->name, i->dev->name, dev->name);
+ ret = __netdev_adjacent_dev_link(dev, i->dev);
+ if (ret)
+ goto rollback_upper_mesh;
}
+ /* add upper_dev to every dev's lower device */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ pr_debug("linking %s's lower device %s with %s\n", dev->name,
+ i->dev->name, upper_dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ if (ret)
+ goto rollback_lower_mesh;
+ }
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
return 0;
-}
-static const struct seq_operations ptype_seq_ops = {
- .start = ptype_seq_start,
- .next = ptype_seq_next,
- .stop = ptype_seq_stop,
- .show = ptype_seq_show,
-};
+rollback_lower_mesh:
+ to_i = i;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ }
+
+ i = NULL;
-static int ptype_seq_open(struct inode *inode, struct file *file)
+rollback_upper_mesh:
+ to_i = i;
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+ }
+
+ i = j = NULL;
+
+rollback_mesh:
+ to_i = i;
+ to_j = j;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i && j == to_j)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ }
+ if (i == to_i)
+ break;
+ }
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ return ret;
+}
+
+/**
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
{
- return seq_open_net(inode, file, &ptype_seq_ops,
- sizeof(struct seq_net_private));
+ return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
}
+EXPORT_SYMBOL(netdev_upper_dev_link);
-static const struct file_operations ptype_seq_fops = {
- .owner = THIS_MODULE,
- .open = ptype_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
+/**
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
+ */
+int netdev_master_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
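/* Editorial sketch, not part of this patch: a bonding-style enslave path
 * built on the helpers above.  my_enslave() is hypothetical and keeps
 * only the linking logic; a real driver would do its own setup between
 * the link and the return, and call netdev_upper_dev_unlink() on
 * failure.
 */
static int my_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	/* driver-specific slave setup would go here */

	return 0;
}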
+int netdev_master_upper_dev_link_private(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, private);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
-static int __net_init dev_proc_net_init(struct net *net)
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
{
- int rc = -ENOMEM;
+ struct netdev_adjacent *i, *j;
+ ASSERT_RTNL();
- if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
- goto out;
- if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
- goto out_dev;
- if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
- goto out_softnet;
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
- if (wext_proc_init(net))
- goto out_ptype;
- rc = 0;
-out:
- return rc;
-out_ptype:
- proc_net_remove(net, "ptype");
-out_softnet:
- proc_net_remove(net, "softnet_stat");
-out_dev:
- proc_net_remove(net, "dev");
- goto out;
+ /* Here is the tricky part. We must remove all dev's lower
+ * devices from all upper_dev's upper devices and vice
+ * versa, to maintain the graph relationship.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+ /* also remove the devices themselves from the lower/upper device
+ * lists
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
}
+EXPORT_SYMBOL(netdev_upper_dev_unlink);
-static void __net_exit dev_proc_net_exit(struct net *net)
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
- wext_proc_exit(net);
+ struct netdev_adjacent *iter;
- proc_net_remove(net, "ptype");
- proc_net_remove(net, "softnet_stat");
- proc_net_remove(net, "dev");
-}
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ }
-static struct pernet_operations __net_initdata dev_proc_ops = {
- .init = dev_proc_net_init,
- .exit = dev_proc_net_exit,
-};
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ }
+}
-static int __init dev_proc_init(void)
+void *netdev_lower_dev_get_private(struct net_device *dev,
+ struct net_device *lower_dev)
{
- return register_pernet_subsys(&dev_proc_ops);
+ struct netdev_adjacent *lower;
+
+ if (!lower_dev)
+ return NULL;
+ lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+ if (!lower)
+ return NULL;
+
+ return lower->private;
}
-#else
-#define dev_proc_init() 0
-#endif /* CONFIG_PROC_FS */
+EXPORT_SYMBOL(netdev_lower_dev_get_private);
-/**
- * netdev_set_master - set up master/slave pair
- * @slave: slave device
- * @master: new master device
- *
- * Changes the master device of the slave. Pass %NULL to break the
- * bonding. The caller must hold the RTNL semaphore. On a failure
- * a negative errno code is returned. On success the reference counts
- * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
- * function returns zero.
- */
-int netdev_set_master(struct net_device *slave, struct net_device *master)
+int dev_get_nest_level(struct net_device *dev,
+ bool (*type_check)(struct net_device *dev))
{
- struct net_device *old = slave->master;
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
ASSERT_RTNL();
- if (master) {
- if (old)
- return -EBUSY;
- dev_hold(master);
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower, type_check);
+ if (max_nest < nest)
+ max_nest = nest;
}
- slave->master = master;
+ if (type_check(dev))
+ max_nest++;
- if (old) {
- synchronize_net();
- dev_put(old);
- }
- if (master)
- slave->flags |= IFF_SLAVE;
- else
- slave->flags &= ~IFF_SLAVE;
-
- rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
- return 0;
+ return max_nest;
}
-EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(dev_get_nest_level);
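/* Editorial sketch, not part of this patch: how a stacked device type
 * might use dev_get_nest_level() to pick a lockdep subclass.  The
 * predicate keys off IFF_MACVLAN purely as an example; the my_* names
 * are hypothetical.
 */
static bool my_is_stacked_dev(struct net_device *dev)
{
	return !!(dev->priv_flags & IFF_MACVLAN);
}

static int my_nest_level(struct net_device *dev)
{
	int nest;

	rtnl_lock();
	nest = dev_get_nest_level(dev, my_is_stacked_dev);
	rtnl_unlock();

	return nest;
}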
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
- if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
+ if (ops->ndo_change_rx_flags)
ops->ndo_change_rx_flags(dev, flags);
}
-static int __dev_set_promiscuity(struct net_device *dev, int inc)
+static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
- unsigned short old_flags = dev->flags;
- uid_t uid;
- gid_t gid;
+ unsigned int old_flags = dev->flags;
+ kuid_t uid;
+ kgid_t gid;
ASSERT_RTNL();
@@ -4223,16 +5230,15 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
dev->flags &= ~IFF_PROMISC;
else {
dev->promiscuity -= inc;
- printk(KERN_WARNING "%s: promiscuity touches roof, "
- "set promiscuity failed, promiscuity feature "
- "of device might be broken.\n", dev->name);
+ pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
+ dev->name);
return -EOVERFLOW;
}
}
if (dev->flags != old_flags) {
- printk(KERN_INFO "device %s %s promiscuous mode\n",
- dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
- "left");
+ pr_info("device %s %s promiscuous mode\n",
+ dev->name,
+ dev->flags & IFF_PROMISC ? "entered" : "left");
if (audit_enabled) {
current_uid_gid(&uid, &gid);
audit_log(current->audit_context, GFP_ATOMIC,
@@ -4240,13 +5246,16 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
dev->name, (dev->flags & IFF_PROMISC),
(old_flags & IFF_PROMISC),
- audit_get_loginuid(current),
- uid, gid,
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
audit_get_sessionid(current));
}
dev_change_rx_flags(dev, IFF_PROMISC);
}
+ if (notify)
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC);
return 0;
}
@@ -4263,10 +5272,10 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
*/
int dev_set_promiscuity(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
int err;
- err = __dev_set_promiscuity(dev, inc);
+ err = __dev_set_promiscuity(dev, inc, true);
if (err < 0)
return err;
if (dev->flags != old_flags)
@@ -4275,22 +5284,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
}
EXPORT_SYMBOL(dev_set_promiscuity);
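/* Editorial sketch, not part of this patch: a packet-tap style user of
 * the promiscuity counter.  The counter is taken while the (purely
 * hypothetical) tap is active and dropped on close; both calls must run
 * under RTNL.
 */
static int my_tap_open(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);
}

static void my_tap_close(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);
}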
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
+static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
ASSERT_RTNL();
@@ -4305,18 +5301,38 @@ int dev_set_allmulti(struct net_device *dev, int inc)
dev->flags &= ~IFF_ALLMULTI;
else {
dev->allmulti -= inc;
- printk(KERN_WARNING "%s: allmulti touches roof, "
- "set allmulti failed, allmulti feature of "
- "device might be broken.\n", dev->name);
+ pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
+ dev->name);
return -EOVERFLOW;
}
}
if (dev->flags ^ old_flags) {
dev_change_rx_flags(dev, IFF_ALLMULTI);
dev_set_rx_mode(dev);
+ if (notify)
+ __dev_notify_flags(dev, old_flags,
+ dev->gflags ^ old_gflags);
}
return 0;
}
+
+/**
+ * dev_set_allmulti - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface keeps listening
+ * to all multicast frames. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ * Return 0 if successful or a negative errno code on error.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ return __dev_set_allmulti(dev, inc, true);
+}
EXPORT_SYMBOL(dev_set_allmulti);
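/* Editorial sketch, not part of this patch: the matching usage of the
 * allmulti counter.  A hypothetical listener takes a reference while it
 * is bound and drops it on teardown, again under RTNL.
 */
static int my_listener_start(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_allmulti(dev, 1);
}

static void my_listener_stop(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_allmulti(dev, -1);
}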
/*
@@ -4336,23 +5352,21 @@ void __dev_set_rx_mode(struct net_device *dev)
if (!netif_device_present(dev))
return;
- if (ops->ndo_set_rx_mode)
- ops->ndo_set_rx_mode(dev);
- else {
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
- __dev_set_promiscuity(dev, 1);
- dev->uc_promisc = 1;
+ __dev_set_promiscuity(dev, 1, false);
+ dev->uc_promisc = true;
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
- __dev_set_promiscuity(dev, -1);
- dev->uc_promisc = 0;
+ __dev_set_promiscuity(dev, -1, false);
+ dev->uc_promisc = false;
}
-
- if (ops->ndo_set_multicast_list)
- ops->ndo_set_multicast_list(dev);
}
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
}
void dev_set_rx_mode(struct net_device *dev)
@@ -4368,9 +5382,9 @@ void dev_set_rx_mode(struct net_device *dev)
*
* Get the combination of flag bits exported through APIs to userspace.
*/
-unsigned dev_get_flags(const struct net_device *dev)
+unsigned int dev_get_flags(const struct net_device *dev)
{
- unsigned flags;
+ unsigned int flags;
flags = (dev->flags & ~(IFF_PROMISC |
IFF_ALLMULTI |
@@ -4395,7 +5409,7 @@ EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
- int old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
int ret;
ASSERT_RTNL();
@@ -4435,9 +5449,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
+ unsigned int old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
- dev_set_promiscuity(dev, inc);
+
+ if (__dev_set_promiscuity(dev, inc, false) >= 0)
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
@@ -4448,16 +5466,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- dev_set_allmulti(dev, inc);
+ __dev_set_allmulti(dev, inc, false);
}
return ret;
}
-void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges)
{
unsigned int changes = dev->flags ^ old_flags;
+ if (gchanges)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+
if (changes & IFF_UP) {
if (dev->flags & IFF_UP)
call_netdevice_notifiers(NETDEV_UP, dev);
@@ -4466,8 +5488,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
}
if (dev->flags & IFF_UP &&
- (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
- call_netdevice_notifiers(NETDEV_CHANGE, dev);
+ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = changes;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ }
}
/**
@@ -4478,24 +5505,32 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
-int dev_change_flags(struct net_device *dev, unsigned flags)
+int dev_change_flags(struct net_device *dev, unsigned int flags)
{
- int ret, changes;
- int old_flags = dev->flags;
+ int ret;
+ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
ret = __dev_change_flags(dev, flags);
if (ret < 0)
return ret;
- changes = old_flags ^ dev->flags;
- if (changes)
- rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
-
- __dev_notify_flags(dev, old_flags);
+ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
+ __dev_notify_flags(dev, old_flags, changes);
return ret;
}
EXPORT_SYMBOL(dev_change_flags);
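/* Editorial sketch, not part of this patch: bringing an interface up
 * through dev_change_flags() while preserving the remaining flag bits,
 * so that the IFF_PROMISC/IFF_ALLMULTI bookkeeping above is exercised
 * only when those bits actually change.  Must run under RTNL.
 */
static int my_bring_up(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_change_flags(dev, dev->flags | IFF_UP);
}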
+static int __dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
/**
* dev_set_mtu - Change maximum transfer unit
* @dev: device
@@ -4505,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags);
*/
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
- const struct net_device_ops *ops = dev->netdev_ops;
- int err;
+ int err, orig_mtu;
if (new_mtu == dev->mtu)
return 0;
@@ -4518,19 +5552,41 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (!netif_device_present(dev))
return -ENODEV;
- err = 0;
- if (ops->ndo_change_mtu)
- err = ops->ndo_change_mtu(dev, new_mtu);
- else
- dev->mtu = new_mtu;
+ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ orig_mtu = dev->mtu;
+ err = __dev_set_mtu(dev, new_mtu);
- if (!err && dev->flags & IFF_UP)
- call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ if (!err) {
+ err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ /* setting mtu back and notifying everyone again,
+ * so that they have a chance to revert changes.
+ */
+ __dev_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ }
+ }
return err;
}
EXPORT_SYMBOL(dev_set_mtu);
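/* Editorial sketch, not part of this patch: a notifier that vetoes MTU
 * changes, showing how the NETDEV_PRECHANGEMTU event added above lets
 * dev_set_mtu() fail before anything is modified.  my_offload_active()
 * is a hypothetical per-device state check.
 */
static bool my_offload_active(struct net_device *dev)
{
	return false;	/* placeholder; a real user would track state */
}

static int my_mtu_notifier_cb(struct notifier_block *nb,
			      unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_PRECHANGEMTU && my_offload_active(dev))
		return notifier_from_errno(-EBUSY);

	return NOTIFY_DONE;
}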
/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
@@ -4549,365 +5605,51 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
if (!netif_device_present(dev))
return -ENODEV;
err = ops->ndo_set_mac_address(dev, sa);
- if (!err)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
- return err;
+ if (err)
+ return err;
+ dev->addr_assign_type = NET_ADDR_SET;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
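/* Editorial sketch, not part of this patch: setting a MAC address from a
 * raw byte array through dev_set_mac_address(), the way an ioctl or
 * rtnetlink caller would.  my_set_mac() is hypothetical and must run
 * under RTNL.
 */
static int my_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	return dev_set_mac_address(dev, &sa);
}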
-/*
- * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
- */
-static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
-{
- int err;
- struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
-
- if (!dev)
- return -ENODEV;
-
- switch (cmd) {
- case SIOCGIFFLAGS: /* Get interface flags */
- ifr->ifr_flags = (short) dev_get_flags(dev);
- return 0;
-
- case SIOCGIFMETRIC: /* Get the metric on the interface
- (currently unused) */
- ifr->ifr_metric = 0;
- return 0;
-
- case SIOCGIFMTU: /* Get the MTU of a device */
- ifr->ifr_mtu = dev->mtu;
- return 0;
-
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
-
- case SIOCGIFSLAVE:
- err = -EINVAL;
- break;
-
- case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
-
- case SIOCGIFINDEX:
- ifr->ifr_ifindex = dev->ifindex;
- return 0;
-
- case SIOCGIFTXQLEN:
- ifr->ifr_qlen = dev->tx_queue_len;
- return 0;
-
- default:
- /* dev_ioctl() should ensure this case
- * is never reached
- */
- WARN_ON(1);
- err = -EINVAL;
- break;
-
- }
- return err;
-}
-
-/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+/**
+ * dev_change_carrier - Change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
*/
-static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
- int err;
- struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
- const struct net_device_ops *ops;
-
- if (!dev)
- return -ENODEV;
-
- ops = dev->netdev_ops;
-
- switch (cmd) {
- case SIOCSIFFLAGS: /* Set interface flags */
- return dev_change_flags(dev, ifr->ifr_flags);
-
- case SIOCSIFMETRIC: /* Set the metric on the interface
- (currently unused) */
- return -EOPNOTSUPP;
-
- case SIOCSIFMTU: /* Set the MTU of a device */
- return dev_set_mtu(dev, ifr->ifr_mtu);
-
- case SIOCSIFHWADDR:
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
-
- case SIOCSIFHWBROADCAST:
- if (ifr->ifr_hwaddr.sa_family != dev->type)
- return -EINVAL;
- memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
- return 0;
+ const struct net_device_ops *ops = dev->netdev_ops;
- case SIOCSIFMAP:
- if (ops->ndo_set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return ops->ndo_set_config(dev, &ifr->ifr_map);
- }
+ if (!ops->ndo_change_carrier)
return -EOPNOTSUPP;
-
- case SIOCADDMULTI:
- if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
-
- case SIOCDELMULTI:
- if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
-
- case SIOCSIFTXQLEN:
- if (ifr->ifr_qlen < 0)
- return -EINVAL;
- dev->tx_queue_len = ifr->ifr_qlen;
- return 0;
-
- case SIOCSIFNAME:
- ifr->ifr_newname[IFNAMSIZ-1] = '\0';
- return dev_change_name(dev, ifr->ifr_newname);
-
- /*
- * Unknown or private ioctl
- */
- default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
- cmd == SIOCBONDRELEASE ||
- cmd == SIOCBONDSETHWADDR ||
- cmd == SIOCBONDSLAVEINFOQUERY ||
- cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCSHWTSTAMP ||
- cmd == SIOCWANDEV) {
- err = -EOPNOTSUPP;
- if (ops->ndo_do_ioctl) {
- if (netif_device_present(dev))
- err = ops->ndo_do_ioctl(dev, ifr, cmd);
- else
- err = -ENODEV;
- }
- } else
- err = -EINVAL;
-
- }
- return err;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_carrier(dev, new_carrier);
}
-
-/*
- * This function handles all "interface"-type I/O control requests. The actual
- * 'doing' part of this is dev_ifsioc above.
- */
+EXPORT_SYMBOL(dev_change_carrier);
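/* Editorial sketch, not part of this patch: a minimal ndo_change_carrier()
 * implementation of the kind dev_change_carrier() above dispatches to, as
 * a software device might provide.  The function name is hypothetical.
 */
static int my_ndo_change_carrier(struct net_device *dev, bool new_carrier)
{
	if (new_carrier)
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);

	return 0;
}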
/**
- * dev_ioctl - network device ioctl
- * @net: the applicable net namespace
- * @cmd: command to issue
- * @arg: pointer to a struct ifreq in user space
+ * dev_get_phys_port_id - Get device physical port ID
+ * @dev: device
+ * @ppid: port ID
*
- * Issue ioctl functions to devices. This is normally called by the
- * user space syscall interfaces but can sometimes be useful for
- * other purposes. The return value is the return from the syscall if
- * positive or a negative errno code on error.
+ * Get device physical port ID
*/
-
-int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_port_id *ppid)
{
- struct ifreq ifr;
- int ret;
- char *colon;
-
- /* One special case: SIOCGIFCONF takes ifconf argument
- and requires shared lock, because it sleeps writing
- to user space.
- */
-
- if (cmd == SIOCGIFCONF) {
- rtnl_lock();
- ret = dev_ifconf(net, (char __user *) arg);
- rtnl_unlock();
- return ret;
- }
- if (cmd == SIOCGIFNAME)
- return dev_ifname(net, (struct ifreq __user *)arg);
-
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
-
- ifr.ifr_name[IFNAMSIZ-1] = 0;
-
- colon = strchr(ifr.ifr_name, ':');
- if (colon)
- *colon = 0;
-
- /*
- * See which interface the caller is talking about.
- */
-
- switch (cmd) {
- /*
- * These ioctl calls:
- * - can be done by all.
- * - atomic and do not require locking.
- * - return a value
- */
- case SIOCGIFFLAGS:
- case SIOCGIFMETRIC:
- case SIOCGIFMTU:
- case SIOCGIFHWADDR:
- case SIOCGIFSLAVE:
- case SIOCGIFMAP:
- case SIOCGIFINDEX:
- case SIOCGIFTXQLEN:
- dev_load(net, ifr.ifr_name);
- rcu_read_lock();
- ret = dev_ifsioc_locked(net, &ifr, cmd);
- rcu_read_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
-
- case SIOCETHTOOL:
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ethtool(net, &ifr);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
-
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - return a value
- */
- case SIOCGMIIPHY:
- case SIOCGMIIREG:
- case SIOCSIFNAME:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, &ifr, cmd);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
-
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - do not return a value
- */
- case SIOCSIFFLAGS:
- case SIOCSIFMETRIC:
- case SIOCSIFMTU:
- case SIOCSIFMAP:
- case SIOCSIFHWADDR:
- case SIOCSIFSLAVE:
- case SIOCADDMULTI:
- case SIOCDELMULTI:
- case SIOCSIFHWBROADCAST:
- case SIOCSIFTXQLEN:
- case SIOCSMIIREG:
- case SIOCBONDENSLAVE:
- case SIOCBONDRELEASE:
- case SIOCBONDSETHWADDR:
- case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- case SIOCSHWTSTAMP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- /* fall through */
- case SIOCBONDSLAVEINFOQUERY:
- case SIOCBONDINFOQUERY:
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, &ifr, cmd);
- rtnl_unlock();
- return ret;
-
- case SIOCGIFMEM:
- /* Get the per device memory space. We can add this but
- * currently do not support it */
- case SIOCSIFMEM:
- /* Set the per device memory buffer space.
- * Not applicable in our case */
- case SIOCSIFLINK:
- return -EINVAL;
+ const struct net_device_ops *ops = dev->netdev_ops;
- /*
- * Unknown or private ioctl.
- */
- default:
- if (cmd == SIOCWANDEV ||
- (cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15)) {
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, &ifr, cmd);
- rtnl_unlock();
- if (!ret && copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- return ret;
- }
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
- return wext_handle_ioctl(net, &ifr, cmd, arg);
- return -EINVAL;
- }
+ if (!ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_id(dev, ppid);
}
-
+EXPORT_SYMBOL(dev_get_phys_port_id);
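/* Editorial sketch, not part of this patch: querying and printing a
 * device's physical port ID with the helper above.  The %*phN format
 * prints the ID bytes as hex; my_show_phys_port_id() is hypothetical.
 */
static void my_show_phys_port_id(struct net_device *dev)
{
	struct netdev_phys_port_id ppid;

	if (dev_get_phys_port_id(dev, &ppid) == 0)
		pr_info("%s: phys port id %*phN\n",
			dev->name, ppid.id_len, ppid.id);
}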
/**
* dev_new_index - allocate an ifindex
@@ -4919,26 +5661,29 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
*/
static int dev_new_index(struct net *net)
{
- static int ifindex;
+ int ifindex = net->ifindex;
for (;;) {
if (++ifindex <= 0)
ifindex = 1;
if (!__dev_get_by_index(net, ifindex))
- return ifindex;
+ return net->ifindex = ifindex;
}
}
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
+ dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
{
struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
@@ -4949,19 +5694,23 @@ static void rollback_registered_many(struct list_head *head)
* devices and proceed with the remaining.
*/
if (dev->reg_state == NETREG_UNINITIALIZED) {
- pr_debug("unregister_netdevice: device %s/%p never "
- "was registered\n", dev->name, dev);
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
WARN_ON(1);
list_del(&dev->unreg_list);
continue;
}
-
+ dev->dismantle = true;
BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
- /* If device is running, close it first. */
- dev_close(dev);
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head);
+ list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
@@ -4980,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
-
/*
* Flush the unicast and multicast chains
*/
@@ -4993,18 +5738,22 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
- /* Notifier chain MUST detach us from master device. */
- WARN_ON(dev->master);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
}
- /* Process any work delayed until the end of the batch */
- dev = list_first_entry(head, struct net_device, unreg_list);
- call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
-
- rcu_barrier();
+ synchronize_net();
list_for_each_entry(dev, head, unreg_list)
dev_put(dev);
@@ -5016,50 +5765,144 @@ static void rollback_registered(struct net_device *dev)
list_add(&dev->unreg_list, &single);
rollback_registered_many(&single);
+ list_del(&single);
}
-unsigned long netdev_fix_features(unsigned long features, const char *name)
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
- /* Fix illegal SG+CSUM combinations. */
- if ((features & NETIF_F_SG) &&
- !(features & NETIF_F_ALL_CSUM)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
- "checksum feature.\n", name);
- features &= ~NETIF_F_SG;
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_warn(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
/* TSO requires that SG is present as well. */
- if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
- "SG feature.\n", name);
+ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
+ features &= ~NETIF_F_ALL_TSO;
+ }
+
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
+ }
+
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
}
+ /* TSO ECN requires that TSO is present as well. */
+ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+ features &= ~NETIF_F_TSO_ECN;
+
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
+
+ /* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
/* maybe split UFO into V4 and V6? */
if (!((features & NETIF_F_GEN_CSUM) ||
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no checksum offload features.\n",
- name);
+ netdev_dbg(dev,
+ "Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
}
if (!(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no NETIF_F_SG feature.\n", name);
+ netdev_dbg(dev,
+ "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
features &= ~NETIF_F_UFO;
}
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (dev->netdev_ops->ndo_busy_poll)
+ features |= NETIF_F_BUSY_POLL;
+ else
+#endif
+ features &= ~NETIF_F_BUSY_POLL;
+
return features;
}
-EXPORT_SYMBOL(netdev_fix_features);
+
+int __netdev_update_features(struct net_device *dev)
+{
+ netdev_features_t features;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ if (dev->features == features)
+ return 0;
+
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+
+ if (unlikely(err < 0)) {
+ netdev_err(dev,
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
+ return -1;
+ }
+
+ if (!err)
+ dev->features = features;
+
+ return 1;
+}
+
+/**
+ * netdev_update_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications if it
+ * has changed. Should be called after driver or hardware dependent
+ * conditions might have changed that influence the features.
+ */
+void netdev_update_features(struct net_device *dev)
+{
+ if (__netdev_update_features(dev))
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_update_features);
+
+/**
+ * netdev_change_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications even
+ * if they have not changed. Should be called instead of
+ * netdev_update_features() if also dev->vlan_features might
+ * have changed to allow the changes to be propagated to stacked
+ * VLAN devices.
+ */
+void netdev_change_features(struct net_device *dev)
+{
+ __netdev_update_features(dev);
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_change_features);
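(Editorial note: to make the kernel-doc above concrete, here is a hedged sketch of a driver recomputing features after an MTU change; the foo_* names and the TSO limit are invented for illustration.)

static int foo_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;

	/* hardware (hypothetically) cannot segment frames above this MTU */
	if (new_mtu > FOO_TSO_MTU_LIMIT)
		dev->hw_features &= ~NETIF_F_ALL_TSO;
	else
		dev->hw_features |= NETIF_F_ALL_TSO;

	/* ndo_change_mtu runs under RTNL, so the ASSERT_RTNL() inside
	 * __netdev_update_features() is satisfied */
	netdev_update_features(dev);
	return 0;
}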
/**
* netif_stacked_transfer_operstate - transfer operstate
@@ -5088,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
@@ -5097,10 +5940,9 @@ static int netif_alloc_rx_queues(struct net_device *dev)
BUG_ON(count < 1);
rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!rx) {
- pr_err("netdev: Unable to allocate %u rx queues.\n", count);
+ if (!rx)
return -ENOMEM;
- }
+
dev->_rx = rx;
for (i = 0; i < count; i++)
@@ -5116,22 +5958,31 @@ static void netdev_init_one_queue(struct net_device *dev,
spin_lock_init(&queue->_xmit_lock);
netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
queue->xmit_lock_owner = -1;
- netdev_queue_numa_node_write(queue, -1);
+ netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
+}
+
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
struct netdev_queue *tx;
+ size_t sz = count * sizeof(*tx);
- BUG_ON(count < 1);
+ BUG_ON(count < 1 || count > 0xffff);
- tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
+ tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
if (!tx) {
- pr_err("netdev: Unable to allocate %u tx queues.\n",
- count);
- return -ENOMEM;
+ tx = vzalloc(sz);
+ if (!tx)
+ return -ENOMEM;
}
dev->_tx = tx;
@@ -5177,6 +6028,10 @@ int register_netdevice(struct net_device *dev)
dev->iflink = -1;
+ ret = dev_get_valid_name(net, dev, dev->name);
+ if (ret < 0)
+ goto out;
+
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
@@ -5187,40 +6042,46 @@ int register_netdevice(struct net_device *dev)
}
}
- ret = dev_get_valid_name(dev, dev->name, 0);
- if (ret)
+ if (((dev->hw_features | dev->features) &
+ NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
+ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
+ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
+ ret = -EINVAL;
+ goto err_uninit;
+ }
+
+ ret = -EBUSY;
+ if (!dev->ifindex)
+ dev->ifindex = dev_new_index(net);
+ else if (__dev_get_by_index(net, dev->ifindex))
goto err_uninit;
- dev->ifindex = dev_new_index(net);
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
- /* Fix illegal checksum combinations */
- if ((dev->features & NETIF_F_HW_CSUM) &&
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
- }
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->features |= NETIF_F_SOFT_FEATURES;
+ dev->wanted_features = dev->features & dev->hw_features;
- if ((dev->features & NETIF_F_NO_CSUM) &&
- (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ if (!(dev->flags & IFF_LOOPBACK)) {
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
}
- dev->features = netdev_fix_features(dev->features, dev->name);
+ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
+ */
+ dev->vlan_features |= NETIF_F_HIGHDMA;
- /* Enable software GSO if SG is supported. */
- if (dev->features & NETIF_F_SG)
- dev->features |= NETIF_F_GSO;
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG;
- /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
- * vlan_dev_init() will do the dev->features check, so these features
- * are enabled only if supported by underlying device.
+ /* Make NETIF_F_SG inheritable to MPLS.
*/
- dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
+ dev->mpls_features |= NETIF_F_SG;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
@@ -5232,6 +6093,8 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
+ __netdev_update_features(dev);
+
/*
* Default initial state at registry is that the
* device is present.
@@ -5239,9 +6102,19 @@ int register_netdevice(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
+ linkwatch_init_dev(dev);
+
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+
+ /* If the device has a permanent device address, the driver should
+ * set dev_addr, and addr_assign_type should be set to

+ * NET_ADDR_PERM (default value).
+ */
+ if (dev->addr_assign_type == NET_ADDR_PERM)
+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
@@ -5256,7 +6129,7 @@ int register_netdevice(struct net_device *dev)
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
@@ -5327,19 +6200,7 @@ int register_netdev(struct net_device *dev)
int err;
rtnl_lock();
-
- /*
- * If the name is a format string the caller wants us to do a
- * name allocation.
- */
- if (strchr(dev->name, '%')) {
- err = dev_alloc_name(dev, dev->name);
- if (err < 0)
- goto out;
- }
-
err = register_netdevice(dev);
-out:
rtnl_unlock();
return err;
}
@@ -5355,8 +6216,9 @@ int netdev_refcnt_read(const struct net_device *dev)
}
EXPORT_SYMBOL(netdev_refcnt_read);
-/*
+/**
* netdev_wait_allrefs - wait until all references are gone.
+ * @dev: target net_device
*
* This is called when unregistering network devices.
*
@@ -5382,9 +6244,12 @@ static void netdev_wait_allrefs(struct net_device *dev)
/* Rebroadcast unregister notification */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
- * should have already handle it the first time */
+ __rtnl_unlock();
+ rcu_barrier();
+ rtnl_lock();
+
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) {
/* We must not have linkwatch events
@@ -5406,10 +6271,8 @@ static void netdev_wait_allrefs(struct net_device *dev)
refcnt = netdev_refcnt_read(dev);
if (time_after(jiffies, warning_time + 10 * HZ)) {
- printk(KERN_EMERG "unregister_netdevice: "
- "waiting for %s to become free. Usage "
- "count = %d\n",
- dev->name, refcnt);
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -5448,13 +6311,22 @@ void netdev_run_todo(void)
__rtnl_unlock();
+
+ /* Wait for rcu callbacks to finish before next phase */
+ if (!list_empty(&list))
+ rcu_barrier();
+
while (!list_empty(&list)) {
struct net_device *dev
= list_first_entry(&list, struct net_device, todo_list);
list_del(&dev->todo_list);
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ __rtnl_unlock();
+
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
- printk(KERN_ERR "network todo '%s' but state %d\n",
+ pr_err("network todo '%s' but state %d\n",
dev->name, dev->reg_state);
dump_stack();
continue;
@@ -5468,55 +6340,33 @@ void netdev_run_todo(void)
/* paranoia */
BUG_ON(netdev_refcnt_read(dev));
- WARN_ON(rcu_dereference_raw(dev->ip_ptr));
- WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
if (dev->destructor)
dev->destructor(dev);
+ /* Report a network device has been unregistered */
+ rtnl_lock();
+ dev_net(dev)->dev_unreg_count--;
+ __rtnl_unlock();
+ wake_up(&netdev_unregistering_wq);
+
/* Free network device */
kobject_put(&dev->dev.kobj);
}
}
-/**
- * dev_txq_stats_fold - fold tx_queues stats
- * @dev: device to get statistics from
- * @stats: struct rtnl_link_stats64 to hold results
- */
-void dev_txq_stats_fold(const struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
- unsigned int i;
- struct netdev_queue *txq;
-
- for (i = 0; i < dev->num_tx_queues; i++) {
- txq = netdev_get_tx_queue(dev, i);
- spin_lock_bh(&txq->_xmit_lock);
- tx_bytes += txq->tx_bytes;
- tx_packets += txq->tx_packets;
- tx_dropped += txq->tx_dropped;
- spin_unlock_bh(&txq->_xmit_lock);
- }
- if (tx_bytes || tx_packets || tx_dropped) {
- stats->tx_bytes = tx_bytes;
- stats->tx_packets = tx_packets;
- stats->tx_dropped = tx_dropped;
- }
-}
-EXPORT_SYMBOL(dev_txq_stats_fold);
-
/* Convert net_device_stats to rtnl_link_stats64. They have the same
* fields in the same order, with only the type differing.
*/
-static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
- const struct net_device_stats *netdev_stats)
+void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+ const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
- BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
- memcpy(stats64, netdev_stats, sizeof(*stats64));
+ BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ memcpy(stats64, netdev_stats, sizeof(*stats64));
#else
size_t i, n = sizeof(*stats64) / sizeof(u64);
const unsigned long *src = (const unsigned long *)netdev_stats;
@@ -5528,6 +6378,7 @@ static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
dst[i] = src[i];
#endif
}
+EXPORT_SYMBOL(netdev_stats_to_stats64);
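(Editorial note: netdev_stats_to_stats64() becomes exported here; a minimal sketch of a driver using it from its 64-bit stats hook, where foo_priv and the extra counter are hypothetical.)

static struct rtnl_link_stats64 *
foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* widen the legacy unsigned-long counters into the 64-bit struct */
	netdev_stats_to_stats64(storage, &dev->stats);

	/* then fold in counters the driver keeps outside dev->stats */
	storage->rx_fifo_errors += priv->rx_fifo_overflows;
	return storage;
}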
/**
* dev_get_stats - get network device statistics
@@ -5551,9 +6402,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
} else {
netdev_stats_to_stats64(storage, &dev->stats);
- dev_txq_stats_fold(dev, storage);
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -5576,19 +6427,38 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
return queue;
}
+static const struct ethtool_ops default_ethtool_ops;
+
+void netdev_set_default_ethtool_ops(struct net_device *dev,
+ const struct ethtool_ops *ops)
+{
+ if (dev->ethtool_ops == &default_ethtool_ops)
+ dev->ethtool_ops = ops;
+}
+EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
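(Editorial note: a hedged sketch of the intended use, a shared setup path offering fallback ethtool ops that only stick if the driver has not installed its own; names below are illustrative.)

static const struct ethtool_ops foo_default_ethtool_ops = {
	.get_link = ethtool_op_get_link,	/* generic carrier-state helper */
};

static void foo_common_setup(struct net_device *dev)
{
	ether_setup(dev);
	/* no-op if the driver already assigned dev->ethtool_ops */
	netdev_set_default_ethtool_ops(dev, &foo_default_ethtool_ops);
}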
+
+void netdev_freemem(struct net_device *dev)
+{
+ char *addr = (char *)dev - dev->padded;
+
+ kvfree(addr);
+}
+
/**
- * alloc_netdev_mq - allocate network device
+ * alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
- * @queue_count: the number of subqueues to allocate
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization. Also allocates subquue structs
- * for each queue on the device at the end of the netdevice.
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
-struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *), unsigned int queue_count)
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
size_t alloc_size;
@@ -5596,12 +6466,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
BUG_ON(strlen(name) >= sizeof(dev->name));
- if (queue_count < 1) {
- pr_err("alloc_netdev: Unable to allocate device "
- "with zero queues.\n");
+ if (txqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
+#ifdef CONFIG_SYSFS
+ if (rxqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
+ return NULL;
+ }
+#endif
+
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
@@ -5611,18 +6487,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kzalloc(alloc_size, GFP_KERNEL);
- if (!p) {
- printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
+ p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!p)
+ p = vzalloc(alloc_size);
+ if (!p)
return NULL;
- }
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
- goto free_p;
+ goto free_dev;
if (dev_addr_init(dev))
goto free_pcpu;
@@ -5632,42 +6508,49 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
- dev->num_tx_queues = queue_count;
- dev->real_num_tx_queues = queue_count;
- if (netif_alloc_netdev_queues(dev))
- goto free_pcpu;
-
-#ifdef CONFIG_RPS
- dev->num_rx_queues = queue_count;
- dev->real_num_rx_queues = queue_count;
- if (netif_alloc_rx_queues(dev))
- goto free_pcpu;
-#endif
-
dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_segs = GSO_MAX_SEGS;
- INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
- dev->ethtool_ntuple_list.count = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
+ INIT_LIST_HEAD(&dev->adj_list.upper);
+ INIT_LIST_HEAD(&dev->adj_list.lower);
+ INIT_LIST_HEAD(&dev->all_adj_list.upper);
+ INIT_LIST_HEAD(&dev->all_adj_list.lower);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
+
+ dev->num_tx_queues = txqs;
+ dev->real_num_tx_queues = txqs;
+ if (netif_alloc_netdev_queues(dev))
+ goto free_all;
+
+#ifdef CONFIG_SYSFS
+ dev->num_rx_queues = rxqs;
+ dev->real_num_rx_queues = rxqs;
+ if (netif_alloc_rx_queues(dev))
+ goto free_all;
+#endif
+
strcpy(dev->name, name);
+ dev->group = INIT_NETDEV_GROUP;
+ if (!dev->ethtool_ops)
+ dev->ethtool_ops = &default_ethtool_ops;
return dev;
+free_all:
+ free_netdev(dev);
+ return NULL;
+
free_pcpu:
free_percpu(dev->pcpu_refcnt);
- kfree(dev->_tx);
-#ifdef CONFIG_RPS
- kfree(dev->_rx);
-#endif
-
-free_p:
- kfree(p);
+free_dev:
+ netdev_freemem(dev);
return NULL;
}
-EXPORT_SYMBOL(alloc_netdev_mq);
+EXPORT_SYMBOL(alloc_netdev_mqs);
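(Editorial note: with the new signature the TX and RX queue counts are passed separately; a hedged sketch of allocating a multiqueue Ethernet-style device, where the queue counts and foo_priv are arbitrary.)

static struct net_device *foo_alloc(void)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
			       ether_setup, 8, 8);	/* 8 TX, 8 RX queues */
	if (!dev)
		return NULL;

	/* ... fill in netdev_ops, features, etc. before register_netdev() ... */
	return dev;
}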
/**
* free_netdev - free network device
@@ -5683,19 +6566,16 @@ void free_netdev(struct net_device *dev)
release_net(dev_net(dev));
- kfree(dev->_tx);
-#ifdef CONFIG_RPS
+ netif_free_tx_queues(dev);
+#ifdef CONFIG_SYSFS
kfree(dev->_rx);
#endif
- kfree(rcu_dereference_raw(dev->ingress_queue));
+ kfree(rcu_dereference_protected(dev->ingress_queue, 1));
/* Flush device addresses */
dev_addr_flush(dev);
- /* Clear ethtool n-tuple list */
- ethtool_ntuple_flush(dev);
-
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5704,7 +6584,7 @@ void free_netdev(struct net_device *dev)
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
- kfree((char *)dev - dev->padded);
+ netdev_freemem(dev);
return;
}
@@ -5725,7 +6605,10 @@ EXPORT_SYMBOL(free_netdev);
void synchronize_net(void)
{
might_sleep();
- synchronize_rcu();
+ if (rtnl_is_locked())
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
@@ -5759,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
+ *
+ * Note: As most callers use a stack-allocated list_head,
+ * we force a list_del() to make sure the stack won't be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -5768,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head)
rollback_registered_many(head);
list_for_each_entry(dev, head, unreg_list)
net_set_todo(dev);
+ list_del(head);
}
}
EXPORT_SYMBOL(unregister_netdevice_many);
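(Editorial note: the list_del(head) added above exists because callers usually build the list on the stack; a sketch of the expected calling pattern, with hypothetical foo_* structures.)

static void foo_destroy_all_ports(struct foo_priv *priv)
{
	LIST_HEAD(kill_list);		/* stack-allocated list head */
	int i;

	rtnl_lock();
	for (i = 0; i < priv->num_ports; i++)
		unregister_netdevice_queue(priv->port_dev[i], &kill_list);

	/* batched rollback; the final list_del() detaches kill_list from the
	 * devices' unreg_list entries so nothing keeps pointing into this
	 * stack frame afterwards */
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}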
@@ -5817,7 +6704,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
goto out;
/* Ensure the device has been registered */
- err = -EINVAL;
if (dev->reg_state != NETREG_REGISTERED)
goto out;
@@ -5834,7 +6720,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* We get here if we can't use the current device name */
if (!pat)
goto out;
- if (dev_get_valid_name(dev, pat, 1))
+ if (dev_get_valid_name(net, dev, pat) < 0)
goto out;
}
@@ -5862,7 +6748,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+ rcu_barrier();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
/*
* Flush the unicast and multicast chains
@@ -5870,6 +6758,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev_uc_flush(dev);
dev_mc_flush(dev);
+ /* Send a netdev-removed uevent to the old namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+
/* Actually switch the network namespace */
dev_net_set(dev, net);
@@ -5881,6 +6772,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev->iflink = dev->ifindex;
}
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
@@ -5895,7 +6789,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
synchronize_net();
err = 0;
@@ -5936,17 +6830,22 @@ static int dev_cpu_callback(struct notifier_block *nfb,
oldsd->output_queue = NULL;
oldsd->output_queue_tailp = &oldsd->output_queue;
}
+ /* Append NAPI poll list from offline CPU. */
+ if (!list_empty(&oldsd->poll_list)) {
+ list_splice_init(&oldsd->poll_list, &sd->poll_list);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ }
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
- netif_rx(skb);
+ netif_rx_internal(skb);
input_queue_head_incr(oldsd);
}
while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
- netif_rx(skb);
+ netif_rx_internal(skb);
input_queue_head_incr(oldsd);
}
@@ -5964,38 +6863,25 @@ static int dev_cpu_callback(struct notifier_block *nfb,
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
-unsigned long netdev_increment_features(unsigned long all, unsigned long one,
- unsigned long mask)
-{
- /* If device needs checksumming, downgrade to it. */
- if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
- all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
- else if (mask & NETIF_F_ALL_CSUM) {
- /* If one device supports v4/v6 checksumming, set for all. */
- if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
- !(all & NETIF_F_GEN_CSUM)) {
- all &= ~NETIF_F_ALL_CSUM;
- all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- }
-
- /* If one device supports hw checksumming, set for all. */
- if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
- all &= ~NETIF_F_ALL_CSUM;
- all |= NETIF_F_HW_CSUM;
- }
- }
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
+{
+ if (mask & NETIF_F_GEN_CSUM)
+ mask |= NETIF_F_ALL_CSUM;
+ mask |= NETIF_F_VLAN_CHALLENGED;
- one |= NETIF_F_ALL_CSUM;
+ all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+ all &= one | ~NETIF_F_ALL_FOR_ALL;
- one |= all & NETIF_F_ONE_FOR_ALL;
- all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
- all |= one & mask & NETIF_F_ONE_FOR_ALL;
+ /* If one device supports hw checksumming, set for all. */
+ if (all & NETIF_F_GEN_CSUM)
+ all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
return all;
}
EXPORT_SYMBOL(netdev_increment_features);
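(Editorial note: a hedged sketch of the accumulation loop a stacked driver, bond/team style, might run over its slaves using this helper; the mask choice and foo_* types are illustrative only.)

static netdev_features_t foo_compute_features(struct foo_master *master)
{
	netdev_features_t mask = master->dev->vlan_features | NETIF_F_LLTX;
	netdev_features_t features = mask;	/* start from everything the mask allows */
	struct foo_slave *slave;

	list_for_each_entry(slave, &master->slaves, list)
		features = netdev_increment_features(features,
						     slave->dev->features,
						     mask);
	return features;	/* typically fed back through ndo_fix_features */
}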
-static struct hlist_head *netdev_create_hash(void)
+static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
struct hlist_head *hash;
@@ -6011,7 +6897,8 @@ static struct hlist_head *netdev_create_hash(void)
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
- INIT_LIST_HEAD(&net->dev_base_head);
+ if (net != &init_net)
+ INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
@@ -6032,29 +6919,23 @@ err_name:
/**
* netdev_drivername - network driver for the device
* @dev: network device
- * @buffer: buffer for resulting name
- * @len: size of buffer
*
* Determine network driver for device.
*/
-char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
+const char *netdev_drivername(const struct net_device *dev)
{
const struct device_driver *driver;
const struct device *parent;
-
- if (len <= 0 || !buffer)
- return buffer;
- buffer[0] = 0;
+ const char *empty = "";
parent = dev->dev.parent;
-
if (!parent)
- return buffer;
+ return empty;
driver = parent->driver;
if (driver && driver->name)
- strlcpy(buffer, driver->name, len);
- return buffer;
+ return driver->name;
+ return empty;
}
static int __netdev_printk(const char *level, const struct net_device *dev,
@@ -6062,13 +6943,18 @@ static int __netdev_printk(const char *level, const struct net_device *dev,
{
int r;
- if (dev && dev->dev.parent)
- r = dev_printk(level, dev->dev.parent, "%s: %pV",
- netdev_name(dev), vaf);
- else if (dev)
+ if (dev && dev->dev.parent) {
+ r = dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), vaf);
+ } else if (dev) {
r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
- else
+ } else {
r = printk("%s(NULL net_device): %pV", level, vaf);
+ }
return r;
}
@@ -6086,6 +6972,7 @@ int netdev_printk(const char *level, const struct net_device *dev,
vaf.va = &args;
r = __netdev_printk(level, dev, &vaf);
+
va_end(args);
return r;
@@ -6105,6 +6992,7 @@ int func(const struct net_device *dev, const char *fmt, ...) \
vaf.va = &args; \
\
r = __netdev_printk(level, dev, &vaf); \
+ \
va_end(args); \
\
return r; \
@@ -6150,22 +7038,50 @@ static void __net_exit default_device_exit(struct net *net)
if (dev->rtnl_link_ops)
continue;
- /* Push remaing network devices to init_net */
+ /* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
- printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
- __func__, dev->name, err);
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+ __func__, dev->name, err);
BUG();
}
}
rtnl_unlock();
}
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+ /* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace in net_list.
+ */
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
/* At exit all network devices most be removed from a network
- * namespace. Do this in the reverse order of registeration.
+ * namespace. Do this in the reverse order of registration.
* Do this across as many network namespaces as possible to
* improve batching efficiency.
*/
@@ -6173,7 +7089,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
- rtnl_lock();
+ /* To prevent network device cleanup code from dereferencing
+ * loopback devices or network devices that have been freed,
+ * wait here for all pending unregistrations to complete,
+ * before unregistering the loopback device and allowing the
+ * network namespace to be freed.
+ *
+ * The netdev todo list containing all network devices
+ * unregistrations that happen in default_device_exit_batch
+ * will run in the rtnl_unlock() at the end of
+ * default_device_exit_batch.
+ */
+ rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops)
@@ -6218,6 +7145,8 @@ static int __init net_dev_init(void)
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
+ INIT_LIST_HEAD(&offload_base);
+
if (register_pernet_subsys(&netdev_net_ops))
goto out;
@@ -6228,24 +7157,18 @@ static int __init net_dev_init(void)
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
- memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
- sd->completion_queue = NULL;
INIT_LIST_HEAD(&sd->poll_list);
- sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
sd->csd.func = rps_trigger_softirq;
sd->csd.info = sd;
- sd->csd.flags = 0;
sd->cpu = i;
#endif
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
- sd->backlog.gro_list = NULL;
- sd->backlog.gro_count = 0;
}
dev_boot_phase = 0;
@@ -6270,19 +7193,9 @@ static int __init net_dev_init(void)
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
- dev_mcast_init();
rc = 0;
out:
return rc;
}
subsys_initcall(net_dev_init);
-
-static int __init initialize_hashrnd(void)
-{
- get_random_bytes(&hashrnd, sizeof(hashrnd));
- return 0;
-}
-
-late_initcall_sync(initialize_hashrnd);
-
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 508f9c18992..b6b230600b9 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -13,20 +13,46 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
+#include <linux/export.h>
#include <linux/list.h>
-#include <linux/proc_fs.h>
/*
* General list handling functions
*/
-static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
- unsigned char *addr, int addr_len,
- unsigned char addr_type, bool global)
+static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global,
+ bool sync)
{
struct netdev_hw_addr *ha;
int alloc_size;
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ ha->refcount = 1;
+ ha->global_use = global;
+ ha->synced = sync ? 1 : 0;
+ ha->sync_cnt = 0;
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
+}
+
+static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync,
+ int sync_count)
+{
+ struct netdev_hw_addr *ha;
+
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
@@ -40,115 +66,133 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
else
ha->global_use = true;
}
+ if (sync) {
+ if (ha->synced && sync_count)
+ return -EEXIST;
+ else
+ ha->synced++;
+ }
ha->refcount++;
return 0;
}
}
-
- alloc_size = sizeof(*ha);
- if (alloc_size < L1_CACHE_BYTES)
- alloc_size = L1_CACHE_BYTES;
- ha = kmalloc(alloc_size, GFP_ATOMIC);
- if (!ha)
- return -ENOMEM;
- memcpy(ha->addr, addr, addr_len);
- ha->type = addr_type;
- ha->refcount = 1;
- ha->global_use = global;
- ha->synced = false;
- list_add_tail_rcu(&ha->list, &list->list);
- list->count++;
- return 0;
+ return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
+ sync);
}
-static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
- int addr_len, unsigned char addr_type)
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
{
- return __hw_addr_add_ex(list, addr, addr_len, addr_type, false);
+ return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
+ 0);
}
-static void ha_rcu_free(struct rcu_head *head)
+static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *ha, bool global,
+ bool sync)
{
- struct netdev_hw_addr *ha;
+ if (global && !ha->global_use)
+ return -ENOENT;
+
+ if (sync && !ha->synced)
+ return -ENOENT;
- ha = container_of(head, struct netdev_hw_addr, rcu_head);
- kfree(ha);
+ if (global)
+ ha->global_use = false;
+
+ if (sync)
+ ha->synced--;
+
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ list->count--;
+ return 0;
}
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
- unsigned char *addr, int addr_len,
- unsigned char addr_type, bool global)
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
{
struct netdev_hw_addr *ha;
list_for_each_entry(ha, &list->list, list) {
if (!memcmp(ha->addr, addr, addr_len) &&
- (ha->type == addr_type || !addr_type)) {
- if (global) {
- if (!ha->global_use)
- break;
- else
- ha->global_use = false;
- }
- if (--ha->refcount)
- return 0;
- list_del_rcu(&ha->list);
- call_rcu(&ha->rcu_head, ha_rcu_free);
- list->count--;
- return 0;
- }
+ (ha->type == addr_type || !addr_type))
+ return __hw_addr_del_entry(list, ha, global, sync);
}
return -ENOENT;
}
-static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
- int addr_len, unsigned char addr_type)
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
{
- return __hw_addr_del_ex(list, addr, addr_len, addr_type, false);
+ return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
}
-int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len, unsigned char addr_type)
+static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
{
int err;
- struct netdev_hw_addr *ha, *ha2;
- unsigned char type;
- list_for_each_entry(ha, &from_list->list, list) {
- type = addr_type ? addr_type : ha->type;
- err = __hw_addr_add(to_list, ha->addr, addr_len, type);
- if (err)
- goto unroll;
+ err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true, ha->sync_cnt);
+ if (err && err != -EEXIST)
+ return err;
+
+ if (!err) {
+ ha->sync_cnt++;
+ ha->refcount++;
}
+
return 0;
+}
-unroll:
- list_for_each_entry(ha2, &from_list->list, list) {
- if (ha2 == ha)
- break;
- type = addr_type ? addr_type : ha2->type;
- __hw_addr_del(to_list, ha2->addr, addr_len, type);
- }
- return err;
+static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true);
+ if (err)
+ return;
+ ha->sync_cnt--;
+ /* address on from list is not marked synced */
+ __hw_addr_del_entry(from_list, ha, false, false);
}
-EXPORT_SYMBOL(__hw_addr_add_multiple);
-void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len, unsigned char addr_type)
+static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
{
- struct netdev_hw_addr *ha;
- unsigned char type;
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
- list_for_each_entry(ha, &from_list->list, list) {
- type = addr_type ? addr_type : ha->type;
- __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt == ha->refcount) {
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ } else {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ }
}
+ return err;
}
-EXPORT_SYMBOL(__hw_addr_del_multiple);
+/* This function only works where there is a strict 1-1 relationship
+ * between source and destination of the sync. If you ever need to
+ * sync addresses to more than 1 destination, you need to use
+ * __hw_addr_sync_multiple().
+ */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list,
int addr_len)
@@ -157,17 +201,12 @@ int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
- if (!ha->synced) {
- err = __hw_addr_add(to_list, ha->addr,
- addr_len, ha->type);
+ if (!ha->sync_cnt) {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
if (err)
break;
- ha->synced = true;
- ha->refcount++;
- } else if (ha->refcount == 1) {
- __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
- __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
- }
+ } else if (ha->refcount == 1)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
}
return err;
}
@@ -180,28 +219,107 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
- if (ha->synced) {
- __hw_addr_del(to_list, ha->addr,
- addr_len, ha->type);
- ha->synced = false;
- __hw_addr_del(from_list, ha->addr,
- addr_len, ha->type);
- }
+ if (ha->sync_cnt)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
}
}
EXPORT_SYMBOL(__hw_addr_unsync);
-void __hw_addr_flush(struct netdev_hw_addr_list *list)
+/**
+ * __hw_addr_sync_dev - Synchronize device's multicast list
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
+
+/**
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
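(Editorial note: taken together, the two helpers above give a driver with explicit per-address filters a simple pattern, sync from ndo_set_rx_mode and unsync on close. A hedged sketch; the foo_hw_* programming calls are hypothetical.)

static int foo_uc_sync(struct net_device *dev, const unsigned char *addr)
{
	return foo_hw_add_filter(netdev_priv(dev), addr);	/* program HW filter */
}

static int foo_uc_unsync(struct net_device *dev, const unsigned char *addr)
{
	return foo_hw_del_filter(netdev_priv(dev), addr);	/* drop HW filter */
}

static void foo_set_rx_mode(struct net_device *dev)
{
	/* add newly requested unicast addresses, retire stale ones */
	__hw_addr_sync_dev(&dev->uc, dev, foo_uc_sync, foo_uc_unsync);
}

static int foo_stop(struct net_device *dev)
{
	/* forget everything programmed while the device was up */
	__hw_addr_unsync_dev(&dev->uc, dev, foo_uc_unsync);
	return 0;
}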
+
+static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &list->list, list) {
list_del_rcu(&ha->list);
- call_rcu(&ha->rcu_head, ha_rcu_free);
+ kfree_rcu(ha, rcu_head);
}
list->count = 0;
}
-EXPORT_SYMBOL(__hw_addr_flush);
void __hw_addr_init(struct netdev_hw_addr_list *list)
{
@@ -276,7 +394,7 @@ EXPORT_SYMBOL(dev_addr_init);
*
* The caller must hold the rtnl_mutex.
*/
-int dev_addr_add(struct net_device *dev, unsigned char *addr,
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type)
{
int err;
@@ -301,7 +419,7 @@ EXPORT_SYMBOL(dev_addr_add);
*
* The caller must hold the rtnl_mutex.
*/
-int dev_addr_del(struct net_device *dev, unsigned char *addr,
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type)
{
int err;
@@ -315,7 +433,8 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr,
*/
ha = list_first_entry(&dev->dev_addrs.list,
struct netdev_hw_addr, list);
- if (ha->addr == dev->dev_addr && ha->refcount == 1)
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == addr_type && ha->refcount == 1)
return -ENOENT;
err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
@@ -326,62 +445,37 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr,
}
EXPORT_SYMBOL(dev_addr_del);
+/*
+ * Unicast list handling functions
+ */
+
/**
- * dev_addr_add_multiple - Add device addresses from another device
- * @to_dev: device to which addresses will be added
- * @from_dev: device from which addresses will be added
- * @addr_type: address type - 0 means type will be used from from_dev
- *
- * Add device addresses of the one device to another.
- **
- * The caller must hold the rtnl_mutex.
+ * dev_uc_add_excl - Add a global secondary unicast address
+ * @dev: device
+ * @addr: address to add
*/
-int dev_addr_add_multiple(struct net_device *to_dev,
- struct net_device *from_dev,
- unsigned char addr_type)
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
+ struct netdev_hw_addr *ha;
int err;
- ASSERT_RTNL();
-
- if (from_dev->addr_len != to_dev->addr_len)
- return -EINVAL;
- err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
- to_dev->addr_len, addr_type);
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->uc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_UNICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false);
if (!err)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
return err;
}
-EXPORT_SYMBOL(dev_addr_add_multiple);
-
-/**
- * dev_addr_del_multiple - Delete device addresses by another device
- * @to_dev: device where the addresses will be deleted
- * @from_dev: device by which addresses the addresses will be deleted
- * @addr_type: address type - 0 means type will used from from_dev
- *
- * Deletes addresses in to device by the list of addresses in from device.
- *
- * The caller must hold the rtnl_mutex.
- */
-int dev_addr_del_multiple(struct net_device *to_dev,
- struct net_device *from_dev,
- unsigned char addr_type)
-{
- ASSERT_RTNL();
-
- if (from_dev->addr_len != to_dev->addr_len)
- return -EINVAL;
- __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
- to_dev->addr_len, addr_type);
- call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
- return 0;
-}
-EXPORT_SYMBOL(dev_addr_del_multiple);
-
-/*
- * Unicast list handling functions
- */
+EXPORT_SYMBOL(dev_uc_add_excl);
/**
* dev_uc_add - Add a secondary unicast address
@@ -391,7 +485,7 @@ EXPORT_SYMBOL(dev_addr_del_multiple);
* Add a secondary unicast address to the device or increase
* the reference count if it already exists.
*/
-int dev_uc_add(struct net_device *dev, unsigned char *addr)
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
{
int err;
@@ -413,7 +507,7 @@ EXPORT_SYMBOL(dev_uc_add);
* Release reference to a secondary unicast address and remove it
* from the device if the reference count drops to zero.
*/
-int dev_uc_del(struct net_device *dev, unsigned char *addr)
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
{
int err;
@@ -434,10 +528,11 @@ EXPORT_SYMBOL(dev_uc_del);
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
- * locked by netif_tx_lock_bh.
+ * locked by netif_addr_lock_bh.
*
* This function is intended to be called from the dev->set_rx_mode
- * function of layered software devices.
+ * function of layered software devices. This function assumes that
+ * addresses will only ever be synced to the @to devices and no other.
*/
int dev_uc_sync(struct net_device *to, struct net_device *from)
{
@@ -446,16 +541,46 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_bh(to);
+ netif_addr_lock_nested(to);
err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
- netif_addr_unlock_bh(to);
+ netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_uc_sync);
/**
+ * dev_uc_sync_multiple - Synchronize device's unicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have been deleted from the source. The source device
+ * must be locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. It allows for a single source
+ * device to be synced to multiple destination devices.
+ */
+int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync_multiple);
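(Editorial note: a sketch of the one-source/many-destinations case this variant is for, an aggregating upper device pushing its list to every lower port from its rx_mode handler; the foo_* names are made up.)

static void foo_master_set_rx_mode(struct net_device *master)
{
	struct foo_priv *priv = netdev_priv(master);
	struct foo_port *port;

	/* the core calls this with the master's address list already locked */
	list_for_each_entry(port, &priv->ports, list) {
		dev_uc_sync_multiple(port->dev, master);
		dev_mc_sync_multiple(port->dev, master);
	}
}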
+
+/**
* dev_uc_unsync - Remove synchronized addresses from the destination device
* @to: destination device
* @from: source device
@@ -470,7 +595,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
return;
netif_addr_lock_bh(from);
- netif_addr_lock(to);
+ netif_addr_lock_nested(to);
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
@@ -508,14 +633,42 @@ EXPORT_SYMBOL(dev_uc_init);
* Multicast list handling functions
*/
-static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
+/**
+ * dev_mc_add_excl - Add a global secondary multicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->mc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_add_excl);
+
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
bool global)
{
int err;
netif_addr_lock_bh(dev);
err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, global);
+ NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
@@ -529,7 +682,7 @@ static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
* Add a multicast address to the device or increase
* the reference count if it already exists.
*/
-int dev_mc_add(struct net_device *dev, unsigned char *addr)
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_add(dev, addr, false);
}
@@ -542,20 +695,20 @@ EXPORT_SYMBOL(dev_mc_add);
*
* Add a global multicast address to the device.
*/
-int dev_mc_add_global(struct net_device *dev, unsigned char *addr)
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_add(dev, addr, true);
}
EXPORT_SYMBOL(dev_mc_add_global);
-static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
bool global)
{
int err;
netif_addr_lock_bh(dev);
err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, global);
+ NETDEV_HW_ADDR_T_MULTICAST, global, false);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
@@ -570,7 +723,7 @@ static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
* Release reference to a multicast address and remove it
* from the device if the reference count drops to zero.
*/
-int dev_mc_del(struct net_device *dev, unsigned char *addr)
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_del(dev, addr, false);
}
@@ -584,23 +737,23 @@ EXPORT_SYMBOL(dev_mc_del);
* Release reference to a multicast address and remove it
* from the device if the reference count drops to zero.
*/
-int dev_mc_del_global(struct net_device *dev, unsigned char *addr)
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_del(dev, addr, true);
}
EXPORT_SYMBOL(dev_mc_del_global);
/**
- * dev_mc_sync - Synchronize device's unicast list to another device
+ * dev_mc_sync - Synchronize device's multicast list to another device
* @to: destination device
* @from: source device
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
- * locked by netif_tx_lock_bh.
+ * locked by netif_addr_lock_bh.
*
- * This function is intended to be called from the dev->set_multicast_list
- * or dev->set_rx_mode function of layered software devices.
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices.
*/
int dev_mc_sync(struct net_device *to, struct net_device *from)
{
@@ -609,16 +762,46 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_bh(to);
+ netif_addr_lock_nested(to);
err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
- netif_addr_unlock_bh(to);
+ netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_mc_sync);
/**
+ * dev_mc_sync_multiple - Synchronize device's multicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices. It allows for a single
+ * source device to be synced to multiple destination devices.
+ */
+int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync_multiple);
+
+/**
* dev_mc_unsync - Remove synchronized addresses from the destination device
* @to: destination device
* @from: source device
@@ -633,7 +816,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
return;
netif_addr_lock_bh(from);
- netif_addr_lock(to);
+ netif_addr_lock_nested(to);
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
@@ -666,76 +849,3 @@ void dev_mc_init(struct net_device *dev)
__hw_addr_init(&dev->mc);
}
EXPORT_SYMBOL(dev_mc_init);
-
-#ifdef CONFIG_PROC_FS
-#include <linux/seq_file.h>
-
-static int dev_mc_seq_show(struct seq_file *seq, void *v)
-{
- struct netdev_hw_addr *ha;
- struct net_device *dev = v;
-
- if (v == SEQ_START_TOKEN)
- return 0;
-
- netif_addr_lock_bh(dev);
- netdev_for_each_mc_addr(ha, dev) {
- int i;
-
- seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
- dev->name, ha->refcount, ha->global_use);
-
- for (i = 0; i < dev->addr_len; i++)
- seq_printf(seq, "%02x", ha->addr[i]);
-
- seq_putc(seq, '\n');
- }
- netif_addr_unlock_bh(dev);
- return 0;
-}
-
-static const struct seq_operations dev_mc_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = dev_mc_seq_show,
-};
-
-static int dev_mc_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &dev_mc_seq_ops,
- sizeof(struct seq_net_private));
-}
-
-static const struct file_operations dev_mc_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_mc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-#endif
-
-static int __net_init dev_mc_net_init(struct net *net)
-{
- if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-static void __net_exit dev_mc_net_exit(struct net *net)
-{
- proc_net_remove(net, "dev_mcast");
-}
-
-static struct pernet_operations __net_initdata dev_mc_net_ops = {
- .init = dev_mc_net_init,
- .exit = dev_mc_net_exit,
-};
-
-void __init dev_mcast_init(void)
-{
- register_pernet_subsys(&dev_mc_net_ops);
-}
-
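
As a rough illustration of the calling convention documented for dev_mc_sync()/dev_mc_unsync() above (this sketch is not part of the patch; the example_* names and the lower_dev field are hypothetical), a stacked software device would sync its lists to the real device from ndo_set_rx_mode and unsync them on teardown; dev_mc_sync_multiple() is the variant for one source device that must stay synced to several lower devices:

	static void example_set_rx_mode(struct net_device *dev)
	{
		struct example_priv *priv = netdev_priv(dev);	/* hypothetical */
		struct net_device *lower = priv->lower_dev;	/* hypothetical */

		/* dev_set_rx_mode() already holds netif_addr_lock_bh() on the
		 * "from" device, which is why dev_mc_sync() only takes the
		 * nested lock on the "to" device.
		 */
		dev_uc_sync(lower, dev);
		dev_mc_sync(lower, dev);
	}

	static void example_uninit(struct net_device *dev)
	{
		struct example_priv *priv = netdev_priv(dev);

		dev_uc_unsync(priv->lower_dev, dev);
		dev_mc_unsync(priv->lower_dev, dev);
	}
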
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
new file mode 100644
index 00000000000..cf999e09bcd
--- /dev/null
+++ b/net/core/dev_ioctl.c
@@ -0,0 +1,567 @@
+#include <linux/kmod.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/wireless.h>
+#include <net/wext.h>
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct net *net, struct ifreq __user *arg)
+{
+ struct ifreq ifr;
+ int error;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ return 0;
+}
+
+static gifconf_func_t *gifconf_list[NPROTO];
+
+/**
+ * register_gifconf - register a SIOCGIF handler
+ * @family: Address family
+ * @gifconf: Function handler
+ *
+ * Register protocol dependent address dumping routines. The handler
+ * that is passed must not be freed or reused until it has been replaced
+ * by another handler.
+ */
+int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
+{
+ if (family >= NPROTO)
+ return -EINVAL;
+ gifconf_list[family] = gifconf;
+ return 0;
+}
+EXPORT_SYMBOL(register_gifconf);
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+
+static int dev_ifconf(struct net *net, char __user *arg)
+{
+ struct ifconf ifc;
+ struct net_device *dev;
+ char __user *pos;
+ int len;
+ int total;
+ int i;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+
+ /*
+ * Loop over the interfaces, and write an info block for each.
+ */
+
+ total = 0;
+ for_each_netdev(net, dev) {
+ for (i = 0; i < NPROTO; i++) {
+ if (gifconf_list[i]) {
+ int done;
+ if (!pos)
+ done = gifconf_list[i](dev, NULL, 0);
+ else
+ done = gifconf_list[i](dev, pos + total,
+ len - total);
+ if (done < 0)
+ return -EFAULT;
+ total += done;
+ }
+ }
+ }
+
+ /*
+ * All done. Write the updated control block back to the caller.
+ */
+ ifc.ifc_len = total;
+
+ /*
+ * Both BSD and Solaris return 0 here, so we do too.
+ */
+ return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = (short) dev_get_flags(dev);
+ return 0;
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface
+ (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCGIFHWADDR:
+ if (!dev->addr_len)
+ memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
+ else
+ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ ifr->ifr_hwaddr.sa_family = dev->type;
+ return 0;
+
+ case SIOCGIFSLAVE:
+ err = -EINVAL;
+ break;
+
+ case SIOCGIFMAP:
+ ifr->ifr_map.mem_start = dev->mem_start;
+ ifr->ifr_map.mem_end = dev->mem_end;
+ ifr->ifr_map.base_addr = dev->base_addr;
+ ifr->ifr_map.irq = dev->irq;
+ ifr->ifr_map.dma = dev->dma;
+ ifr->ifr_map.port = dev->if_port;
+ return 0;
+
+ case SIOCGIFINDEX:
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN:
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ default:
+ /* dev_ioctl() should ensure this case
+ * is never reached
+ */
+ WARN_ON(1);
+ err = -ENOTTY;
+ break;
+
+ }
+ return err;
+}
+
+static int net_hwtstamp_validate(struct ifreq *ifr)
+{
+ struct hwtstamp_config cfg;
+ enum hwtstamp_tx_types tx_type;
+ enum hwtstamp_rx_filters rx_filter;
+ int tx_type_valid = 0;
+ int rx_filter_valid = 0;
+
+ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ return -EFAULT;
+
+ if (cfg.flags) /* reserved for future extensions */
+ return -EINVAL;
+
+ tx_type = cfg.tx_type;
+ rx_filter = cfg.rx_filter;
+
+ switch (tx_type) {
+ case HWTSTAMP_TX_OFF:
+ case HWTSTAMP_TX_ON:
+ case HWTSTAMP_TX_ONESTEP_SYNC:
+ tx_type_valid = 1;
+ break;
+ }
+
+ switch (rx_filter) {
+ case HWTSTAMP_FILTER_NONE:
+ case HWTSTAMP_FILTER_ALL:
+ case HWTSTAMP_FILTER_SOME:
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ rx_filter_valid = 1;
+ break;
+ }
+
+ if (!tx_type_valid || !rx_filter_valid)
+ return -ERANGE;
+
+ return 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ const struct net_device_ops *ops;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->netdev_ops;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags);
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface
+ (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ return dev_set_mtu(dev, ifr->ifr_mtu);
+
+ case SIOCSIFHWADDR:
+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+
+ case SIOCSIFHWBROADCAST:
+ if (ifr->ifr_hwaddr.sa_family != dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return 0;
+
+ case SIOCSIFMAP:
+ if (ops->ndo_set_config) {
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_set_config(dev, &ifr->ifr_map);
+ }
+ return -EOPNOTSUPP;
+
+ case SIOCADDMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCDELMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCSIFTXQLEN:
+ if (ifr->ifr_qlen < 0)
+ return -EINVAL;
+ dev->tx_queue_len = ifr->ifr_qlen;
+ return 0;
+
+ case SIOCSIFNAME:
+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+ return dev_change_name(dev, ifr->ifr_newname);
+
+ case SIOCSHWTSTAMP:
+ err = net_hwtstamp_validate(ifr);
+ if (err)
+ return err;
+ /* fall through */
+
+ /*
+ * Unknown or private ioctl
+ */
+ default:
+ if ((cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) ||
+ cmd == SIOCBONDENSLAVE ||
+ cmd == SIOCBONDRELEASE ||
+ cmd == SIOCBONDSETHWADDR ||
+ cmd == SIOCBONDSLAVEINFOQUERY ||
+ cmd == SIOCBONDINFOQUERY ||
+ cmd == SIOCBONDCHANGEACTIVE ||
+ cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCBRADDIF ||
+ cmd == SIOCBRDELIF ||
+ cmd == SIOCSHWTSTAMP ||
+ cmd == SIOCGHWTSTAMP ||
+ cmd == SIOCWANDEV) {
+ err = -EOPNOTSUPP;
+ if (ops->ndo_do_ioctl) {
+ if (netif_device_present(dev))
+ err = ops->ndo_do_ioctl(dev, ifr, cmd);
+ else
+ err = -ENODEV;
+ }
+ } else
+ err = -EINVAL;
+
+ }
+ return err;
+}
+
+/**
+ * dev_load - load a network module
+ * @net: the applicable net namespace
+ * @name: name of interface
+ *
+ * If a network interface is not present and the process has suitable
+ * privileges this function loads the module. If module loading is not
+ * available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int no_module;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ rcu_read_unlock();
+
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE)) {
+ if (!request_module("%s", name))
+ pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
+ name);
+ }
+}
+EXPORT_SYMBOL(dev_load);
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc above.
+ */
+
+/**
+ * dev_ioctl - network device ioctl
+ * @net: the applicable net namespace
+ * @cmd: command to issue
+ * @arg: pointer to a struct ifreq in user space
+ *
+ * Issue ioctl functions to devices. This is normally called by the
+ * user space syscall interfaces but can sometimes be useful for
+ * other purposes. The return value is the return from the syscall if
+ * positive or a negative errno code on error.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ int ret;
+ char *colon;
+
+ /* One special case: SIOCGIFCONF takes ifconf argument
+ and requires shared lock, because it sleeps writing
+ to user space.
+ */
+
+ if (cmd == SIOCGIFCONF) {
+ rtnl_lock();
+ ret = dev_ifconf(net, (char __user *) arg);
+ rtnl_unlock();
+ return ret;
+ }
+ if (cmd == SIOCGIFNAME)
+ return dev_ifname(net, (struct ifreq __user *)arg);
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch (cmd) {
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFHWADDR:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(net, ifr.ifr_name);
+ rcu_read_lock();
+ ret = dev_ifsioc_locked(net, &ifr, cmd);
+ rcu_read_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ case SIOCETHTOOL:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ethtool(net, &ifr);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - return a value
+ */
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSIFNAME:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFMAP:
+ case SIOCSIFTXQLEN:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ /*
+ * These ioctl calls:
+ * - require local superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSMIIREG:
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ case SIOCSHWTSTAMP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but
+ * currently do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space.
+ * Not applicable in our case */
+ case SIOCSIFLINK:
+ return -ENOTTY;
+
+ /*
+ * Unknown or private ioctl.
+ */
+ default:
+ if (cmd == SIOCWANDEV ||
+ cmd == SIOCGHWTSTAMP ||
+ (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret && copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ return ret;
+ }
+ /* Take care of Wireless Extensions */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
+ return wext_handle_ioctl(net, &ifr, cmd, arg);
+ return -ENOTTY;
+ }
+}
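
For reference, the request path above is what a plain userspace ioctl() on an AF_INET socket ends up in. A minimal sketch (not part of the patch; the interface name is only an example) of two of the lock-free "get" requests serviced by dev_ifname() and dev_ifsioc_locked():

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>

	int main(void)
	{
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return 1;

		memset(&ifr, 0, sizeof(ifr));
		ifr.ifr_ifindex = 1;			/* usually the loopback device */
		if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)	/* serviced by dev_ifname() */
			printf("ifindex 1 is %s\n", ifr.ifr_name);

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* example name */
		if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)	/* dev_ifsioc_locked() */
			printf("%s mtu is %d\n", ifr.ifr_name, ifr.ifr_mtu);

		close(fd);
		return 0;
	}
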
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 36e603c78ce..e70301eb7a4 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -4,6 +4,8 @@
* Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
@@ -22,6 +24,7 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <net/genetlink.h>
#include <net/netevent.h>
@@ -33,22 +36,19 @@
#define TRACE_ON 1
#define TRACE_OFF 0
-static void send_dm_alert(struct work_struct *unused);
-
-
/*
* Globals, our netlink socket pointer
* and the work handle that will send up
* netlink alerts
*/
static int trace_state = TRACE_OFF;
-static DEFINE_SPINLOCK(trace_state_lock);
+static DEFINE_MUTEX(trace_state_mutex);
struct per_cpu_dm_data {
- struct work_struct dm_alert_work;
- struct sk_buff *skb;
- atomic_t dm_hit_count;
- struct timer_list send_timer;
+ spinlock_t lock;
+ struct sk_buff *skb;
+ struct work_struct dm_alert_work;
+ struct timer_list send_timer;
};
struct dm_hw_stat_delta {
@@ -64,7 +64,6 @@ static struct genl_family net_drop_monitor_family = {
.hdrsize = 0,
.name = "NET_DM",
.version = 2,
- .maxattr = NET_DM_CMD_MAX,
};
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
@@ -74,56 +73,64 @@ static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);
-static void reset_per_cpu_data(struct per_cpu_dm_data *data)
+static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
{
size_t al;
struct net_dm_alert_msg *msg;
struct nlattr *nla;
+ struct sk_buff *skb;
+ unsigned long flags;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
al += sizeof(struct nlattr);
- data->skb = genlmsg_new(al, GFP_KERNEL);
- genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
- 0, NET_DM_CMD_ALERT);
- nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg));
- msg = nla_data(nla);
- memset(msg, 0, al);
- atomic_set(&data->dm_hit_count, dm_hit_limit);
+ skb = genlmsg_new(al, GFP_KERNEL);
+
+ if (skb) {
+ genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ } else {
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+ }
+
+ spin_lock_irqsave(&data->lock, flags);
+ swap(data->skb, skb);
+ spin_unlock_irqrestore(&data->lock, flags);
+
+ return skb;
}
-static void send_dm_alert(struct work_struct *unused)
+static struct genl_multicast_group dropmon_mcgrps[] = {
+ { .name = "events", },
+};
+
+static void send_dm_alert(struct work_struct *work)
{
struct sk_buff *skb;
- struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
-
- /*
- * Grab the skb we're about to send
- */
- skb = data->skb;
+ struct per_cpu_dm_data *data;
- /*
- * Replace it with a new one
- */
- reset_per_cpu_data(data);
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
- /*
- * Ship it!
- */
- genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
+ skb = reset_per_cpu_data(data);
+ if (skb)
+ genlmsg_multicast(&net_drop_monitor_family, skb, 0,
+ 0, GFP_KERNEL);
}
/*
* This is the timer function to delay the sending of an alert
* in the event that more drops will arrive during the
- * hysteresis period. Note that it operates under the timer interrupt
- * so we don't need to disable preemption here
+ * hysteresis period.
*/
-static void sched_send_work(unsigned long unused)
+static void sched_send_work(unsigned long _data)
{
- struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+ struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
schedule_work(&data->dm_alert_work);
}
@@ -134,17 +141,19 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
struct nlmsghdr *nlh;
struct nlattr *nla;
int i;
- struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+ struct sk_buff *dskb;
+ struct per_cpu_dm_data *data;
+ unsigned long flags;
+ local_irq_save(flags);
+ data = &__get_cpu_var(dm_cpu_data);
+ spin_lock(&data->lock);
+ dskb = data->skb;
- if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
- /*
- * we're already at zero, discard this hit
- */
+ if (!dskb)
goto out;
- }
- nlh = (struct nlmsghdr *)data->skb->data;
+ nlh = (struct nlmsghdr *)dskb->data;
nla = genlmsg_data(nlmsg_data(nlh));
msg = nla_data(nla);
for (i = 0; i < msg->entries; i++) {
@@ -153,11 +162,12 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
goto out;
}
}
-
+ if (msg->entries == dm_hit_limit)
+ goto out;
/*
* We need to create a new entry
*/
- __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
msg->points[msg->entries].count = 1;
@@ -165,11 +175,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
if (!timer_pending(&data->send_timer)) {
data->send_timer.expires = jiffies + dm_delay * HZ;
- add_timer_on(&data->send_timer, smp_processor_id());
+ add_timer(&data->send_timer);
}
out:
- return;
+ spin_unlock_irqrestore(&data->lock, flags);
}
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
@@ -207,21 +217,13 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
rcu_read_unlock();
}
-
-static void free_dm_hw_stat(struct rcu_head *head)
-{
- struct dm_hw_stat_delta *n;
- n = container_of(head, struct dm_hw_stat_delta, rcu);
- kfree(n);
-}
-
static int set_all_monitor_traces(int state)
{
int rc = 0;
struct dm_hw_stat_delta *new_stat = NULL;
struct dm_hw_stat_delta *temp;
- spin_lock(&trace_state_lock);
+ mutex_lock(&trace_state_mutex);
if (state == trace_state) {
rc = -EAGAIN;
@@ -230,9 +232,15 @@ static int set_all_monitor_traces(int state)
switch (state) {
case TRACE_ON:
+ if (!try_module_get(THIS_MODULE)) {
+ rc = -ENODEV;
+ break;
+ }
+
rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
break;
+
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
@@ -245,9 +253,12 @@ static int set_all_monitor_traces(int state)
list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
if (new_stat->dev == NULL) {
list_del_rcu(&new_stat->list);
- call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ kfree_rcu(new_stat, rcu);
}
}
+
+ module_put(THIS_MODULE);
+
break;
default:
rc = 1;
@@ -260,7 +271,7 @@ static int set_all_monitor_traces(int state)
rc = -EINPROGRESS;
out_unlock:
- spin_unlock(&trace_state_lock);
+ mutex_unlock(&trace_state_mutex);
return rc;
}
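
The kfree_rcu() conversion in the hunk above is purely mechanical: when an RCU callback does nothing but free the enclosing object, the open-coded call_rcu() callback can be dropped. A generic illustration of the pattern (not taken from this file):

	struct foo {
		struct list_head list;
		struct rcu_head rcu;
	};

	static void foo_del(struct foo *f)
	{
		list_del_rcu(&f->list);
		/* frees container_of(&f->rcu, struct foo, rcu) after a grace period */
		kfree_rcu(f, rcu);
	}
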
@@ -288,9 +299,9 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
}
static int dropmon_net_event(struct notifier_block *ev_block,
- unsigned long event, void *ptr)
+ unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct dm_hw_stat_delta *new_stat = NULL;
struct dm_hw_stat_delta *tmp;
@@ -303,30 +314,30 @@ static int dropmon_net_event(struct notifier_block *ev_block,
new_stat->dev = dev;
new_stat->last_rx = jiffies;
- spin_lock(&trace_state_lock);
+ mutex_lock(&trace_state_mutex);
list_add_rcu(&new_stat->list, &hw_stats_list);
- spin_unlock(&trace_state_lock);
+ mutex_unlock(&trace_state_mutex);
break;
case NETDEV_UNREGISTER:
- spin_lock(&trace_state_lock);
+ mutex_lock(&trace_state_mutex);
list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
if (new_stat->dev == dev) {
new_stat->dev = NULL;
if (trace_state == TRACE_OFF) {
list_del_rcu(&new_stat->list);
- call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ kfree_rcu(new_stat, rcu);
break;
}
}
}
- spin_unlock(&trace_state_lock);
+ mutex_unlock(&trace_state_mutex);
break;
}
out:
return NOTIFY_DONE;
}
-static struct genl_ops dropmon_ops[] = {
+static const struct genl_ops dropmon_ops[] = {
{
.cmd = NET_DM_CMD_CONFIG,
.doit = net_dm_cmd_config,
@@ -350,38 +361,40 @@ static int __init init_net_drop_monitor(void)
struct per_cpu_dm_data *data;
int cpu, rc;
- printk(KERN_INFO "Initalizing network drop monitor service\n");
+ pr_info("Initializing network drop monitor service\n");
if (sizeof(void *) > 8) {
- printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n");
+ pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
return -ENOSPC;
}
- rc = genl_register_family_with_ops(&net_drop_monitor_family,
- dropmon_ops,
- ARRAY_SIZE(dropmon_ops));
+ rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
+ dropmon_ops, dropmon_mcgrps);
if (rc) {
- printk(KERN_ERR "Could not create drop monitor netlink family\n");
+ pr_err("Could not create drop monitor netlink family\n");
return rc;
}
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = register_netdevice_notifier(&dropmon_net_notifier);
if (rc < 0) {
- printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ pr_crit("Failed to register netdevice notifier\n");
goto out_unreg;
}
rc = 0;
- for_each_present_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
data = &per_cpu(dm_cpu_data, cpu);
- reset_per_cpu_data(data);
INIT_WORK(&data->dm_alert_work, send_dm_alert);
init_timer(&data->send_timer);
- data->send_timer.data = cpu;
+ data->send_timer.data = (unsigned long)data;
data->send_timer.function = sched_send_work;
+ spin_lock_init(&data->lock);
+ reset_per_cpu_data(data);
}
+
goto out;
out_unreg:
@@ -390,4 +403,37 @@ out:
return rc;
}
-late_initcall(init_net_drop_monitor);
+static void exit_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu;
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
+
+ /*
+ * Because of the module_get/put we do in the trace state change path
+ * we are guaranteed not to have any current users when we get here;
+ * all we need to do is make sure that we don't have any running timers
+ * or pending schedule calls
+ */
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ /*
+ * At this point, we should have exclusive access
+ * to this struct and can free the skb inside it
+ */
+ kfree_skb(data->skb);
+ }
+
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+}
+
+module_init(init_net_drop_monitor);
+module_exit(exit_net_drop_monitor);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
+MODULE_ALIAS_GENL_FAMILY("NET_DM");
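
The dropmon_net_event() hunk above also switches to the newer netdevice notifier convention, where the device is recovered from the opaque pointer with netdev_notifier_info_to_dev() rather than a direct cast. A minimal, hypothetical notifier following the same pattern:

	static int example_netdev_event(struct notifier_block *nb,
					unsigned long event, void *ptr)
	{
		struct net_device *dev = netdev_notifier_info_to_dev(ptr);

		switch (event) {
		case NETDEV_REGISTER:
			pr_info("%s registered\n", dev->name);
			break;
		case NETDEV_UNREGISTER:
			pr_info("%s going away\n", dev->name);
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block example_netdev_notifier = {
		.notifier_call = example_netdev_event,
	};
	/* hooked up with register_netdevice_notifier(&example_netdev_notifier) */
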
diff --git a/net/core/dst.c b/net/core/dst.c
index b99c7c7ffce..a028409ee43 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -19,6 +19,7 @@
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
+#include <linux/prefetch.h>
#include <net/dst.h>
@@ -33,9 +34,6 @@
* 3) This list is guarded by a mutex,
* so that the gc_task and dst_dev_event() can be synchronized.
*/
-#if RT_CACHE_DEBUG >= 2
-static atomic_t dst_total = ATOMIC_INIT(0);
-#endif
/*
* We want to keep lock & list close together
@@ -69,10 +67,6 @@ static void dst_gc_task(struct work_struct *work)
unsigned long expires = ~0L;
struct dst_entry *dst, *next, head;
struct dst_entry *last = &head;
-#if RT_CACHE_DEBUG >= 2
- ktime_t time_start = ktime_get();
- struct timespec elapsed;
-#endif
mutex_lock(&dst_gc_mutex);
next = dst_busy_list;
@@ -100,7 +94,7 @@ loop:
* But we do not have state "obsoleted, but
* referenced by parent", so it is right.
*/
- if (dst->obsolete > 1)
+ if (dst->obsolete > 0)
continue;
___dst_free(dst);
@@ -146,25 +140,27 @@ loop:
spin_unlock_bh(&dst_garbage.lock);
mutex_unlock(&dst_gc_mutex);
-#if RT_CACHE_DEBUG >= 2
- elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start));
- printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d"
- " expires: %lu elapsed: %lu us\n",
- atomic_read(&dst_total), delayed, work_performed,
- expires,
- elapsed.tv_sec * USEC_PER_SEC +
- elapsed.tv_nsec / NSEC_PER_USEC);
-#endif
}
-int dst_discard(struct sk_buff *skb)
+int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
-EXPORT_SYMBOL(dst_discard);
+EXPORT_SYMBOL(dst_discard_sk);
-void *dst_alloc(struct dst_ops *ops)
+const u32 dst_default_metrics[RTAX_MAX + 1] = {
+ /* This initializer is needed to force linker to place this variable
+ * into const section. Otherwise it might end into bss section.
+ * We really want to avoid false sharing on this variable, and catch
+ * any writes on it.
+ */
+ [RTAX_MAX] = 0xdeadbeef,
+};
+
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
{
struct dst_entry *dst;
@@ -172,18 +168,38 @@ void *dst_alloc(struct dst_ops *ops)
if (ops->gc(ops))
return NULL;
}
- dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
if (!dst)
return NULL;
- atomic_set(&dst->__refcnt, 0);
+ dst->child = NULL;
+ dst->dev = dev;
+ if (dev)
+ dev_hold(dev);
dst->ops = ops;
- dst->lastuse = jiffies;
+ dst_init_metrics(dst, dst_default_metrics, true);
+ dst->expires = 0UL;
dst->path = dst;
- dst->input = dst->output = dst_discard;
-#if RT_CACHE_DEBUG >= 2
- atomic_inc(&dst_total);
+ dst->from = NULL;
+#ifdef CONFIG_XFRM
+ dst->xfrm = NULL;
+#endif
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ dst->error = 0;
+ dst->obsolete = initial_obsolete;
+ dst->header_len = 0;
+ dst->trailer_len = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->tclassid = 0;
#endif
- dst_entries_add(ops, 1);
+ atomic_set(&dst->__refcnt, initial_ref);
+ dst->__use = 0;
+ dst->lastuse = jiffies;
+ dst->flags = flags;
+ dst->pending_confirm = 0;
+ dst->next = NULL;
+ if (!(flags & DST_NOCOUNT))
+ dst_entries_add(ops, 1);
return dst;
}
EXPORT_SYMBOL(dst_alloc);
@@ -193,9 +209,11 @@ static void ___dst_free(struct dst_entry *dst)
/* The first case (dev==NULL) is required, when
protocol module is unloaded.
*/
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP))
- dst->input = dst->output = dst_discard;
- dst->obsolete = 2;
+ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ }
+ dst->obsolete = DST_OBSOLETE_DEAD;
}
void __dst_free(struct dst_entry *dst)
@@ -207,8 +225,8 @@ void __dst_free(struct dst_entry *dst)
if (dst_garbage.timer_inc > DST_GC_INC) {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
- cancel_delayed_work(&dst_gc_work);
- schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
+ mod_delayed_work(system_wq, &dst_gc_work,
+ dst_garbage.timer_expires);
}
spin_unlock_bh(&dst_garbage.lock);
}
@@ -217,34 +235,19 @@ EXPORT_SYMBOL(__dst_free);
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
- struct neighbour *neigh;
- struct hh_cache *hh;
smp_rmb();
again:
- neigh = dst->neighbour;
- hh = dst->hh;
child = dst->child;
- dst->hh = NULL;
- if (hh)
- hh_cache_put(hh);
-
- if (neigh) {
- dst->neighbour = NULL;
- neigh_release(neigh);
- }
-
- dst_entries_add(dst->ops, -1);
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
if (dst->ops->destroy)
dst->ops->destroy(dst);
if (dst->dev)
dev_put(dst->dev);
-#if RT_CACHE_DEBUG >= 2
- atomic_dec(&dst_total);
-#endif
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
@@ -266,6 +269,15 @@ again:
}
EXPORT_SYMBOL(dst_destroy);
+static void dst_destroy_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst = dst_destroy(dst);
+ if (dst)
+ __dst_free(dst);
+}
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
@@ -273,37 +285,71 @@ void dst_release(struct dst_entry *dst)
newrefcnt = atomic_dec_return(&dst->__refcnt);
WARN_ON(newrefcnt < 0);
- if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
- dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
- }
+ if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
/**
- * skb_dst_set_noref - sets skb dst, without a reference
+ * __skb_dst_set_noref - sets skb dst, without a reference
* @skb: buffer
* @dst: dst entry
+ * @force: if force is set, use noref version even for DST_NOCACHE entries
*
* Sets skb dst, assuming a reference was not taken on dst
* skb_dst_drop() should not dst_release() this dst
*/
-void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
{
WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
/* If dst not in cache, we must take a reference, because
* dst_release() will destroy dst as soon as its refcount becomes zero
*/
- if (unlikely(dst->flags & DST_NOCACHE)) {
+ if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
dst_hold(dst);
skb_dst_set(skb, dst);
} else {
skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
}
}
-EXPORT_SYMBOL(skb_dst_set_noref);
+EXPORT_SYMBOL(__skb_dst_set_noref);
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
@@ -323,27 +369,23 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
return;
if (!unregister) {
- dst->input = dst->output = dst_discard;
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
} else {
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
dev_put(dev);
- if (dst->neighbour && dst->neighbour->dev == dev) {
- dst->neighbour->dev = dst->dev;
- dev_hold(dst->dev);
- dev_put(dev);
- }
}
}
static int dst_dev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct dst_entry *dst, *last = NULL;
switch (event) {
- case NETDEV_UNREGISTER:
+ case NETDEV_UNREGISTER_FINAL:
case NETDEV_DOWN:
mutex_lock(&dst_gc_mutex);
for (dst = dst_busy_list; dst; dst = dst->next) {
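
The rewritten dst_alloc() above now takes the owning device, the initial reference count, the initial obsolete state and the flags directly. A hedged sketch of how a protocol might allocate its route entries with the new signature (all example_* names are hypothetical; a real dst_ops would have .family, .kmem_cachep sized for the embedding structure, .check and friends filled in):

	struct example_rt {
		struct dst_entry	dst;	/* must be the first member */
		u32			cookie;	/* protocol-private state */
	};

	static struct dst_ops example_dst_ops;	/* assumed initialised elsewhere */

	static struct example_rt *example_rt_alloc(struct net_device *dev)
	{
		/* one initial reference, live (not obsolete) entry, no flags */
		struct example_rt *rt = dst_alloc(&example_dst_ops, dev, 1,
						  DST_OBSOLETE_NONE, 0);

		if (rt)
			rt->cookie = 0;
		return rt;
	}
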
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d5bc2881888..17cb912793f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -17,10 +17,14 @@
#include <linux/errno.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
#include <linux/bitops.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
/*
* Some useful ethtool_ops methods that're device independent.
@@ -34,152 +38,315 @@ u32 ethtool_op_get_link(struct net_device *dev)
}
EXPORT_SYMBOL(ethtool_op_get_link);
-u32 ethtool_op_get_rx_csum(struct net_device *dev)
+int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
+ info->so_timestamping =
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ return 0;
}
-EXPORT_SYMBOL(ethtool_op_get_rx_csum);
+EXPORT_SYMBOL(ethtool_op_get_ts_info);
-u32 ethtool_op_get_tx_csum(struct net_device *dev)
-{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tx_csum);
+/* Handlers for each ethtool command */
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
+#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
+
+static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
+ [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
+};
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
- if (data)
- dev->features |= NETIF_F_IP_CSUM;
- else
- dev->features &= ~NETIF_F_IP_CSUM;
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 __user *sizeaddr;
+ u32 copy_size;
+ int i;
+
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
- return 0;
-}
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_HW_CSUM;
- else
- dev->features &= ~NETIF_F_HW_CSUM;
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
{
- if (data)
- dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
- else
- dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
- return 0;
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
+
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
+ __netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
}
-EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
-u32 ethtool_op_get_sg(struct net_device *dev)
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
- return (dev->features & NETIF_F_SG) != 0;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
}
-EXPORT_SYMBOL(ethtool_op_get_sg);
-int ethtool_op_set_sg(struct net_device *dev, u32 data)
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
{
- if (data)
- dev->features |= NETIF_F_SG;
- else
- dev->features &= ~NETIF_F_SG;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
- return 0;
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
}
-EXPORT_SYMBOL(ethtool_op_set_sg);
-u32 ethtool_op_get_tso(struct net_device *dev)
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
{
- return (dev->features & NETIF_F_TSO) != 0;
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GUFO:
+ case ETHTOOL_SUFO:
+ return NETIF_F_UFO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
}
-EXPORT_SYMBOL(ethtool_op_get_tso);
-int ethtool_op_set_tso(struct net_device *dev, u32 data)
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
{
- if (data)
- dev->features |= NETIF_F_TSO;
- else
- dev->features &= ~NETIF_F_TSO;
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(ethtool_op_set_tso);
-u32 ethtool_op_get_ufo(struct net_device *dev)
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
{
- return (dev->features & NETIF_F_UFO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_ufo);
+ struct ethtool_value edata;
+ netdev_features_t mask;
-int ethtool_op_set_ufo(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_UFO;
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (!mask)
+ return -EOPNOTSUPP;
+
+ if (edata.data)
+ dev->wanted_features |= mask;
else
- dev->features &= ~NETIF_F_UFO;
+ dev->wanted_features &= ~mask;
+
+ __netdev_update_features(dev);
+
return 0;
}
-EXPORT_SYMBOL(ethtool_op_set_ufo);
-/* the following list of flags are the same as their associated
- * NETIF_F_xxx values in include/linux/netdevice.h
- */
-static const u32 flags_dup_features =
- (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
- ETH_FLAG_RXHASH);
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
-u32 ethtool_op_get_flags(struct net_device *dev)
+static u32 __ethtool_get_flags(struct net_device *dev)
{
- /* in the future, this function will probably contain additional
- * handling for flags which are not so easily handled
- * by a simple masking operation
- */
-
- return dev->features & flags_dup_features;
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
}
-EXPORT_SYMBOL(ethtool_op_get_flags);
-int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
{
- if (data & ~supported)
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
return -EINVAL;
- dev->features = ((dev->features & ~flags_dup_features) |
- (data & flags_dup_features));
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
return 0;
}
-EXPORT_SYMBOL(ethtool_op_set_flags);
-void ethtool_ntuple_flush(struct net_device *dev)
+int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
- struct ethtool_rx_ntuple_flow_spec_container *fsc, *f;
+ ASSERT_RTNL();
- list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) {
- list_del(&fsc->list);
- kfree(fsc);
- }
- dev->ethtool_ntuple_list.count = 0;
-}
-EXPORT_SYMBOL(ethtool_ntuple_flush);
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
-/* Handlers for each ethtool command */
+ memset(cmd, 0, sizeof(struct ethtool_cmd));
+ cmd->cmd = ETHTOOL_GSET;
+ return dev->ethtool_ops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(__ethtool_get_settings);
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
int err;
+ struct ethtool_cmd cmd;
- if (!dev->ethtool_ops->get_settings)
- return -EOPNOTSUPP;
-
- err = dev->ethtool_ops->get_settings(dev, &cmd);
+ err = __ethtool_get_settings(dev, &cmd);
if (err < 0)
return err;
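
__ethtool_get_settings(), exported in the hunk above, is the in-kernel way to query link settings without the ioctl copy-in/copy-out; callers must hold RTNL. A small sketch (not from this patch; the function and variable names are hypothetical) of how a stacking driver could read a lower device's speed:

	static int example_get_speed(struct net_device *lower_dev, u32 *speed)
	{
		struct ethtool_cmd ecmd;
		int err;

		ASSERT_RTNL();

		err = __ethtool_get_settings(lower_dev, &ecmd);
		if (err < 0)
			return err;

		*speed = ethtool_cmd_speed(&ecmd);	/* SPEED_100, SPEED_1000, ... */
		return 0;
	}
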
@@ -209,7 +376,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
memset(&info, 0, sizeof(info));
info.cmd = ETHTOOL_GDRVINFO;
- if (ops && ops->get_drvinfo) {
+ if (ops->get_drvinfo) {
ops->get_drvinfo(dev, &info);
} else if (dev->dev.parent && dev->dev.parent->driver) {
strlcpy(info.bus_info, dev_name(dev->dev.parent),
@@ -224,7 +391,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
* this method of obtaining string set info is deprecated;
* Use ETHTOOL_GSSET_INFO instead.
*/
- if (ops && ops->get_sset_count) {
+ if (ops->get_sset_count) {
int rc;
rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -237,9 +404,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
if (rc >= 0)
info.n_priv_flags = rc;
}
- if (ops && ops->get_regs_len)
+ if (ops->get_regs_len)
info.regdump_len = ops->get_regs_len(dev);
- if (ops && ops->get_eeprom_len)
+ if (ops->get_eeprom_len)
info.eedump_len = ops->get_eeprom_len(dev);
if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -251,14 +418,10 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_sset_info info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
u64 sset_mask;
int i, idx = 0, n_bits = 0, ret, rc;
u32 *info_buf = NULL;
- if (!ops->get_sset_count)
- return -EOPNOTSUPP;
-
if (copy_from_user(&info, useraddr, sizeof(info)))
return -EFAULT;
@@ -285,7 +448,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
if (!(sset_mask & (1ULL << i)))
continue;
- rc = ops->get_sset_count(dev, i);
+ rc = __ethtool_get_sset_count(dev, i);
if (rc >= 0) {
info.sset_mask |= (1ULL << i);
info_buf[idx++] = rc;
@@ -312,6 +475,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
{
struct ethtool_rxnfc info;
size_t info_size = sizeof(info);
+ int rc;
if (!dev->ethtool_ops->set_rxnfc)
return -EOPNOTSUPP;
@@ -327,7 +491,15 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
- return dev->ethtool_ops->set_rxnfc(dev, &info);
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ copy_to_user(useraddr, &info, info_size))
+ return -EFAULT;
+
+ return 0;
}
static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
@@ -385,37 +557,64 @@ err_out:
return ret;
}
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ struct ethtool_rxnfc *rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= rx_rings->data)
+ return -EINVAL;
+
+ return 0;
+}
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxfh_indir *indir;
- u32 table_size;
- size_t full_size;
+ u32 user_size, dev_size;
+ u32 *indir;
int ret;
- if (!dev->ethtool_ops->get_rxfh_indir)
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&table_size,
+ if (copy_from_user(&user_size,
useraddr + offsetof(struct ethtool_rxfh_indir, size),
- sizeof(table_size)))
+ sizeof(user_size)))
return -EFAULT;
- if (table_size >
- (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
- return -ENOMEM;
- full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kzalloc(full_size, GFP_USER);
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &dev_size, sizeof(dev_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < dev_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
if (!indir)
return -ENOMEM;
- indir->cmd = ETHTOOL_GRXFHINDIR;
- indir->size = table_size;
- ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
if (ret)
goto out;
- if (copy_to_user(useraddr, indir, full_size))
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ indir, dev_size * sizeof(indir[0])))
ret = -EFAULT;
out:
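
The size handling above gives ETHTOOL_GRXFHINDIR a two-step calling convention: a request with size == 0 only reports the device's table size, and a second request with a correctly sized buffer fetches the table itself. A userspace sketch under those assumptions (not part of the patch; error handling kept minimal):

	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	static struct ethtool_rxfh_indir *get_indir_table(int fd, const char *ifname)
	{
		struct ethtool_rxfh_indir query = {
			.cmd  = ETHTOOL_GRXFHINDIR,
			.size = 0,		/* step 1: ask for the table size */
		};
		struct ethtool_rxfh_indir *table;
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

		ifr.ifr_data = (void *)&query;
		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
			return NULL;

		table = calloc(1, sizeof(*table) + query.size * sizeof(__u32));
		if (!table)
			return NULL;
		table->cmd  = ETHTOOL_GRXFHINDIR;
		table->size = query.size;	/* step 2: fetch the full table */

		ifr.ifr_data = (void *)table;
		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
			free(table);
			return NULL;
		}
		return table;	/* table->ring_index[0..size-1] holds the RX rings */
	}
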
@@ -426,377 +625,220 @@ out:
static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxfh_indir *indir;
- u32 table_size;
- size_t full_size;
+ struct ethtool_rxnfc rx_rings;
+ u32 user_size, dev_size, i;
+ u32 *indir;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
+
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
+ !ops->get_rxnfc)
+ return -EOPNOTSUPP;
- if (!dev->ethtool_ops->set_rxfh_indir)
+ dev_size = ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&table_size,
+ if (copy_from_user(&user_size,
useraddr + offsetof(struct ethtool_rxfh_indir, size),
- sizeof(table_size)))
+ sizeof(user_size)))
return -EFAULT;
- if (table_size >
- (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
- return -ENOMEM;
- full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kmalloc(full_size, GFP_USER);
+ if (user_size != 0 && user_size != dev_size)
+ return -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
if (!indir)
return -ENOMEM;
- if (copy_from_user(indir, useraddr, full_size)) {
- ret = -EFAULT;
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
goto out;
+
+ if (user_size == 0) {
+ for (i = 0; i < dev_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + ringidx_offset,
+ &rx_rings,
+ dev_size);
+ if (ret)
+ goto out;
}
- ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
+ ret = ops->set_rxfh(dev, indir, NULL);
out:
kfree(indir);
return ret;
}
-static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
- struct ethtool_rx_ntuple_flow_spec *spec,
- struct ethtool_rx_ntuple_flow_spec_container *fsc)
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 user_indir_size, user_key_size;
+ u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh rxfh;
+ u32 total_size;
+ u32 indir_bytes;
+ u32 *indir = NULL;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+
+ if (!(dev->ethtool_ops->get_rxfh_indir_size ||
+ dev->ethtool_ops->get_rxfh_key_size) ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
- /* don't add filters forever */
- if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) {
- /* free the container */
- kfree(fsc);
- return;
- }
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
- /* Copy the whole filter over */
- fsc->fs.flow_type = spec->flow_type;
- memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u));
- memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u));
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
- fsc->fs.vlan_tag = spec->vlan_tag;
- fsc->fs.vlan_tag_mask = spec->vlan_tag_mask;
- fsc->fs.data = spec->data;
- fsc->fs.data_mask = spec->data_mask;
- fsc->fs.action = spec->action;
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
- /* add to the list */
- list_add_tail_rcu(&fsc->list, &list->list);
- list->count++;
-}
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
-/*
- * ethtool does not (or did not) set masks for flow parameters that are
- * not specified, so if both value and mask are 0 then this must be
- * treated as equivalent to a mask with all bits set. Implement that
- * here rather than in drivers.
- */
-static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
-{
- struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
- struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
-
- if (fs->flow_type != TCP_V4_FLOW &&
- fs->flow_type != UDP_V4_FLOW &&
- fs->flow_type != SCTP_V4_FLOW)
- return;
-
- if (!(entry->ip4src | mask->ip4src))
- mask->ip4src = htonl(0xffffffff);
- if (!(entry->ip4dst | mask->ip4dst))
- mask->ip4dst = htonl(0xffffffff);
- if (!(entry->psrc | mask->psrc))
- mask->psrc = htons(0xffff);
- if (!(entry->pdst | mask->pdst))
- mask->pdst = htons(0xffff);
- if (!(entry->tos | mask->tos))
- mask->tos = 0xff;
- if (!(fs->vlan_tag | fs->vlan_tag_mask))
- fs->vlan_tag_mask = 0xffff;
- if (!(fs->data | fs->data_mask))
- fs->data_mask = 0xffffffffffffffffULL;
-}
+ rxfh.indir_size = dev_indir_size;
+ rxfh.key_size = dev_key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
-static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
- void __user *useraddr)
-{
- struct ethtool_rx_ntuple cmd;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL;
- int ret;
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size and key size. Otherwise, if the user size is
+ * not equal to device table size or key size it's an error.
+ */
+ if (!user_indir_size && !user_key_size)
+ return 0;
- if (!(dev->features & NETIF_F_NTUPLE))
+ if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
+ (user_key_size && (user_key_size != dev_key_size)))
return -EINVAL;
- if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
- return -EFAULT;
+ indir_bytes = user_indir_size * sizeof(indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
- rx_ntuple_fix_masks(&cmd.fs);
+ if (user_indir_size)
+ indir = (u32 *)rss_config;
- /*
- * Cache filter in dev struct for GET operation only if
- * the underlying driver doesn't have its own GET operation, and
- * only if the filter was added successfully. First make sure we
- * can allocate the filter, then continue if successful.
- */
- if (!ops->get_rx_ntuple) {
- fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC);
- if (!fsc)
- return -ENOMEM;
- }
+ if (user_key_size)
+ hkey = rss_config + indir_bytes;
- ret = ops->set_rx_ntuple(dev, &cmd);
- if (ret) {
- kfree(fsc);
- return ret;
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
+ if (!ret) {
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size))
+ ret = -EFAULT;
}
- if (!ops->get_rx_ntuple)
- __rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc);
+ kfree(rss_config);
return ret;
}
-static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_gstrings gstrings;
+ int ret;
const struct ethtool_ops *ops = dev->ethtool_ops;
- struct ethtool_rx_ntuple_flow_spec_container *fsc;
- u8 *data;
- char *p;
- int ret, i, num_strings = 0;
+ struct ethtool_rxnfc rx_rings;
+ struct ethtool_rxfh rxfh;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 *indir = NULL, indir_bytes = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+
+ if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
+ !ops->get_rxnfc || !ops->set_rxfh)
+ return -EOPNOTSUPP;
- if (!ops->get_sset_count)
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
+ if ((dev_key_size + dev_indir_size) == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
- ret = ops->get_sset_count(dev, gstrings.string_set);
- if (ret < 0)
- return ret;
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
- gstrings.len = ret;
+ /* If either indir or hash key is valid, proceed further.
+ * It is not valid to request that both be unchanged.
+ */
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0))
+ return -EINVAL;
- data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
- if (!data)
+ if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ indir_bytes = dev_indir_size * sizeof(indir[0]);
+
+ rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ if (!rss_config)
return -ENOMEM;
- if (ops->get_rx_ntuple) {
- /* driver-specific filter grab */
- ret = ops->get_rx_ntuple(dev, gstrings.string_set, data);
- goto copy;
- }
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
- /* default ethtool filter grab */
- i = 0;
- p = (char *)data;
- list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) {
- sprintf(p, "Filter %d:\n", i);
- p += ETH_GSTRING_LEN;
- num_strings++;
-
- switch (fsc->fs.flow_type) {
- case TCP_V4_FLOW:
- sprintf(p, "\tFlow Type: TCP\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case UDP_V4_FLOW:
- sprintf(p, "\tFlow Type: UDP\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case SCTP_V4_FLOW:
- sprintf(p, "\tFlow Type: SCTP\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case AH_ESP_V4_FLOW:
- sprintf(p, "\tFlow Type: AH ESP\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case ESP_V4_FLOW:
- sprintf(p, "\tFlow Type: ESP\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case IP_USER_FLOW:
- sprintf(p, "\tFlow Type: Raw IP\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case IPV4_FLOW:
- sprintf(p, "\tFlow Type: IPv4\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- default:
- sprintf(p, "\tFlow Type: Unknown\n");
- p += ETH_GSTRING_LEN;
- num_strings++;
- goto unknown_filter;
- }
+ /* rxfh.indir_size == 0 means reset the indir table to default.
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ indir = (u32 *)rss_config;
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + rss_cfg_offset,
+ &rx_rings,
+ rxfh.indir_size);
+ if (ret)
+ goto out;
+ } else if (rxfh.indir_size == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ }
- /* now the rest of the filters */
- switch (fsc->fs.flow_type) {
- case TCP_V4_FLOW:
- case UDP_V4_FLOW:
- case SCTP_V4_FLOW:
- sprintf(p, "\tSrc IP addr: 0x%x\n",
- fsc->fs.h_u.tcp_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tSrc IP mask: 0x%x\n",
- fsc->fs.m_u.tcp_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP addr: 0x%x\n",
- fsc->fs.h_u.tcp_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP mask: 0x%x\n",
- fsc->fs.m_u.tcp_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tSrc Port: %d, mask: 0x%x\n",
- fsc->fs.h_u.tcp_ip4_spec.psrc,
- fsc->fs.m_u.tcp_ip4_spec.psrc);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest Port: %d, mask: 0x%x\n",
- fsc->fs.h_u.tcp_ip4_spec.pdst,
- fsc->fs.m_u.tcp_ip4_spec.pdst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tTOS: %d, mask: 0x%x\n",
- fsc->fs.h_u.tcp_ip4_spec.tos,
- fsc->fs.m_u.tcp_ip4_spec.tos);
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case AH_ESP_V4_FLOW:
- case ESP_V4_FLOW:
- sprintf(p, "\tSrc IP addr: 0x%x\n",
- fsc->fs.h_u.ah_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tSrc IP mask: 0x%x\n",
- fsc->fs.m_u.ah_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP addr: 0x%x\n",
- fsc->fs.h_u.ah_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP mask: 0x%x\n",
- fsc->fs.m_u.ah_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tSPI: %d, mask: 0x%x\n",
- fsc->fs.h_u.ah_ip4_spec.spi,
- fsc->fs.m_u.ah_ip4_spec.spi);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tTOS: %d, mask: 0x%x\n",
- fsc->fs.h_u.ah_ip4_spec.tos,
- fsc->fs.m_u.ah_ip4_spec.tos);
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case IP_USER_FLOW:
- sprintf(p, "\tSrc IP addr: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tSrc IP mask: 0x%x\n",
- fsc->fs.m_u.usr_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP addr: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP mask: 0x%x\n",
- fsc->fs.m_u.usr_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
- case IPV4_FLOW:
- sprintf(p, "\tSrc IP addr: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tSrc IP mask: 0x%x\n",
- fsc->fs.m_u.usr_ip4_spec.ip4src);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP addr: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tDest IP mask: 0x%x\n",
- fsc->fs.m_u.usr_ip4_spec.ip4dst);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.l4_4_bytes,
- fsc->fs.m_u.usr_ip4_spec.l4_4_bytes);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tTOS: %d, mask: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.tos,
- fsc->fs.m_u.usr_ip4_spec.tos);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tIP Version: %d, mask: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.ip_ver,
- fsc->fs.m_u.usr_ip4_spec.ip_ver);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tProtocol: %d, mask: 0x%x\n",
- fsc->fs.h_u.usr_ip4_spec.proto,
- fsc->fs.m_u.usr_ip4_spec.proto);
- p += ETH_GSTRING_LEN;
- num_strings++;
- break;
+ if (rxfh.key_size) {
+ hkey = rss_config + indir_bytes;
+ if (copy_from_user(hkey,
+ useraddr + rss_cfg_offset + indir_bytes,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out;
}
- sprintf(p, "\tVLAN: %d, mask: 0x%x\n",
- fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data);
- p += ETH_GSTRING_LEN;
- num_strings++;
- sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask);
- p += ETH_GSTRING_LEN;
- num_strings++;
- if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP)
- sprintf(p, "\tAction: Drop\n");
- else
- sprintf(p, "\tAction: Direct to queue %d\n",
- fsc->fs.action);
- p += ETH_GSTRING_LEN;
- num_strings++;
-unknown_filter:
- i++;
}
-copy:
- /* indicate to userspace how many strings we actually have */
- gstrings.len = num_strings;
- ret = -EFAULT;
- if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
- goto out;
- useraddr += sizeof(gstrings);
- if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
- goto out;
- ret = 0;
+
+ ret = ops->set_rxfh(dev, indir, hkey);
out:
- kfree(data);
+ kfree(rss_config);
return ret;
}
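/* Editorial note: ethtool_set_rxfh() above distinguishes three cases for
 * the indirection table: a user-supplied table (validated against the RX
 * ring count via ethtool_copy_validate_indir()), ETH_RXFH_INDIR_NO_CHANGE,
 * and indir_size == 0, which resets the table to the spread produced by
 * ethtool_rxfh_indir_default(). That helper is essentially a round-robin
 * over the active RX rings, roughly:
 */
static inline u32 rxfh_indir_default_sketch(u32 index, u32 n_rx_rings)
{
	/* table slot i steers to ring (i mod number of RX rings) */
	return index % n_rx_rings;
}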
@@ -817,8 +859,8 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = vmalloc(reglen);
- if (!regbuf)
+ regbuf = vzalloc(reglen);
+ if (reglen && !regbuf)
return -ENOMEM;
ops->get_regs(dev, &regs, regbuf);
@@ -827,7 +869,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (copy_to_user(useraddr, &regs, sizeof(regs)))
goto out;
useraddr += offsetof(struct ethtool_regs, data);
- if (copy_to_user(useraddr, regbuf, regs.len))
+ if (regbuf && copy_to_user(useraddr, regbuf, regs.len))
goto out;
ret = 0;
@@ -883,6 +925,40 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
return dev->ethtool_ops->set_wol(dev, &wol);
}
+static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_eee edata;
+ int rc;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+
+ memset(&edata, 0, sizeof(struct ethtool_eee));
+ edata.cmd = ETHTOOL_GEEE;
+ rc = dev->ethtool_ops->get_eee(dev, &edata);
+
+ if (rc)
+ return rc;
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_eee edata;
+
+ if (!dev->ethtool_ops->set_eee)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_eee(dev, &edata);
+}
+
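/* Editorial sketch: the new ETHTOOL_GEEE/SEEE handlers are thin
 * pass-throughs to the driver's get_eee()/set_eee() callbacks. Querying the
 * state from userspace follows the same ifreq/SIOCETHTOOL pattern as the
 * GRSSH example earlier; this fragment reuses that example's fd and ifr,
 * and assumes the struct ethtool_eee fields (eee_enabled, eee_active) from
 * the uapi header of this era.
 */
struct ethtool_eee eee = { .cmd = ETHTOOL_GEEE };

ifr.ifr_data = (void *)&eee;
if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
	printf("EEE %s (active: %u)\n",
	       eee.eee_enabled ? "enabled" : "disabled", eee.eee_active);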
static int ethtool_nway_reset(struct net_device *dev)
{
if (!dev->ethtool_ops->nway_reset)
@@ -891,18 +967,31 @@ static int ethtool_nway_reset(struct net_device *dev)
return dev->ethtool_ops->nway_reset(dev);
}
-static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
+
+ if (!dev->ethtool_ops->get_link)
+ return -EOPNOTSUPP;
+
+ edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
+ int (*getter)(struct net_device *,
+ struct ethtool_eeprom *, u8 *),
+ u32 total_len)
{
struct ethtool_eeprom eeprom;
- const struct ethtool_ops *ops = dev->ethtool_ops;
void __user *userbuf = useraddr + sizeof(eeprom);
u32 bytes_remaining;
u8 *data;
int ret = 0;
- if (!ops->get_eeprom || !ops->get_eeprom_len)
- return -EOPNOTSUPP;
-
if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
return -EFAULT;
@@ -911,7 +1000,7 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
return -EINVAL;
/* Check for exceeding total eeprom len */
- if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+ if (eeprom.offset + eeprom.len > total_len)
return -EINVAL;
data = kmalloc(PAGE_SIZE, GFP_USER);
@@ -922,7 +1011,7 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
while (bytes_remaining > 0) {
eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
- ret = ops->get_eeprom(dev, &eeprom, data);
+ ret = getter(dev, &eeprom, data);
if (ret)
break;
if (copy_to_user(userbuf, data, eeprom.len)) {
@@ -943,6 +1032,17 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
return ret;
}
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_eeprom || !ops->get_eeprom_len)
+ return -EOPNOTSUPP;
+
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom,
+ ops->get_eeprom_len(dev));
+}
+
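/* Editorial sketch: ethtool_get_any_eeprom() factors the chunked user-copy
 * loop out of ethtool_get_eeprom() so the same PAGE_SIZE-at-a-time logic
 * can be reused for ETHTOOL_GMODULEEEPROM later in this patch. A driver's
 * getter therefore only has to satisfy one bounded window per call, e.g.
 * (hypothetical "foo_" driver; foo_priv, eeprom_shadow and FOO_EEPROM_MAGIC
 * are illustrative, not real symbols):
 */
static int foo_get_eeprom(struct net_device *dev,
			  struct ethtool_eeprom *eeprom, u8 *data)
{
	struct foo_priv *priv = netdev_priv(dev);

	eeprom->magic = FOO_EEPROM_MAGIC;
	/* The core guarantees offset + len <= get_eeprom_len(dev) and
	 * len <= PAGE_SIZE, so one bounded copy per call is enough. */
	memcpy(data, priv->eeprom_shadow + eeprom->offset, eeprom->len);
	return 0;
}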
static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
{
struct ethtool_eeprom eeprom;
@@ -1046,190 +1146,60 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
return dev->ethtool_ops->set_ringparam(dev, &ringparam);
}
-static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
- if (!dev->ethtool_ops->get_pauseparam)
+ if (!dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
- dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
+ dev->ethtool_ops->get_channels(dev, &channels);
- if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
+ if (copy_to_user(useraddr, &channels, sizeof(channels)))
return -EFAULT;
return 0;
}
-static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
-{
- struct ethtool_pauseparam pauseparam;
-
- if (!dev->ethtool_ops->set_pauseparam)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
- return -EFAULT;
-
- return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
-}
-
-static int __ethtool_set_sg(struct net_device *dev, u32 data)
-{
- int err;
-
- if (!data && dev->ethtool_ops->set_tso) {
- err = dev->ethtool_ops->set_tso(dev, 0);
- if (err)
- return err;
- }
-
- if (!data && dev->ethtool_ops->set_ufo) {
- err = dev->ethtool_ops->set_ufo(dev, 0);
- if (err)
- return err;
- }
- return dev->ethtool_ops->set_sg(dev, data);
-}
-
-static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
- int err;
-
- if (!dev->ethtool_ops->set_tx_csum)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg) {
- err = __ethtool_set_sg(dev, 0);
- if (err)
- return err;
- }
-
- return dev->ethtool_ops->set_tx_csum(dev, edata.data);
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-
-static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_rx_csum)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg)
- dev->features &= ~NETIF_F_GRO;
-
- return dev->ethtool_ops->set_rx_csum(dev, edata.data);
-}
-
-static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_sg)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data &&
- !(dev->features & NETIF_F_ALL_CSUM))
- return -EINVAL;
-
- return __ethtool_set_sg(dev, edata.data);
-}
-
-static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_value edata;
+ struct ethtool_channels channels;
- if (!dev->ethtool_ops->set_tso)
+ if (!dev->ethtool_ops->set_channels)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
-
- return dev->ethtool_ops->set_tso(dev, edata.data);
+ return dev->ethtool_ops->set_channels(dev, &channels);
}
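/* Editorial sketch: the ETHTOOL_SCHANNELS handler added above copies the
 * request straight through to the driver, so range checking against the
 * maxima reported by get_channels() is the driver's responsibility. A
 * typical callback pair might look like this (hypothetical "foo_" driver
 * with a combined-queue model; foo_priv and its fields are illustrative):
 */
static void foo_get_channels(struct net_device *dev,
			     struct ethtool_channels *ch)
{
	struct foo_priv *priv = netdev_priv(dev);

	ch->max_combined = priv->max_queue_pairs;
	ch->combined_count = priv->cur_queue_pairs;
}

static int foo_set_channels(struct net_device *dev,
			    struct ethtool_channels *ch)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Reject anything outside what foo_get_channels() advertised. */
	if (ch->rx_count || ch->tx_count || ch->other_count ||
	    !ch->combined_count ||
	    ch->combined_count > priv->max_queue_pairs)
		return -EINVAL;

	priv->cur_queue_pairs = ch->combined_count;
	return 0;	/* real hardware would reconfigure its queues here */
}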
-static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value edata;
+ struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
- if (!dev->ethtool_ops->set_ufo)
+ if (!dev->ethtool_ops->get_pauseparam)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
- if (edata.data && !((dev->features & NETIF_F_GEN_CSUM) ||
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
- == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
- return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, edata.data);
-}
-static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GGSO };
-
- edata.data = dev->features & NETIF_F_GSO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
+ dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
return -EFAULT;
- if (edata.data)
- dev->features |= NETIF_F_GSO;
- else
- dev->features &= ~NETIF_F_GSO;
return 0;
}
-static int ethtool_get_gro(struct net_device *dev, char __user *useraddr)
+static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GGRO };
-
- edata.data = dev->features & NETIF_F_GRO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
+ struct ethtool_pauseparam pauseparam;
-static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
+ if (!dev->ethtool_ops->set_pauseparam)
+ return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
return -EFAULT;
- if (edata.data) {
- u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
- dev->ethtool_ops->get_rx_csum(dev) :
- ethtool_op_get_rx_csum(dev);
-
- if (!rxcsum)
- return -EINVAL;
- dev->features |= NETIF_F_GRO;
- } else
- dev->features &= ~NETIF_F_GRO;
-
- return 0;
+ return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
}
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -1273,17 +1243,13 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gstrings gstrings;
- const struct ethtool_ops *ops = dev->ethtool_ops;
u8 *data;
int ret;
- if (!ops->get_strings || !ops->get_sset_count)
- return -EOPNOTSUPP;
-
if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
return -EFAULT;
- ret = ops->get_sset_count(dev, gstrings.string_set);
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
if (ret < 0)
return ret;
@@ -1293,7 +1259,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
if (!data)
return -ENOMEM;
- ops->get_strings(dev, gstrings.string_set, data);
+ __ethtool_get_strings(dev, gstrings.string_set, data);
ret = -EFAULT;
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1303,7 +1269,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
goto out;
ret = 0;
- out:
+out:
kfree(data);
return ret;
}
@@ -1311,14 +1277,61 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
{
struct ethtool_value id;
+ static bool busy;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int rc;
- if (!dev->ethtool_ops->phys_id)
+ if (!ops->set_phys_id)
return -EOPNOTSUPP;
+ if (busy)
+ return -EBUSY;
+
if (copy_from_user(&id, useraddr, sizeof(id)))
return -EFAULT;
- return dev->ethtool_ops->phys_id(dev, id.data);
+ rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
+ if (rc < 0)
+ return rc;
+
+ /* Drop the RTNL lock while waiting, but prevent reentry or
+ * removal of the device.
+ */
+ busy = true;
+ dev_hold(dev);
+ rtnl_unlock();
+
+ if (rc == 0) {
+ /* Driver will handle this itself */
+ schedule_timeout_interruptible(
+ id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
+ } else {
+ /* Driver expects to be called at twice the frequency in rc */
+ int n = rc * 2, i, interval = HZ / n;
+
+ /* Count down seconds */
+ do {
+ /* Count down iterations per second */
+ i = n;
+ do {
+ rtnl_lock();
+ rc = ops->set_phys_id(dev,
+ (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && --i != 0);
+ } while (!signal_pending(current) &&
+ (id.data == 0 || --id.data != 0));
+ }
+
+ rtnl_lock();
+ dev_put(dev);
+ busy = false;
+
+ (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
+ return rc;
}
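/* Editorial sketch: ethtool_phys_id() above implements the set_phys_id()
 * contract. The driver is first called with ETHTOOL_ID_ACTIVE and either
 * returns 0 (it will blink the LED itself for the requested duration) or a
 * positive blink frequency; in the latter case the core calls back with
 * ETHTOOL_ID_ON/ETHTOOL_ID_OFF at twice that rate and finishes with
 * ETHTOOL_ID_INACTIVE. A synchronous-style driver callback might look like
 * this (foo_set_led()/foo_set_led_auto() are hypothetical helpers):
 */
static int foo_set_phys_id(struct net_device *dev,
			   enum ethtool_phys_id_state state)
{
	switch (state) {
	case ETHTOOL_ID_ACTIVE:
		return 2;			/* blink at 2 Hz, core does the pacing */
	case ETHTOOL_ID_ON:
		foo_set_led(dev, true);
		break;
	case ETHTOOL_ID_OFF:
		foo_set_led(dev, false);
		break;
	case ETHTOOL_ID_INACTIVE:
		foo_set_led_auto(dev);		/* restore normal LED behaviour */
		break;
	}
	return 0;
}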
static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
@@ -1433,10 +1446,182 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
if (!dev->ethtool_ops->flash_device)
return -EOPNOTSUPP;
+ efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+
return dev->ethtool_ops->flash_device(dev, &efl);
}
-/* The main entry point in this file. Called from net/core/dev.c */
+static int ethtool_set_dump(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_dump dump;
+
+ if (!dev->ethtool_ops->set_dump)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_dump(dev, &dump);
+}
+
+static int ethtool_get_dump_flag(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_dump dump;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ ret = ops->get_dump_flag(dev, &dump);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_dump_data(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ __u32 len;
+ struct ethtool_dump dump, tmp;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data = NULL;
+
+ if (!ops->get_dump_data || !ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
+ ret = ops->get_dump_flag(dev, &tmp);
+ if (ret)
+ return ret;
+
+ len = min(tmp.len, dump.len);
+ if (!len)
+ return -EFAULT;
+
+ /* Don't ever let the driver think there's more space available
+ * than it requested with .get_dump_flag().
+ */
+ dump.len = len;
+
+ /* Always allocate enough space to hold the whole thing so that the
+ * driver does not need to check the length and bother with partial
+ * dumping.
+ */
+ data = vzalloc(tmp.len);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_dump_data(dev, &dump, data);
+ if (ret)
+ goto out;
+
+ /* There are two sane possibilities:
+ * 1. The driver's .get_dump_data() does not touch dump.len.
+ * 2. Or it may set dump.len to how much it really writes, which
+ * should be tmp.len (or len if it can do a partial dump).
+ * In any case respond to userspace with the actual length of data
+ * it's receiving.
+ */
+ WARN_ON(dump.len != len && dump.len != tmp.len);
+ dump.len = len;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ useraddr += offsetof(struct ethtool_dump, data);
+ if (copy_to_user(useraddr, data, len))
+ ret = -EFAULT;
+out:
+ vfree(data);
+ return ret;
+}
+
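/* Editorial sketch: ethtool_get_dump_data() expects the same two-step flow
 * as GRSSH: userspace first issues ETHTOOL_GET_DUMP_FLAG to learn the dump
 * length, then allocates that many bytes and issues ETHTOOL_GET_DUMP_DATA.
 * Fragment only, reusing the fd/ifr setup from the GRSSH example and the
 * struct ethtool_dump layout (cmd, version, flag, len, data[]):
 */
struct ethtool_dump flag = { .cmd = ETHTOOL_GET_DUMP_FLAG };
struct ethtool_dump *dump;

ifr.ifr_data = (void *)&flag;
if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
	return 1;

dump = calloc(1, sizeof(*dump) + flag.len);
dump->cmd = ETHTOOL_GET_DUMP_DATA;
dump->len = flag.len;		/* never ask for more than was advertised */
ifr.ifr_data = (void *)dump;
if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
	printf("dump version %u, %u bytes\n", dump->version, dump->len);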
+static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_ts_info info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GET_TS_INFO;
+
+ if (phydev && phydev->drv && phydev->drv->ts_info) {
+ err = phydev->drv->ts_info(phydev, &info);
+ } else if (ops->get_ts_info) {
+ err = ops->get_ts_info(dev, &info);
+ } else {
+ info.so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info.phc_index = -1;
+ }
+
+ if (err)
+ return err;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int ethtool_get_module_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_module_info)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&modinfo, useraddr, sizeof(modinfo)))
+ return -EFAULT;
+
+ ret = ops->get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &modinfo, sizeof(modinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_get_module_eeprom(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_module_info || !ops->get_module_eeprom)
+ return -EOPNOTSUPP;
+
+ ret = ops->get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_module_eeprom,
+ modinfo.eeprom_len);
+}
+
+/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
{
@@ -1444,7 +1629,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
void __user *useraddr = ifr->ifr_data;
u32 ethcmd;
int rc;
- unsigned long old_features;
+ netdev_features_t old_features;
if (!dev || !netif_device_present(dev))
return -ENODEV;
@@ -1452,28 +1637,21 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
- if (!dev->ethtool_ops) {
- /* ETHTOOL_GDRVINFO does not require any driver support.
- * It is also unprivileged and does not change anything,
- * so we can take a shortcut to it. */
- if (ethcmd == ETHTOOL_GDRVINFO)
- return ethtool_get_drvinfo(dev, useraddr);
- else
- return -EOPNOTSUPP;
- }
-
/* Allow some commands to be done by anyone */
switch (ethcmd) {
case ETHTOOL_GSET:
case ETHTOOL_GDRVINFO:
case ETHTOOL_GMSGLVL:
+ case ETHTOOL_GLINK:
case ETHTOOL_GCOALESCE:
case ETHTOOL_GRINGPARAM:
case ETHTOOL_GPAUSEPARAM:
case ETHTOOL_GRXCSUM:
case ETHTOOL_GTXCSUM:
case ETHTOOL_GSG:
+ case ETHTOOL_GSSET_INFO:
case ETHTOOL_GSTRINGS:
+ case ETHTOOL_GSTATS:
case ETHTOOL_GTSO:
case ETHTOOL_GPERMADDR:
case ETHTOOL_GUFO:
@@ -1486,9 +1664,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GRXCLSRLCNT:
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
+ case ETHTOOL_GFEATURES:
+ case ETHTOOL_GCHANNELS:
+ case ETHTOOL_GET_TS_INFO:
+ case ETHTOOL_GEEE:
break;
default:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
}
@@ -1526,12 +1710,17 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
rc = ethtool_set_value_void(dev, useraddr,
dev->ethtool_ops->set_msglevel);
break;
+ case ETHTOOL_GEEE:
+ rc = ethtool_get_eee(dev, useraddr);
+ break;
+ case ETHTOOL_SEEE:
+ rc = ethtool_set_eee(dev, useraddr);
+ break;
case ETHTOOL_NWAY_RST:
rc = ethtool_nway_reset(dev);
break;
case ETHTOOL_GLINK:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- dev->ethtool_ops->get_link);
+ rc = ethtool_get_link(dev, useraddr);
break;
case ETHTOOL_GEEPROM:
rc = ethtool_get_eeprom(dev, useraddr);
@@ -1557,42 +1746,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SPAUSEPARAM:
rc = ethtool_set_pauseparam(dev, useraddr);
break;
- case ETHTOOL_GRXCSUM:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_rx_csum ?
- dev->ethtool_ops->get_rx_csum :
- ethtool_op_get_rx_csum));
- break;
- case ETHTOOL_SRXCSUM:
- rc = ethtool_set_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_GTXCSUM:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_tx_csum ?
- dev->ethtool_ops->get_tx_csum :
- ethtool_op_get_tx_csum));
- break;
- case ETHTOOL_STXCSUM:
- rc = ethtool_set_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_GSG:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_sg ?
- dev->ethtool_ops->get_sg :
- ethtool_op_get_sg));
- break;
- case ETHTOOL_SSG:
- rc = ethtool_set_sg(dev, useraddr);
- break;
- case ETHTOOL_GTSO:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_tso ?
- dev->ethtool_ops->get_tso :
- ethtool_op_get_tso));
- break;
- case ETHTOOL_STSO:
- rc = ethtool_set_tso(dev, useraddr);
- break;
case ETHTOOL_TEST:
rc = ethtool_self_test(dev, useraddr);
break;
@@ -1608,30 +1761,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPERMADDR:
rc = ethtool_get_perm_addr(dev, useraddr);
break;
- case ETHTOOL_GUFO:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_ufo ?
- dev->ethtool_ops->get_ufo :
- ethtool_op_get_ufo));
- break;
- case ETHTOOL_SUFO:
- rc = ethtool_set_ufo(dev, useraddr);
- break;
- case ETHTOOL_GGSO:
- rc = ethtool_get_gso(dev, useraddr);
- break;
- case ETHTOOL_SGSO:
- rc = ethtool_set_gso(dev, useraddr);
- break;
case ETHTOOL_GFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_flags ?
- dev->ethtool_ops->get_flags :
- ethtool_op_get_flags));
+ __ethtool_get_flags);
break;
case ETHTOOL_SFLAGS:
- rc = ethtool_set_value(dev, useraddr,
- dev->ethtool_ops->set_flags);
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
break;
case ETHTOOL_GPFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
@@ -1653,24 +1788,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXCLSRLINS:
rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
break;
- case ETHTOOL_GGRO:
- rc = ethtool_get_gro(dev, useraddr);
- break;
- case ETHTOOL_SGRO:
- rc = ethtool_set_gro(dev, useraddr);
- break;
case ETHTOOL_FLASHDEV:
rc = ethtool_flash_device(dev, useraddr);
break;
case ETHTOOL_RESET:
rc = ethtool_reset(dev, useraddr);
break;
- case ETHTOOL_SRXNTUPLE:
- rc = ethtool_set_rx_ntuple(dev, useraddr);
- break;
- case ETHTOOL_GRXNTUPLE:
- rc = ethtool_get_rx_ntuple(dev, useraddr);
- break;
case ETHTOOL_GSSET_INFO:
rc = ethtool_get_sset_info(dev, useraddr);
break;
@@ -1680,6 +1803,60 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXFHINDIR:
rc = ethtool_set_rxfh_indir(dev, useraddr);
break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
+ case ETHTOOL_SUFO:
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SCHANNELS:
+ rc = ethtool_set_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SET_DUMP:
+ rc = ethtool_set_dump(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_FLAG:
+ rc = ethtool_get_dump_flag(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_DATA:
+ rc = ethtool_get_dump_data(dev, useraddr);
+ break;
+ case ETHTOOL_GET_TS_INFO:
+ rc = ethtool_get_ts_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEINFO:
+ rc = ethtool_get_module_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEEEPROM:
+ rc = ethtool_get_module_eeprom(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 82a4369ae15..185c341fafb 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_rules.h>
@@ -32,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->flags = flags;
r->fr_net = hold_net(ops->fro_net);
+ r->suppress_prefixlen = -1;
+ r->suppress_ifgroup = -1;
+
/* The lock is not required here, the list is unreachable
* at the moment this function is called */
list_add_tail(&r->list, &ops->rules_list);
@@ -150,6 +154,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
list_del_rcu(&rule->list);
+ if (ops->delete)
+ ops->delete(rule);
fib_rule_put(rule);
}
}
@@ -181,14 +187,13 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
{
int ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->iif) &&
- !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF))
+ if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
goto out;
- if (rule->oifindex && (rule->oifindex != fl->oif))
+ if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
goto out;
- if ((rule->mark ^ fl->mark) & rule->mark_mask)
+ if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
ret = ops->match(rule, fl, flags);
@@ -224,6 +229,9 @@ jumped:
else
err = ops->action(rule, fl, flags, arg);
+ if (!err && ops->suppress && ops->suppress(rule, arg))
+ continue;
+
if (err != -EAGAIN) {
if ((arg->flags & FIB_LOOKUP_NOREF) ||
likely(atomic_inc_not_zero(&rule->refcnt))) {
@@ -264,7 +272,7 @@ errout:
return err;
}
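/* Editorial sketch: the ops->suppress() hook added above runs after a
 * rule's action has produced a result; returning true discards that result
 * and lets the lookup fall through to later rules. The address-family
 * suppressors built on it compare the matched route against the rule's
 * suppress_prefixlen / suppress_ifgroup thresholds, roughly as below
 * (schematic only; example_result_prefixlen() is a hypothetical accessor,
 * not the in-tree code). With iproute2 this surfaces as, e.g.,
 * "ip rule add ... suppress_prefixlength 0", which hides default routes
 * found by that rule.
 */
static bool example_rule_suppress(struct fib_rule *rule,
				  struct fib_lookup_arg *arg)
{
	int prefixlen = example_result_prefixlen(arg);

	/* A route no more specific than the configured threshold is
	 * suppressed; an unset threshold (-1) suppresses nothing. */
	return prefixlen <= rule->suppress_prefixlen;
}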
-static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -335,6 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
rule->action = frh->action;
rule->flags = frh->flags;
rule->table = frh_get_table(frh, tb);
+ if (tb[FRA_SUPPRESS_PREFIXLEN])
+ rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ else
+ rule->suppress_prefixlen = -1;
+
+ if (tb[FRA_SUPPRESS_IFGROUP])
+ rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ else
+ rule->suppress_ifgroup = -1;
if (!tb[FRA_PRIORITY] && ops->default_pref)
rule->pref = ops->default_pref(ops);
@@ -385,8 +402,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
*/
list_for_each_entry(r, &ops->rules_list, list) {
if (r->action == FR_ACT_GOTO &&
- r->target == rule->pref) {
- BUG_ON(rtnl_dereference(r->ctarget) != NULL);
+ r->target == rule->pref &&
+ rtnl_dereference(r->ctarget) == NULL) {
rcu_assign_pointer(r->ctarget, rule);
if (--ops->unresolved_rules == 0)
break;
@@ -400,7 +417,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (unresolved)
ops->unresolved_rules++;
- notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
@@ -413,7 +430,7 @@ errout:
return err;
}
-static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -443,7 +460,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (frh->action && (frh->action != rule->action))
continue;
- if (frh->table && (frh_get_table(frh, tb) != rule->table))
+ if (frh_get_table(frh, tb) &&
+ (frh_get_table(frh, tb) != rule->table))
continue;
if (tb[FRA_PRIORITY] &&
@@ -476,8 +494,11 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
list_del_rcu(&rule->list);
- if (rule->action == FR_ACT_GOTO)
+ if (rule->action == FR_ACT_GOTO) {
ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
/*
* Check if this rule is a target to any of them. If so,
@@ -488,14 +509,16 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (ops->nr_goto_rules > 0) {
list_for_each_entry(tmp, &ops->rules_list, list) {
if (rtnl_dereference(tmp->ctarget) == rule) {
- rcu_assign_pointer(tmp->ctarget, NULL);
+ RCU_INIT_POINTER(tmp->ctarget, NULL);
ops->unresolved_rules++;
}
}
}
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).pid);
+ NETLINK_CB(skb).portid);
+ if (ops->delete)
+ ops->delete(rule);
fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
@@ -516,6 +539,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
+ nla_total_size(4) /* FRA_PRIORITY */
+ nla_total_size(4) /* FRA_TABLE */
+ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4); /* FRA_FWMASK */
@@ -539,41 +564,47 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh = nlmsg_data(nlh);
frh->family = ops->family;
frh->table = rule->table;
- NLA_PUT_U32(skb, FRA_TABLE, rule->table);
+ if (nla_put_u32(skb, FRA_TABLE, rule->table))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
+ goto nla_put_failure;
frh->res1 = 0;
frh->res2 = 0;
frh->action = rule->action;
frh->flags = rule->flags;
if (rule->action == FR_ACT_GOTO &&
- rcu_dereference_raw(rule->ctarget) == NULL)
+ rcu_access_pointer(rule->ctarget) == NULL)
frh->flags |= FIB_RULE_UNRESOLVED;
if (rule->iifname[0]) {
- NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname);
-
+ if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
+ goto nla_put_failure;
if (rule->iifindex == -1)
frh->flags |= FIB_RULE_IIF_DETACHED;
}
if (rule->oifname[0]) {
- NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname);
-
+ if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
+ goto nla_put_failure;
if (rule->oifindex == -1)
frh->flags |= FIB_RULE_OIF_DETACHED;
}
- if (rule->pref)
- NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
-
- if (rule->mark)
- NLA_PUT_U32(skb, FRA_FWMARK, rule->mark);
-
- if (rule->mark_mask || rule->mark)
- NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask);
+ if ((rule->pref &&
+ nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
+ (rule->mark &&
+ nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
+ ((rule->mark_mask || rule->mark) &&
+ nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
+ (rule->target &&
+ nla_put_u32(skb, FRA_GOTO, rule->target)))
+ goto nla_put_failure;
- if (rule->target)
- NLA_PUT_U32(skb, FRA_GOTO, rule->target);
+ if (rule->suppress_ifgroup != -1) {
+ if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+ goto nla_put_failure;
+ }
if (ops->fill(rule, skb, frh) < 0)
goto nla_put_failure;
@@ -591,17 +622,19 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
int idx = 0;
struct fib_rule *rule;
- list_for_each_entry(rule, &ops->rules_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
if (idx < cb->args[1])
goto skip;
- if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
+ if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWRULE,
NLM_F_MULTI, ops) < 0)
break;
skip:
idx++;
}
+ rcu_read_unlock();
cb->args[1] = idx;
rules_ops_put(ops);
@@ -698,9 +731,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
static int fib_rules_event(struct notifier_block *this, unsigned long event,
- void *ptr)
+ void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
struct fib_rules_ops *ops;
@@ -712,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,
attach_rules(&ops->rules_list, dev);
break;
+ case NETDEV_CHANGENAME:
+ list_for_each_entry(ops, &net->rules_ops, list) {
+ detach_rules(&ops->rules_list, dev);
+ attach_rules(&ops->rules_list, dev);
+ }
+ break;
+
case NETDEV_UNREGISTER:
list_for_each_entry(ops, &net->rules_ops, list)
detach_rules(&ops->rules_list, dev);
@@ -739,9 +779,9 @@ static struct pernet_operations fib_rules_net_ops = {
static int __init fib_rules_init(void)
{
int err;
- rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule);
+ rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
err = register_pernet_subsys(&fib_rules_net_ops);
if (err < 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index 25500f16a18..1dbf6462f76 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,11 +1,16 @@
/*
* Linux Socket Filter - Kernel level socket filtering
*
- * Author:
- * Jay Schulist <jschlst@samba.org>
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
*
- * Based on the design of:
- * - The Berkeley Packet Filter
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -33,62 +38,39 @@
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
-#include <linux/reciprocal_div.h>
-
-enum {
- BPF_S_RET_K = 1,
- BPF_S_RET_A,
- BPF_S_ALU_ADD_K,
- BPF_S_ALU_ADD_X,
- BPF_S_ALU_SUB_K,
- BPF_S_ALU_SUB_X,
- BPF_S_ALU_MUL_K,
- BPF_S_ALU_MUL_X,
- BPF_S_ALU_DIV_X,
- BPF_S_ALU_AND_K,
- BPF_S_ALU_AND_X,
- BPF_S_ALU_OR_K,
- BPF_S_ALU_OR_X,
- BPF_S_ALU_LSH_K,
- BPF_S_ALU_LSH_X,
- BPF_S_ALU_RSH_K,
- BPF_S_ALU_RSH_X,
- BPF_S_ALU_NEG,
- BPF_S_LD_W_ABS,
- BPF_S_LD_H_ABS,
- BPF_S_LD_B_ABS,
- BPF_S_LD_W_LEN,
- BPF_S_LD_W_IND,
- BPF_S_LD_H_IND,
- BPF_S_LD_B_IND,
- BPF_S_LD_IMM,
- BPF_S_LDX_W_LEN,
- BPF_S_LDX_B_MSH,
- BPF_S_LDX_IMM,
- BPF_S_MISC_TAX,
- BPF_S_MISC_TXA,
- BPF_S_ALU_DIV_K,
- BPF_S_LD_MEM,
- BPF_S_LDX_MEM,
- BPF_S_ST,
- BPF_S_STX,
- BPF_S_JMP_JA,
- BPF_S_JMP_JEQ_K,
- BPF_S_JMP_JEQ_X,
- BPF_S_JMP_JGE_K,
- BPF_S_JMP_JGE_X,
- BPF_S_JMP_JGT_K,
- BPF_S_JMP_JGT_X,
- BPF_S_JMP_JSET_K,
- BPF_S_JMP_JSET_X,
-};
-
-/* No hurry in this branch */
-static void *__load_pointer(const struct sk_buff *skb, int k)
+#include <linux/ratelimit.h>
+#include <linux/seccomp.h>
+#include <linux/if_vlan.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
u8 *ptr = NULL;
@@ -96,9 +78,9 @@ static void *__load_pointer(const struct sk_buff *skb, int k)
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
- if (ptr >= skb->head && ptr < skb_tail_pointer(skb))
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
+
return NULL;
}
@@ -107,11 +89,8 @@ static inline void *load_pointer(const struct sk_buff *skb, int k,
{
if (k >= 0)
return skb_header_pointer(skb, k, size, buffer);
- else {
- if (k >= SKF_AD_OFF)
- return NULL;
- return __load_pointer(skb, k);
- }
+
+ return bpf_internal_load_pointer_neg_helper(skb, k, size);
}
/**
@@ -131,344 +110,1032 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
int err;
struct sk_filter *filter;
+ /*
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
+ */
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ return -ENOMEM;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
- rcu_read_lock_bh();
- filter = rcu_dereference_bh(sk->sk_filter);
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
if (filter) {
- unsigned int pkt_len = sk_run_filter(skb, filter->insns);
+ unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(sk_filter);
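/* Editorial example: sk_filter() above is what ultimately runs a socket's
 * attached program, now via SK_RUN_FILTER on the translated internal
 * representation. Classic BPF programs still enter the kernel the same way
 * as before, e.g. through setsockopt(SO_ATTACH_FILTER). A small
 * self-contained program that accepts only IPv4 frames on a packet socket
 * (needs CAP_NET_RAW; the filter is the classic "EtherType == ETH_P_IP"
 * check):
 */
#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/filter.h>

int main(void)
{
	struct sock_filter code[] = {
		{ 0x28, 0, 0, 0x0000000c },	/* ldh [12]      ; load EtherType */
		{ 0x15, 0, 1, 0x00000800 },	/* jeq #0x0800   ; IPv4?          */
		{ 0x06, 0, 0, 0x0000ffff },	/* ret #0xffff   ; accept packet  */
		{ 0x06, 0, 0, 0x00000000 },	/* ret #0        ; drop packet    */
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)))
		perror("SO_ATTACH_FILTER");
	/* recvfrom(fd, ...) now only ever sees IPv4 frames. */
	return 0;
}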
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
/**
- * sk_run_filter - run a filter on a socket
- * @skb: buffer to run the filter on
- * @filter: filter to apply
+ * __sk_run_filter - run a filter on a given context
+ * @ctx: buffer to run the filter on
+ * @insn: filter to apply
*
- * Decode and apply filter instructions to the skb->data.
- * Return length to keep, 0 for none. @skb is the data we are
- * filtering, @filter is the array of filter instructions.
- * Because all jumps are guaranteed to be before last instruction,
- * and last instruction guaranteed to be a RET, we dont need to check
- * flen. (We used to pass to this function the length of filter)
+ * Decode and apply filter instructions to the skb->data. Return length to
+ * keep, 0 for none. @ctx is the data we are operating on, @insn is the
+ * array of filter instructions.
*/
-unsigned int sk_run_filter(const struct sk_buff *skb,
- const struct sock_filter *fentry)
+static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ };
void *ptr;
- u32 A = 0; /* Accumulator */
- u32 X = 0; /* Index Register */
- u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
- u32 tmp;
- int k;
+ int off;
- /*
- * Process array of filter instructions.
- */
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
-#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
- switch (fentry->code) {
- case BPF_S_ALU_ADD_X:
- A += X;
- continue;
- case BPF_S_ALU_ADD_K:
- A += K;
- continue;
- case BPF_S_ALU_SUB_X:
- A -= X;
- continue;
- case BPF_S_ALU_SUB_K:
- A -= K;
- continue;
- case BPF_S_ALU_MUL_X:
- A *= X;
- continue;
- case BPF_S_ALU_MUL_K:
- A *= K;
- continue;
- case BPF_S_ALU_DIV_X:
- if (X == 0)
- return 0;
- A /= X;
- continue;
- case BPF_S_ALU_DIV_K:
- A = reciprocal_divide(A, K);
- continue;
- case BPF_S_ALU_AND_X:
- A &= X;
- continue;
- case BPF_S_ALU_AND_K:
- A &= K;
- continue;
- case BPF_S_ALU_OR_X:
- A |= X;
- continue;
- case BPF_S_ALU_OR_K:
- A |= K;
- continue;
- case BPF_S_ALU_LSH_X:
- A <<= X;
- continue;
- case BPF_S_ALU_LSH_K:
- A <<= K;
- continue;
- case BPF_S_ALU_RSH_X:
- A >>= X;
- continue;
- case BPF_S_ALU_RSH_K:
- A >>= K;
- continue;
- case BPF_S_ALU_NEG:
- A = -A;
- continue;
- case BPF_S_JMP_JA:
- fentry += K;
- continue;
- case BPF_S_JMP_JGT_K:
- fentry += (A > K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_K:
- fentry += (A >= K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_K:
- fentry += (A == K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_K:
- fentry += (A & K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGT_X:
- fentry += (A > X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_X:
- fentry += (A >= X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_X:
- fentry += (A == X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_X:
- fentry += (A & X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_LD_W_ABS:
- k = K;
-load_w:
- ptr = load_pointer(skb, k, 4, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be32(ptr);
- continue;
- }
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = DST;
+ DST = do_div(tmp, SRC);
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ tmp = DST;
+ DST = do_div(tmp, IMM);
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ do_div(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ do_div(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
break;
- case BPF_S_LD_H_ABS:
- k = K;
-load_h:
- ptr = load_pointer(skb, k, 2, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be16(ptr);
- continue;
- }
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
break;
- case BPF_S_LD_B_ABS:
- k = K;
-load_b:
- ptr = load_pointer(skb, k, 1, &tmp);
- if (ptr != NULL) {
- A = *(u8 *)ptr;
- continue;
- }
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
break;
- case BPF_S_LD_W_LEN:
- A = skb->len;
- continue;
- case BPF_S_LDX_W_LEN:
- X = skb->len;
- continue;
- case BPF_S_LD_W_IND:
- k = X + K;
- goto load_w;
- case BPF_S_LD_H_IND:
- k = X + K;
- goto load_h;
- case BPF_S_LD_B_IND:
- k = X + K;
- goto load_b;
- case BPF_S_LDX_B_MSH:
- ptr = load_pointer(skb, K, 1, &tmp);
- if (ptr != NULL) {
- X = (*(u8 *)ptr & 0xf) << 2;
- continue;
- }
- return 0;
- case BPF_S_LD_IMM:
- A = K;
- continue;
- case BPF_S_LDX_IMM:
- X = K;
- continue;
- case BPF_S_LD_MEM:
- A = mem[K];
- continue;
- case BPF_S_LDX_MEM:
- X = mem[K];
- continue;
- case BPF_S_MISC_TAX:
- X = A;
- continue;
- case BPF_S_MISC_TXA:
- A = X;
- continue;
- case BPF_S_RET_K:
- return K;
- case BPF_S_RET_A:
- return A;
- case BPF_S_ST:
- mem[K] = A;
- continue;
- case BPF_S_STX:
- mem[K] = X;
- continue;
- default:
- WARN_ON(1);
- return 0;
}
+ CONT;
- /*
- * Handle ancillary data, which are impossible
- * (or very difficult) to get parsing packet contents.
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
*/
- switch (k-SKF_AD_OFF) {
- case SKF_AD_PROTOCOL:
- A = ntohs(skb->protocol);
- continue;
- case SKF_AD_PKTTYPE:
- A = skb->pkt_type;
- continue;
- case SKF_AD_IFINDEX:
- if (!skb->dev)
- return 0;
- A = skb->dev->ifindex;
- continue;
- case SKF_AD_MARK:
- A = skb->mark;
- continue;
- case SKF_AD_QUEUE:
- A = skb->queue_mapping;
- continue;
- case SKF_AD_HATYPE:
- if (!skb->dev)
- return 0;
- A = skb->dev->type;
- continue;
- case SKF_AD_RXHASH:
- A = skb->rxhash;
- continue;
- case SKF_AD_CPU:
- A = raw_smp_processor_id();
- continue;
- case SKF_AD_NLATTR: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = nla_find((struct nlattr *)&skb->data[A],
- skb->len - A, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX*/
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+			/* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns only
+			 * appear in programs where ctx == skb. All programs
+			 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6;
+			 * sk_convert_filter() saves it in BPF_R6, and the
+			 * internal BPF verifier will check that BPF_R6 == ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
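
The label-plus-CONT pattern above is a jump-table (computed goto) dispatch: each handler does its work and then jumps straight to the next instruction's handler instead of falling back to a per-instruction switch. Below is a minimal, standalone userspace sketch of that technique in GNU C; the opcodes and the toy program are made up for the example and are not part of the kernel ABI.

/* Minimal jump-table interpreter sketch (GNU C computed goto). */
#include <stdio.h>

enum { OP_ADD, OP_PRINT, OP_EXIT };

struct insn {
	int code;
	long imm;
};

static long run(const struct insn *insn)
{
	const void *jumptable[] = {
		[OP_ADD]   = &&do_add,
		[OP_PRINT] = &&do_print,
		[OP_EXIT]  = &&do_exit,
	};
	long acc = 0;

/* CONT: advance to the next insn and jump directly to its handler. */
#define CONT do { insn++; goto *jumptable[insn->code]; } while (0)
	goto *jumptable[insn->code];

do_add:
	acc += insn->imm;
	CONT;
do_print:
	printf("acc = %ld\n", acc);
	CONT;
do_exit:
	return acc;
#undef CONT
}

int main(void)
{
	const struct insn prog[] = {
		{ OP_ADD, 40 }, { OP_ADD, 2 }, { OP_PRINT, 0 }, { OP_EXIT, 0 },
	};

	return run(prog) == 42 ? 0 : 1;
}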
+
+/* Helper to find the offset of pkt_type in the sk_buff structure. We want
+ * to make sure it's still a 3-bit field starting at a byte boundary;
+ * taken from arch/x86/net/bpf_jit_comp.c.
+ */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_TYPE_MAX (7 << 5)
+#else
+#define PKT_TYPE_MAX 7
+#endif
+static unsigned int pkt_type_offset(void)
+{
+ struct sk_buff skb_probe = { .pkt_type = ~0, };
+ u8 *ct = (u8 *) &skb_probe;
+ unsigned int off;
+
+ for (off = 0; off < sizeof(struct sk_buff); off++) {
+ if (ct[off] == PKT_TYPE_MAX)
+ return off;
+ }
+
+ pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
+ return -1;
+}
+
+static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+}
+
+static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (nla->nla_len > skb->len - a)
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+/* note that this only generates 32-bit random numbers */
+static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct sock_filter_int **insnp)
+{
+ struct sock_filter_int *insn = *insnp;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
+ pkt_type_offset());
+ if (insn->off < 0)
+ return false;
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);
+#endif
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
+
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_tci));
+ if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* A >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
+ /* A &= 1 */
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
}
- case SKF_AD_NLATTR_NEST: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = (struct nlattr *)&skb->data[A];
- if (nla->nla_len > A - skb->len)
- return 0;
-
- nla = nla_find_nested(nla, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ break;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(__get_random_u32);
+ break;
}
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ *insnp = insn;
+ return true;
+}
+
+/**
+ * sk_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: buffer where converted program will be stored
+ * @new_len: pointer to store length of converted program
+ *
+ * Remap 'sock_filter' style BPF instruction set to 'sock_filter_int' style.
+ * Conversion workflow:
+ *
+ * 1) First pass for calculating the new program length:
+ * sk_convert_filter(old_prog, old_len, NULL, &new_len)
+ *
+ * 2) 2nd call to do the remapping (internally this may take
+ *    more than one pass until jump offsets settle):
+ * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
+ * sk_convert_filter(old_prog, old_len, new_prog, &new_len);
+ *
+ * User BPF's register A is mapped to our BPF register 6, user BPF
+ * register X is mapped to BPF register 7, and the frame pointer is
+ * always register 10. The context 'void *ctx' is stored in register 1;
+ * that is, for socket filters ctx == 'struct sk_buff *' and for
+ * seccomp ctx == 'struct seccomp_data *'.
+ */
+int sk_convert_filter(struct sock_filter *prog, int len,
+ struct sock_filter_int *new_prog, int *new_len)
+{
+ int new_flen = 0, pass = 0, target, i;
+ struct sock_filter_int *new_insn;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
+
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = new_prog;
+ fp = prog;
+
+ if (new_insn)
+ *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ new_insn++;
+
+ for (i = 0; i < len; fp++, i++) {
+ struct sock_filter_int tmp_insns[6] = { };
+ struct sock_filter_int *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - new_prog;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
+ break;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ if (target >= len || target < 0) \
+ goto err; \
+ insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ insn->off -= insn - tmp_insns; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+				/* BPF immediates are signed; zero-extend the
+				 * immediate into a tmp register and use it
+				 * in the compare insn.
+				 */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_X;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
+ }
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Convert JEQ into JNE when 'jump_true' is next insn. */
+ if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B:
+ /* tmp = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+
+		/* RET_K, RET_A are remapped into 2 insns. */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
+ BPF_K : BPF_X, BPF_REG_0,
+ BPF_REG_A, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
+ case BPF_ST:
+ case BPF_STX:
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+		/* Unknown instruction. */
default:
- return 0;
+ goto err;
}
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
+ }
+
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - new_prog;
+ return 0;
+ }
+
+ pass++;
+ if (new_flen != new_insn - new_prog) {
+ new_flen = new_insn - new_prog;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
}
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
}
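
A hedged sketch of the two-call workflow described in the sk_convert_filter() kernel-doc above: the first call only sizes the converted program, the second call emits it and fixes up jump offsets. The wrapper function, its name, and the <linux/slab.h> allocation are illustrative assumptions; __sk_migrate_filter() further down is the in-tree user of this pattern.

static struct sock_filter_int *convert_example(struct sock_filter *old_prog,
						int old_len)
{
	struct sock_filter_int *new_prog;
	int new_len, err;

	/* Pass 1: only compute the length of the converted program. */
	err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
	if (err)
		return NULL;

	new_prog = kmalloc(sizeof(*new_prog) * new_len, GFP_KERNEL);
	if (!new_prog)
		return NULL;

	/* Pass 2: emit the instructions and fix up jump offsets. */
	err = sk_convert_filter(old_prog, old_len, new_prog, &new_len);
	if (err) {
		kfree(new_prog);
		return NULL;
	}

	return new_prog;
}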
-EXPORT_SYMBOL(sk_run_filter);
-/*
- * Security :
+/* Security:
+ *
* A BPF program is able to use 16 cells of memory to store intermediate
- * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
+ * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
+ *
 * As we don't want to clear the mem[] array for each packet going through
 * sk_run_filter(), we check that a filter loaded by the user never tries to read
 * a cell that was not previously written, and we check all branches to be sure
- * a malicious user doesnt try to abuse us.
+ * a malicious user doesn't try to abuse us.
*/
static int check_load_and_stores(struct sock_filter *filter, int flen)
{
- u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
int pc, ret = 0;
BUILD_BUG_ON(BPF_MEMWORDS > 16);
- masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
if (!masks)
return -ENOMEM;
+
memset(masks, 0xff, flen * sizeof(*masks));
for (pc = 0; pc < flen; pc++) {
memvalid &= masks[pc];
switch (filter[pc].code) {
- case BPF_S_ST:
- case BPF_S_STX:
+ case BPF_ST:
+ case BPF_STX:
memvalid |= (1 << filter[pc].k);
break;
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
if (!(memvalid & (1 << filter[pc].k))) {
ret = -EINVAL;
goto error;
}
break;
- case BPF_S_JMP_JA:
- /* a jump must set masks on target */
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
masks[pc + 1 + filter[pc].k] &= memvalid;
memvalid = ~0;
break;
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_X:
- case BPF_S_JMP_JSET_K:
- /* a jump must set masks on targets */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
masks[pc + 1 + filter[pc].jt] &= memvalid;
masks[pc + 1 + filter[pc].jf] &= memvalid;
memvalid = ~0;
@@ -480,6 +1147,72 @@ error:
return ret;
}
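
As an illustration of the check above: a classic BPF program that loads a scratch cell before any ST/STX has written it never gets that cell's bit set in memvalid, so validation fails. The program below is a hypothetical example, assuming the BPF_STMT() initializer macro from <linux/filter.h>.

	struct sock_filter load_before_store[] = {
		BPF_STMT(BPF_LD  | BPF_MEM, 0),	/* A = mem[0], never written */
		BPF_STMT(BPF_RET | BPF_A,   0),	/* return A */
	};

	/* sk_chk_filter(load_before_store, 2) is expected to return -EINVAL. */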
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
+}
+
/**
* sk_chk_filter - verify socket filter code
* @filter: filter to verify
@@ -494,146 +1227,354 @@ error:
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int sk_chk_filter(struct sock_filter *filter, int flen)
+int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
{
- /*
- * Valid instructions are initialized to non-0.
- * Invalid instructions are initialized to 0.
- */
- static const u8 codes[] = {
- [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K,
- [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X,
- [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K,
- [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X,
- [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K,
- [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X,
- [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X,
- [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K,
- [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X,
- [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K,
- [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X,
- [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K,
- [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X,
- [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K,
- [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X,
- [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG,
- [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS,
- [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS,
- [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS,
- [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN,
- [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND,
- [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND,
- [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND,
- [BPF_LD|BPF_IMM] = BPF_S_LD_IMM,
- [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN,
- [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH,
- [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM,
- [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX,
- [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA,
- [BPF_RET|BPF_K] = BPF_S_RET_K,
- [BPF_RET|BPF_A] = BPF_S_RET_A,
- [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K,
- [BPF_LD|BPF_MEM] = BPF_S_LD_MEM,
- [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM,
- [BPF_ST] = BPF_S_ST,
- [BPF_STX] = BPF_S_STX,
- [BPF_JMP|BPF_JA] = BPF_S_JMP_JA,
- [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K,
- [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X,
- [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K,
- [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X,
- [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K,
- [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X,
- [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
- [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
- };
+ bool anc_found;
int pc;
if (flen == 0 || flen > BPF_MAXINSNS)
return -EINVAL;
- /* check the filter code now */
+ /* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
struct sock_filter *ftest = &filter[pc];
- u16 code = ftest->code;
- if (code >= ARRAY_SIZE(codes))
- return -EINVAL;
- code = codes[code];
- if (!code)
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
return -EINVAL;
+
/* Some instructions need special checks */
- switch (code) {
- case BPF_S_ALU_DIV_K:
- /* check for division by zero */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
if (ftest->k == 0)
return -EINVAL;
- ftest->k = reciprocal_value(ftest->k);
break;
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
- case BPF_S_ST:
- case BPF_S_STX:
- /* check for invalid memory addresses */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* Check for invalid memory addresses */
if (ftest->k >= BPF_MEMWORDS)
return -EINVAL;
break;
- case BPF_S_JMP_JA:
- /*
- * Note, the large ftest->k might cause loops.
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
* Compare this with conditional jumps below,
* where offsets are limited. --ANK (981016)
*/
- if (ftest->k >= (unsigned)(flen-pc-1))
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
return -EINVAL;
break;
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_X:
- case BPF_S_JMP_JSET_K:
- /* for conditionals both must be safe */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
if (pc + ftest->jt + 1 >= flen ||
pc + ftest->jf + 1 >= flen)
return -EINVAL;
break;
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
+ anc_found = false;
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
+ if (anc_found == false && ftest->k >= SKF_AD_OFF)
+ return -EINVAL;
}
- ftest->code = code;
}
- /* last instruction must be a RET code */
+ /* Last instruction must be a RET code */
switch (filter[flen - 1].code) {
- case BPF_S_RET_K:
- case BPF_S_RET_A:
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
return check_load_and_stores(filter, flen);
}
+
return -EINVAL;
}
EXPORT_SYMBOL(sk_chk_filter);
+static int sk_store_orig_filter(struct sk_filter *fp,
+ const struct sock_fprog *fprog)
+{
+ unsigned int fsize = sk_filter_proglen(fprog);
+ struct sock_fprog_kern *fkprog;
+
+ fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+ if (!fp->orig_prog)
+ return -ENOMEM;
+
+ fkprog = fp->orig_prog;
+ fkprog->len = fprog->len;
+ fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+ if (!fkprog->filter) {
+ kfree(fp->orig_prog);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void sk_release_orig_filter(struct sk_filter *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
/**
- * sk_filter_rcu_release - Release a socket filter by rcu_head
+ * sk_filter_release_rcu - Release a socket filter by rcu_head
* @rcu: rcu_head that contains the sk_filter to free
*/
-static void sk_filter_rcu_release(struct rcu_head *rcu)
+static void sk_filter_release_rcu(struct rcu_head *rcu)
{
struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+ sk_release_orig_filter(fp);
+ sk_filter_free(fp);
+}
+
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+ if (atomic_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
sk_filter_release(fp);
}
-static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
+void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ atomic_inc(&fp->refcnt);
+ atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+}
+
+static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
+ struct sock *sk,
+ unsigned int len)
+{
+ struct sk_filter *fp_new;
+
+ if (sk == NULL)
+ return krealloc(fp, len, GFP_KERNEL);
+
+ fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
+ if (fp_new) {
+ *fp_new = *fp;
+		/* As we're keeping orig_prog in fp_new, we need
+		 * to make sure it's not evicted via the old fp.
+		 */
+ fp->orig_prog = NULL;
+ sk_filter_uncharge(sk, fp);
+ }
+
+ return fp_new;
+}
+
+static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
+ struct sock *sk)
+{
+ struct sock_filter *old_prog;
+ struct sk_filter *old_fp;
+ int err, new_len, old_len = fp->len;
+
+	/* We are free to overwrite insns et al. right here, as they
+	 * are no longer used internally once the filter has been
+	 * migrated to the internal BPF instruction representation.
+	 */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct sock_filter_int));
+
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
+ GFP_KERNEL);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* 1st pass: calculate the new program length. */
+ err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
+ if (err)
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
+ err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ if (err)
+		/* The 2nd sk_convert_filter() can fail only if it fails
+		 * to allocate memory; the remapping itself must succeed.
+		 * Note that at this point old_fp has already been
+		 * released by __sk_migrate_realloc().
+		 */
+ goto out_err_free;
+
+ sk_filter_select_runtime(fp);
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ /* Rollback filter setup. */
+ if (sk != NULL)
+ sk_filter_uncharge(sk, fp);
+ else
+ kfree(fp);
+ return ERR_PTR(err);
+}
+
+void __weak bpf_int_jit_compile(struct sk_filter *prog)
+{
+}
+
+/**
+ * sk_filter_select_runtime - select execution runtime for BPF program
+ * @fp: sk_filter populated with internal BPF program
+ *
+ * Try to JIT the internal BPF program; if no JIT is available, fall back to
+ * the interpreter. The BPF program will then be executed via the
+ * SK_RUN_FILTER() macro.
+ */
+void sk_filter_select_runtime(struct sk_filter *fp)
+{
+ fp->bpf_func = (void *) __sk_run_filter;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+}
+EXPORT_SYMBOL_GPL(sk_filter_select_runtime);
+
+/* free internal BPF program */
+void sk_filter_free(struct sk_filter *fp)
{
- unsigned int size = sk_filter_len(fp);
+ bpf_jit_free(fp);
+}
+EXPORT_SYMBOL_GPL(sk_filter_free);
+
+static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
+ struct sock *sk)
+{
+ int err;
+
+ fp->bpf_func = NULL;
+ fp->jited = 0;
+
+ err = sk_chk_filter(fp->insns, fp->len);
+ if (err) {
+ if (sk != NULL)
+ sk_filter_uncharge(sk, fp);
+ else
+ kfree(fp);
+ return ERR_PTR(err);
+ }
+
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
+ bpf_jit_compile(fp);
- atomic_sub(size, &sk->sk_omem_alloc);
- call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
+ /* JIT compiler couldn't process this filter, so do the
+ * internal BPF translation for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = __sk_migrate_filter(fp, sk);
+
+ return fp;
}
/**
+ * sk_unattached_filter_create - create an unattached filter
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ *
+ * Create a filter independent of any socket. We first run some
+ * sanity checks on it to make sure it does not explode on us later.
+ * If an error occurs or there is insufficient memory for the filter,
+ * a negative errno code is returned. On success the return is zero.
+ */
+int sk_unattached_filter_create(struct sk_filter **pfp,
+ struct sock_fprog_kern *fprog)
+{
+ unsigned int fsize = sk_filter_proglen(fprog);
+ struct sk_filter *fp;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ memcpy(fp->insns, fprog->filter, fsize);
+
+ atomic_set(&fp->refcnt, 1);
+ fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+	 * a copy here, and can spare ourselves the work.
+ */
+ fp->orig_prog = NULL;
+
+	/* __sk_prepare_filter() already takes care of freeing
+	 * the filter memory in case something goes wrong.
+	 */
+ fp = __sk_prepare_filter(fp, NULL);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
+
+void sk_unattached_filter_destroy(struct sk_filter *fp)
+{
+ sk_filter_release(fp);
+}
+EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
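
A minimal, hedged usage sketch of the unattached-filter API added above, with a trivial accept-everything classic BPF program. The function and variable names are illustrative, the sketch assumes <linux/filter.h>, and, per the sk_filter_select_runtime() comment, the resulting filter would be run via SK_RUN_FILTER().

static struct sock_filter accept_all[] = {
	BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* accept the whole packet */
};

static struct sock_fprog_kern accept_prog = {
	.len	= ARRAY_SIZE(accept_all),
	.filter	= accept_all,
};

static int example_filter_roundtrip(void)
{
	struct sk_filter *fp;
	int err;

	err = sk_unattached_filter_create(&fp, &accept_prog);
	if (err)
		return err;

	/* ... run packets through it with SK_RUN_FILTER(fp, skb) ... */

	sk_unattached_filter_destroy(fp);
	return 0;
}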
+
+/**
* sk_attach_filter - attach a socket filter
* @fprog: the filter program
* @sk: the socket to use
@@ -646,36 +1587,49 @@ static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct sk_filter *fp, *old_fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+ unsigned int fsize = sk_filter_proglen(fprog);
+ unsigned int sk_fsize = sk_filter_size(fprog->len);
int err;
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
return -EINVAL;
- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+ fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
if (!fp)
return -ENOMEM;
+
if (copy_from_user(fp->insns, fprog->filter, fsize)) {
- sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+ sock_kfree_s(sk, fp, sk_fsize);
return -EFAULT;
}
atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
- err = sk_chk_filter(fp->insns, fp->len);
+ err = sk_store_orig_filter(fp, fprog);
if (err) {
sk_filter_uncharge(sk, fp);
- return err;
+ return -ENOMEM;
}
+ /* __sk_prepare_filter() already takes care of uncharging
+ * memory in case something goes wrong.
+ */
+ fp = __sk_prepare_filter(fp, sk);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
old_fp = rcu_dereference_protected(sk->sk_filter,
sock_owned_by_user(sk));
rcu_assign_pointer(sk->sk_filter, fp);
if (old_fp)
- sk_filter_delayed_uncharge(sk, old_fp);
+ sk_filter_uncharge(sk, old_fp);
+
return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
@@ -685,13 +1639,57 @@ int sk_detach_filter(struct sock *sk)
int ret = -ENOENT;
struct sk_filter *filter;
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
filter = rcu_dereference_protected(sk->sk_filter,
sock_owned_by_user(sk));
if (filter) {
- rcu_assign_pointer(sk->sk_filter, NULL);
- sk_filter_delayed_uncharge(sk, filter);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ sk_filter_uncharge(sk, filter);
ret = 0;
}
+
return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
+
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ int ret = 0;
+
+ lock_sock(sk);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ if (!filter)
+ goto out;
+
+	/* We're copying the filter that was originally attached,
+	 * so no conversion/decode is needed anymore.
+	 */
+ fprog = filter->orig_prog;
+
+ ret = fprog->len;
+ if (!len)
+		/* User space is only asking for the number of filter blocks. */
+ goto out;
+
+ ret = -EINVAL;
+ if (len < fprog->len)
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
+ goto out;
+
+	/* Instead of bytes, the API expects the number of filter
+	 * blocks to be returned.
+	 */
+ ret = fprog->len;
+out:
+ release_sock(sk);
+ return ret;
+}
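
A hedged userspace sketch of the two-step retrieval pattern sk_get_filter() supports: a zero-length query reports the number of filter blocks, and a second call copies the originally attached program. SO_GET_FILTER and the length-counted-in-blocks convention at the getsockopt() level are assumptions about the socket-layer plumbing, which is not part of this diff.

#include <sys/socket.h>
#include <linux/filter.h>
#include <stdlib.h>

static struct sock_filter *fetch_filter(int fd, socklen_t *nblocks)
{
	socklen_t len = 0;
	struct sock_filter *blocks;

	/* Zero length: the kernel reports only the number of blocks. */
	if (getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &len) < 0)
		return NULL;

	blocks = calloc(len, sizeof(*blocks));
	if (!blocks)
		return NULL;

	/* Second call: len must cover the whole program, counted in blocks. */
	if (getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, blocks, &len) < 0) {
		free(blocks);
		return NULL;
	}

	*nblocks = len;
	return blocks;
}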
diff --git a/net/core/flow.c b/net/core/flow.c
index 127c8a7ffd6..a0348fde1fd 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -22,14 +22,16 @@
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
struct flow_cache_entry {
union {
struct hlist_node hlist;
struct list_head gc_list;
} u;
+ struct net *net;
u16 family;
u8 dir;
u32 genid;
@@ -37,37 +39,14 @@ struct flow_cache_entry {
struct flow_cache_object *object;
};
-struct flow_cache_percpu {
- struct hlist_head *hash_table;
- int hash_count;
- u32 hash_rnd;
- int hash_rnd_recalc;
- struct tasklet_struct flush_tasklet;
-};
-
struct flow_flush_info {
struct flow_cache *cache;
atomic_t cpuleft;
struct completion completion;
};
-struct flow_cache {
- u32 hash_shift;
- struct flow_cache_percpu __percpu *percpu;
- struct notifier_block hotcpu_notifier;
- int low_watermark;
- int high_watermark;
- struct timer_list rnd_timer;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-EXPORT_SYMBOL(flow_cache_genid);
-static struct flow_cache flow_cache_global;
static struct kmem_cache *flow_cachep __read_mostly;
-static DEFINE_SPINLOCK(flow_cache_gc_lock);
-static LIST_HEAD(flow_cache_gc_list);
-
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
@@ -83,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg)
add_timer(&fc->rnd_timer);
}
-static int flow_entry_valid(struct flow_cache_entry *fle)
+static int flow_entry_valid(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
- if (atomic_read(&flow_cache_genid) != fle->genid)
+ if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
return 0;
if (fle->object && !fle->object->ops->check(fle->object))
return 0;
return 1;
}
-static void flow_entry_kill(struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
if (fle->object)
fle->object->ops->delete(fle->object);
@@ -103,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work)
{
struct list_head gc_list;
struct flow_cache_entry *fce, *n;
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
INIT_LIST_HEAD(&gc_list);
- spin_lock_bh(&flow_cache_gc_lock);
- list_splice_tail_init(&flow_cache_gc_list, &gc_list);
- spin_unlock_bh(&flow_cache_gc_lock);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
- flow_entry_kill(fce);
+ flow_entry_kill(fce, xfrm);
}
-static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
- int deleted, struct list_head *gc_list)
+ int deleted, struct list_head *gc_list,
+ struct netns_xfrm *xfrm)
{
if (deleted) {
fcp->hash_count -= deleted;
- spin_lock_bh(&flow_cache_gc_lock);
- list_splice_tail(gc_list, &flow_cache_gc_list);
- spin_unlock_bh(&flow_cache_gc_lock);
- schedule_work(&flow_cache_gc_work);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+ schedule_work(&xfrm->flow_cache_gc_work);
}
}
@@ -131,17 +114,19 @@ static void __flow_cache_shrink(struct flow_cache *fc,
int shrink_to)
{
struct flow_cache_entry *fle;
- struct hlist_node *entry, *tmp;
+ struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
int saved = 0;
- hlist_for_each_entry_safe(fle, entry, tmp,
+ hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (saved < shrink_to &&
- flow_entry_valid(fle)) {
+ flow_entry_valid(fle, xfrm)) {
saved++;
} else {
deleted++;
@@ -151,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
}
}
- flow_cache_queue_garbage(fcp, deleted, &gc_list);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}
static void flow_cache_shrink(struct flow_cache *fc,
@@ -172,31 +157,28 @@ static void flow_new_hash_rnd(struct flow_cache *fc,
static u32 flow_hash_code(struct flow_cache *fc,
struct flow_cache_percpu *fcp,
- struct flowi *key)
+ const struct flowi *key,
+ size_t keysize)
{
- u32 *k = (u32 *) key;
+ const u32 *k = (const u32 *) key;
+ const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
- return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+ return jhash2(k, length, fcp->hash_rnd)
& (flow_cache_hash_size(fc) - 1);
}
-typedef unsigned long flow_compare_t;
-
/* I hear what you're saying, use memcmp. But memcmp cannot make
- * important assumptions that we can here, such as alignment and
- * constant size.
+ * important assumptions that we can here, such as alignment.
*/
-static int flow_key_compare(struct flowi *key1, struct flowi *key2)
+static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
+ size_t keysize)
{
- flow_compare_t *k1, *k1_lim, *k2;
- const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
+ const flow_compare_t *k1, *k1_lim, *k2;
- BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t));
+ k1 = (const flow_compare_t *) key1;
+ k1_lim = k1 + keysize;
- k1 = (flow_compare_t *) key1;
- k1_lim = k1 + n_elem;
-
- k2 = (flow_compare_t *) key2;
+ k2 = (const flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
@@ -207,14 +189,14 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
}
struct flow_cache_object *
-flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver, void *ctx)
{
- struct flow_cache *fc = &flow_cache_global;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle, *tfle;
- struct hlist_node *entry;
struct flow_cache_object *flo;
+ size_t keysize;
unsigned int hash;
local_bh_disable();
@@ -222,6 +204,11 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
fle = NULL;
flo = NULL;
+
+ keysize = flow_key_size(family);
+ if (!keysize)
+ goto nocache;
+
/* Packet really early in init? Making flow_cache_init a
* pre-smp initcall would solve this. --RR */
if (!fcp->hash_table)
@@ -230,11 +217,12 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
if (fcp->hash_rnd_recalc)
flow_new_hash_rnd(fc, fcp);
- hash = flow_hash_code(fc, fcp, key);
- hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
- if (tfle->family == family &&
+ hash = flow_hash_code(fc, fcp, key, keysize);
+ hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
+ if (tfle->net == net &&
+ tfle->family == family &&
tfle->dir == dir &&
- flow_key_compare(key, &tfle->key) == 0) {
+ flow_key_compare(key, &tfle->key, keysize) == 0) {
fle = tfle;
break;
}
@@ -246,14 +234,15 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
if (fle) {
+ fle->net = net;
fle->family = family;
fle->dir = dir;
- memcpy(&fle->key, key, sizeof(*key));
+ memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
fle->object = NULL;
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
fcp->hash_count++;
}
- } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+ } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
flo = fle->object;
if (!flo)
goto ret_object;
@@ -274,13 +263,13 @@ nocache:
}
flo = resolver(net, key, family, dir, flo, ctx);
if (fle) {
- fle->genid = atomic_read(&flow_cache_genid);
+ fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
if (!IS_ERR(flo))
fle->object = flo;
else
fle->genid--;
} else {
- if (flo && !IS_ERR(flo))
+ if (!IS_ERR_OR_NULL(flo))
flo->ops->delete(flo);
}
ret_object:
@@ -295,15 +284,17 @@ static void flow_cache_flush_tasklet(unsigned long data)
struct flow_cache *fc = info->cache;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle;
- struct hlist_node *entry, *tmp;
+ struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
fcp = this_cpu_ptr(fc->percpu);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
- hlist_for_each_entry_safe(fle, entry, tmp,
+ hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
- if (flow_entry_valid(fle))
+ if (flow_entry_valid(fle, xfrm))
continue;
deleted++;
@@ -312,47 +303,94 @@ static void flow_cache_flush_tasklet(unsigned long data)
}
}
- flow_cache_queue_garbage(fcp, deleted, &gc_list);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
}
+/*
+ * Return whether a cpu's flow cache is empty, i.e. whether it can skip
+ * flushing.  Conservatively, the presence of any entry means the cpu may
+ * require flushing, since the flow_cache_ops.check() function may assume
+ * it's running on the same cpu as the per-cpu cache component.
+ */
+static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
+{
+ struct flow_cache_percpu *fcp;
+ int i;
+
+ fcp = per_cpu_ptr(fc->percpu, cpu);
+ for (i = 0; i < flow_cache_hash_size(fc); i++)
+ if (!hlist_empty(&fcp->hash_table[i]))
+ return 0;
+ return 1;
+}
+
static void flow_cache_flush_per_cpu(void *data)
{
struct flow_flush_info *info = data;
- int cpu;
struct tasklet_struct *tasklet;
- cpu = smp_processor_id();
- tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
+ tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
tasklet->data = (unsigned long)info;
tasklet_schedule(tasklet);
}
-void flow_cache_flush(void)
+void flow_cache_flush(struct net *net)
{
struct flow_flush_info info;
- static DEFINE_MUTEX(flow_flush_sem);
+ cpumask_var_t mask;
+ int i, self;
+
+ /* Track which cpus need flushing to avoid disturbing all cores. */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return;
+ cpumask_clear(mask);
/* Don't want cpus going down or up during this. */
get_online_cpus();
- mutex_lock(&flow_flush_sem);
- info.cache = &flow_cache_global;
- atomic_set(&info.cpuleft, num_online_cpus());
+ mutex_lock(&net->xfrm.flow_flush_sem);
+ info.cache = &net->xfrm.flow_cache_global;
+ for_each_online_cpu(i)
+ if (!flow_cache_percpu_empty(info.cache, i))
+ cpumask_set_cpu(i, mask);
+ atomic_set(&info.cpuleft, cpumask_weight(mask));
+ if (atomic_read(&info.cpuleft) == 0)
+ goto done;
+
init_completion(&info.completion);
local_bh_disable();
- smp_call_function(flow_cache_flush_per_cpu, &info, 0);
- flow_cache_flush_tasklet((unsigned long)&info);
+ self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
+ on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
+ if (self)
+ flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
- mutex_unlock(&flow_flush_sem);
+
+done:
+ mutex_unlock(&net->xfrm.flow_flush_sem);
put_online_cpus();
+ free_cpumask_var(mask);
}
-static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
+static void flow_cache_flush_task(struct work_struct *work)
+{
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
+ struct net *net = container_of(xfrm, struct net, xfrm);
+
+ flow_cache_flush(net);
+}
+
+void flow_cache_flush_deferred(struct net *net)
+{
+ schedule_work(&net->xfrm.flow_cache_flush_work);
+}
+
+static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
@@ -370,11 +408,12 @@ static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
return 0;
}
-static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
+static int flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+ struct flow_cache *fc = container_of(nfb, struct flow_cache,
+ hotcpu_notifier);
int res, cpu = (unsigned long) hcpu;
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
@@ -393,9 +432,20 @@ static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static int __init flow_cache_init(struct flow_cache *fc)
+int flow_cache_init(struct net *net)
{
int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+ if (!flow_cachep)
+ flow_cachep = kmem_cache_create("flow_cache",
+ sizeof(struct flow_cache_entry),
+ 0, SLAB_PANIC, NULL);
+ spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+ INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+ INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+ INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+ mutex_init(&net->xfrm.flow_flush_sem);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
@@ -405,14 +455,18 @@ static int __init flow_cache_init(struct flow_cache *fc)
if (!fc->percpu)
return -ENOMEM;
+ cpu_notifier_register_begin();
+
for_each_online_cpu(i) {
if (flow_cache_cpu_prepare(fc, i))
- return -ENOMEM;
+ goto err;
}
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
};
- register_hotcpu_notifier(&fc->hotcpu_notifier);
+ __register_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpu_notifier_register_done();
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -420,15 +474,38 @@ static int __init flow_cache_init(struct flow_cache *fc)
add_timer(&fc->rnd_timer);
return 0;
+
+err:
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ cpu_notifier_register_done();
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+
+ return -ENOMEM;
}
+EXPORT_SYMBOL(flow_cache_init);
-static int __init flow_cache_init_global(void)
+void flow_cache_fini(struct net *net)
{
- flow_cachep = kmem_cache_create("flow_cache",
- sizeof(struct flow_cache_entry),
- 0, SLAB_PANIC, NULL);
+ int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
- return flow_cache_init(&flow_cache_global);
-}
+ del_timer_sync(&fc->rnd_timer);
+ unregister_hotcpu_notifier(&fc->hotcpu_notifier);
-module_init(flow_cache_init_global);
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+}
+EXPORT_SYMBOL(flow_cache_fini);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 00000000000..107ed12a532
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,405 @@
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/igmp.h>
+#include <linux/icmp.h>
+#include <linux/sctp.h>
+#include <linux/dccp.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <net/flow_keys.h>
+
+/* Copy saddr & daddr, possibly using a 64-bit load/store.
+ * Equivalent to:	flow->src = iph->saddr;
+ * flow->dst = iph->daddr;
+ */
+static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+{
+ BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
+ offsetof(typeof(*flow), src) + sizeof(flow->src));
+ memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+}
+
+/**
+ * skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: buffer to extract the ports from
+ * @thoff: transport header offset
+ * @ip_proto: protocol for which to get port offset
+ *
+ * The function will try to retrieve the ports at offset thoff + poff, where
+ * poff is the protocol port offset returned by proto_ports_offset().
+ */
+__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
+{
+ int poff = proto_ports_offset(ip_proto);
+
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ ports = skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), &_ports);
+ if (ports)
+ return *ports;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_flow_get_ports);
+
+bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
+{
+ int nhoff = skb_network_offset(skb);
+ u8 ip_proto;
+ __be16 proto = skb->protocol;
+
+ memset(flow, 0, sizeof(*flow));
+
+again:
+ switch (proto) {
+ case htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+ip:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph || iph->ihl < 5)
+ return false;
+ nhoff += iph->ihl * 4;
+
+ ip_proto = iph->protocol;
+ if (ip_is_fragment(iph))
+ ip_proto = 0;
+
+ iph_to_flow_copy_addrs(flow, iph);
+ break;
+ }
+ case htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+ipv6:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ ip_proto = iph->nexthdr;
+ flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+ flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+ nhoff += sizeof(struct ipv6hdr);
+ break;
+ }
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan;
+ struct vlan_hdr _vlan;
+
+ vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
+ if (!vlan)
+ return false;
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ goto again;
+ }
+ case htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ proto = hdr->proto;
+ nhoff += PPPOE_SES_HLEN;
+ switch (proto) {
+ case htons(PPP_IP):
+ goto ip;
+ case htons(PPP_IPV6):
+ goto ipv6;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ } *hdr, _hdr;
+
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ /*
+ * Only look inside GRE if version zero and no
+ * routing
+ */
+ if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
+ nhoff += 4;
+ if (hdr->flags & GRE_KEY)
+ nhoff += 4;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = skb_header_pointer(skb, nhoff,
+ sizeof(_eth), &_eth);
+ if (!eth)
+ return false;
+ proto = eth->h_proto;
+ nhoff += sizeof(*eth);
+ }
+ goto again;
+ }
+ break;
+ }
+ case IPPROTO_IPIP:
+ proto = htons(ETH_P_IP);
+ goto ip;
+ case IPPROTO_IPV6:
+ proto = htons(ETH_P_IPV6);
+ goto ipv6;
+ default:
+ break;
+ }
+
+ flow->ip_proto = ip_proto;
+ flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
+ flow->thoff = (u16) nhoff;
+
+ return true;
+}
+EXPORT_SYMBOL(skb_flow_dissect);
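
A short, hedged sketch of how a caller is expected to use the dissector above: fill a flow_keys on the stack and, on success, read out the extracted addresses, ports and transport header offset. The surrounding function and the skb variable are assumed context.

	struct flow_keys keys;

	if (!skb_flow_dissect(skb, &keys))
		return;		/* headers could not be parsed */

	/* keys.src/keys.dst:   IPv4 addresses or folded IPv6 address hashes,
	 * keys.ports:          both transport ports (if any), as read from
	 *                      the start of the transport header,
	 * keys.ip_proto:       transport protocol,
	 * keys.thoff:          offset of the transport header.
	 */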
+
+static u32 hashrnd __read_mostly;
+static __always_inline void __flow_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
+{
+ __flow_hash_secret_init();
+ return jhash_3words(a, b, c, hashrnd);
+}
+
+static __always_inline u32 __flow_hash_1word(u32 a)
+{
+ __flow_hash_secret_init();
+ return jhash_1word(a, hashrnd);
+}
+
+/*
+ * __skb_get_hash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers.  Sets hash in skb to a non-zero value on
+ * success; zero indicates no valid hash.  Also sets l4_hash in skb if
+ * the hash is a canonical 4-tuple hash over transport ports.
+ */
+void __skb_get_hash(struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 hash;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return;
+
+ if (keys.ports)
+ skb->l4_hash = 1;
+
+ /* get a consistent hash (same value on both flow directions) */
+ if (((__force u32)keys.dst < (__force u32)keys.src) ||
+ (((__force u32)keys.dst == (__force u32)keys.src) &&
+ ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
+ swap(keys.dst, keys.src);
+ swap(keys.port16[0], keys.port16[1]);
+ }
+
+ hash = __flow_hash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports);
+ if (!hash)
+ hash = 1;
+
+ skb->hash = hash;
+}
+EXPORT_SYMBOL(__skb_get_hash);
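The swap above is what makes the hash direction-independent: both directions of a flow are reduced to the same (dst, src, ports) ordering before mixing. A minimal userspace sketch of that canonical ordering (not part of the patch), with mix3() as a hypothetical stand-in for the kernel's jhash_3words():

#include <stdint.h>

/* Hypothetical mixer for illustration only; NOT the kernel's jhash_3words(). */
uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	a = (a ^ seed) * 0x9e3779b1u;
	b = (b ^ a) * 0x85ebca77u;
	c = (c ^ b) * 0xc2b2ae3du;
	return a ^ b ^ c;
}

/* Order the tuple so A->B and B->A hash identically, mirroring the swap in
 * __skb_get_hash() above; 0 is reserved for "no valid hash", so remap to 1.
 */
uint32_t flow_hash(uint32_t src, uint32_t dst,
		   uint16_t sport, uint16_t dport, uint32_t seed)
{
	uint32_t h;

	if (dst < src || (dst == src && dport < sport)) {
		uint32_t t32 = src; src = dst; dst = t32;
		uint16_t t16 = sport; sport = dport; dport = t16;
	}
	h = mix3(dst, src, ((uint32_t)dport << 16) | sport, seed);
	return h ? h : 1;
}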
+
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number of
+ * Tx queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+ unsigned int num_tx_queues)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
+ return hash;
+ }
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16) skb->protocol;
+ hash = __flow_hash_1word(hash);
+
+ return (u16) (((u64) hash * qcount) >> 32) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
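The final line above is the standard multiply-shift scaling: ((u64)hash * qcount) >> 32 maps a 32-bit hash uniformly onto [qoffset, qoffset + qcount) without a modulo. A standalone sketch, assuming 32-bit hashes (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Scale a 32-bit hash onto qcount queues starting at qoffset, as
 * __skb_tx_hash() does above.
 */
uint16_t pick_queue(uint32_t hash, uint16_t qoffset, uint16_t qcount)
{
	return (uint16_t)(((uint64_t)hash * qcount) >> 32) + qoffset;
}

int main(void)
{
	/* eight queues at offset 0: evenly spaced hashes land on queues 0..7 */
	for (int i = 0; i < 8; i++) {
		uint32_t h = (uint32_t)i * 0x20000000u;
		printf("hash 0x%08x -> queue %u\n", (unsigned)h,
		       (unsigned)pick_queue(h, 0, 8));
	}
	return 0;
}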
+
+/* __skb_get_poff() returns the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push the actual payload to user
+ * space and can analyze headers only instead.
+ */
+u32 __skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 poff = 0;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return 0;
+
+ poff += keys.thoff;
+ switch (keys.ip_proto) {
+ case IPPROTO_TCP: {
+ const struct tcphdr *tcph;
+ struct tcphdr _tcph;
+
+ tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return poff;
+
+ poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4);
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ poff += sizeof(struct udphdr);
+ break;
+ /* For the rest, we do not really care about header
+ * extensions at this point for now.
+ */
+ case IPPROTO_ICMP:
+ poff += sizeof(struct icmphdr);
+ break;
+ case IPPROTO_ICMPV6:
+ poff += sizeof(struct icmp6hdr);
+ break;
+ case IPPROTO_IGMP:
+ poff += sizeof(struct igmphdr);
+ break;
+ case IPPROTO_DCCP:
+ poff += sizeof(struct dccp_hdr);
+ break;
+ case IPPROTO_SCTP:
+ poff += sizeof(struct sctphdr);
+ break;
+ }
+
+ return poff;
+}
+
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[raw_smp_processor_id()]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else {
+ u32 hash;
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16) skb->protocol ^
+ skb->hash;
+ hash = __flow_hash_1word(hash);
+ queue_index = map->queues[
+ ((u64)hash * map->len) >> 32];
+ }
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, skb);
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, skb);
+
+ if (queue_index != new_index && sk &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv)
+{
+ int queue_index = 0;
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb);
+
+ if (!accel_priv)
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7c2373321b7..6b5b6e7013c 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -14,7 +14,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -83,7 +82,7 @@ struct gen_estimator
{
struct list_head list;
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est *rate_est;
+ struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
int ewma_log;
u64 last_bytes;
@@ -168,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est)
static
struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est *rate_est)
+ const struct gnet_stats_rate_est64 *rate_est)
{
struct rb_node *p = est_root.rb_node;
@@ -204,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est *rate_est,
+ struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock,
struct nlattr *opt)
{
@@ -249,13 +248,6 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
}
EXPORT_SYMBOL(gen_new_estimator);
-static void __gen_kill_estimator(struct rcu_head *head)
-{
- struct gen_estimator *e = container_of(head,
- struct gen_estimator, e_rcu);
- kfree(e);
-}
-
/**
* gen_kill_estimator - remove a rate estimator
* @bstats: basic statistics
@@ -266,7 +258,7 @@ static void __gen_kill_estimator(struct rcu_head *head)
* Note : Caller should respect an RCU grace period before freeing stats_lock
*/
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est *rate_est)
+ struct gnet_stats_rate_est64 *rate_est)
{
struct gen_estimator *e;
@@ -279,7 +271,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
write_unlock(&est_lock);
list_del_rcu(&e->list);
- call_rcu(&e->e_rcu, __gen_kill_estimator);
+ kfree_rcu(e, e_rcu);
}
spin_unlock_bh(&est_tree_lock);
}
@@ -298,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
* Returns 0 on success or a negative error code.
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est *rate_est,
+ struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt)
{
gen_kill_estimator(bstats, rate_est);
@@ -314,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator);
* Returns true if estimator is active, and false if not.
*/
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est *rate_est)
+ const struct gnet_stats_rate_est64 *rate_est)
{
bool res;
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 0452eb27a27..9d3d9e78397 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -27,7 +27,8 @@
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
{
- NLA_PUT(d->skb, type, size, buf);
+ if (nla_put(d->skb, type, size, buf))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -142,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est *r)
+ struct gnet_stats_rate_est64 *r)
{
+ struct gnet_stats_rate_est est;
+ int res;
+
if (b && !gen_estimator_active(b, r))
return 0;
+ est.bps = min_t(u64, UINT_MAX, r->bps);
+ /* we have some time before reaching 2^32 packets per second */
+ est.pps = r->pps;
+
if (d->compat_tc_stats) {
- d->tc_stats.bps = r->bps;
- d->tc_stats.pps = r->pps;
+ d->tc_stats.bps = est.bps;
+ d->tc_stats.pps = est.pps;
}
- if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
+ if (d->tail) {
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
+ if (res < 0 || est.bps == r->bps)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
+ }
return 0;
}
diff --git a/net/core/iovec.c b/net/core/iovec.c
index c40f27e7d20..e1ec45ab1e6 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -35,11 +35,11 @@
* in any case.
*/
-int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
+int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
{
int size, ct, err;
- if (m->msg_namelen) {
+ if (m->msg_name && m->msg_namelen) {
if (mode == VERIFY_READ) {
void __user *namep;
namep = (void __user __force *) m->msg_name;
@@ -51,6 +51,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
m->msg_name = address;
} else {
m->msg_name = NULL;
+ m->msg_namelen = 0;
}
size = m->msg_iovlen * sizeof(struct iovec);
@@ -74,111 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
}
/*
- * Copy kernel to iovec. Returns -EFAULT on error.
- *
- * Note: this modifies the original iovec.
- */
-
-int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
-{
- while (len > 0) {
- if (iov->iov_len) {
- int copy = min_t(unsigned int, iov->iov_len, len);
- if (copy_to_user(iov->iov_base, kdata, copy))
- return -EFAULT;
- kdata += copy;
- len -= copy;
- iov->iov_len -= copy;
- iov->iov_base += copy;
- }
- iov++;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovec);
-
-/*
- * Copy kernel to iovec. Returns -EFAULT on error.
- */
-
-int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
- int offset, int len)
-{
- int copy;
- for (; len > 0; ++iov) {
- /* Skip over the finished iovecs */
- if (unlikely(offset >= iov->iov_len)) {
- offset -= iov->iov_len;
- continue;
- }
- copy = min_t(unsigned int, iov->iov_len - offset, len);
- if (copy_to_user(iov->iov_base + offset, kdata, copy))
- return -EFAULT;
- offset = 0;
- kdata += copy;
- len -= copy;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovecend);
-
-/*
- * Copy iovec to kernel. Returns -EFAULT on error.
- *
- * Note: this modifies the original iovec.
- */
-
-int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
-{
- while (len > 0) {
- if (iov->iov_len) {
- int copy = min_t(unsigned int, len, iov->iov_len);
- if (copy_from_user(kdata, iov->iov_base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- iov++;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovec);
-
-/*
- * Copy iovec from kernel. Returns -EFAULT on error.
- */
-
-int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
- int offset, int len)
-{
- /* Skip over the finished iovecs */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- }
-
- while (len > 0) {
- u8 __user *base = iov->iov_base + offset;
- int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
- offset = 0;
- if (copy_from_user(kdata, base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov++;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovecend);
-
-/*
* And now for the all-in-one: copy and checksum from a user iovec
* directly to a datagram
* Calls to csum_partial but the last must be in 32 bit chunks
@@ -262,3 +158,27 @@ out_fault:
goto out;
}
EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
+
+unsigned long iov_pages(const struct iovec *iov, int offset,
+ unsigned long nr_segs)
+{
+ unsigned long seg, base;
+ int pages = 0, len, size;
+
+ while (nr_segs && (offset >= iov->iov_len)) {
+ offset -= iov->iov_len;
+ ++iov;
+ --nr_segs;
+ }
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ base = (unsigned long)iov[seg].iov_base + offset;
+ len = iov[seg].iov_len - offset;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ pages += size;
+ offset = 0;
+ }
+
+ return pages;
+}
+EXPORT_SYMBOL(iov_pages);
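The per-segment arithmetic in iov_pages() above rounds up to whole pages while accounting for the starting offset inside the first page (~PAGE_MASK is PAGE_SIZE - 1). A small userspace illustration, assuming 4 KiB pages (not part of the patch):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Pages touched by 'len' bytes starting at address 'base', using the same
 * rounding as iov_pages() above.
 */
unsigned long pages_spanned(unsigned long base, unsigned long len)
{
	return ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
}

int main(void)
{
	printf("%lu\n", pages_spanned(0x1000, 100)); /* 1: fits in one page   */
	printf("%lu\n", pages_spanned(0x1fc0, 100)); /* 2: crosses a boundary */
	return 0;
}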
diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h
deleted file mode 100644
index 283c2b993fb..00000000000
--- a/net/core/kmap_skb.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <linux/highmem.h>
-
-static inline void *kmap_skb_frag(const skb_frag_t *frag)
-{
-#ifdef CONFIG_HIGHMEM
- BUG_ON(in_irq());
-
- local_bh_disable();
-#endif
- return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
-}
-
-static inline void kunmap_skb_frag(void *vaddr)
-{
- kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ);
-#ifdef CONFIG_HIGHMEM
- local_bh_enable();
-#endif
-}
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 01a1101b593..bd0767e6b2b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -76,10 +76,26 @@ static void rfc2863_policy(struct net_device *dev)
}
+void linkwatch_init_dev(struct net_device *dev)
+{
+ /* Handle pre-registration link state changes */
+ if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ rfc2863_policy(dev);
+}
+
+
static bool linkwatch_urgent_event(struct net_device *dev)
{
- return netif_running(dev) && netif_carrier_ok(dev) &&
- qdisc_tx_changing(dev);
+ if (!netif_running(dev))
+ return false;
+
+ if (dev->ifindex != dev->iflink)
+ return true;
+
+ if (dev->priv_flags & IFF_TEAM_PORT)
+ return true;
+
+ return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}
@@ -115,22 +131,13 @@ static void linkwatch_schedule_work(int urgent)
delay = 0;
/*
- * This is true if we've scheduled it immeditately or if we don't
- * need an immediate execution and it's already pending.
+ * If urgent, schedule immediate execution; otherwise, don't
+ * override the existing timer.
*/
- if (schedule_delayed_work(&linkwatch_work, delay) == !delay)
- return;
-
- /* Don't bother if there is nothing urgent. */
- if (!test_bit(LW_URGENT, &linkwatch_flags))
- return;
-
- /* It's already running which is good enough. */
- if (!cancel_delayed_work(&linkwatch_work))
- return;
-
- /* Otherwise we reschedule it again for immediate exection. */
- schedule_delayed_work(&linkwatch_work, 0);
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ mod_delayed_work(system_wq, &linkwatch_work, 0);
+ else
+ schedule_delayed_work(&linkwatch_work, delay);
}
@@ -140,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev)
* Make sure the above read is complete since it can be
* rewritten as soon as we clear the bit below.
*/
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
/* We are about to handle this device,
* so new events can be accepted
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8cc8f9a79db..ef31fef25e5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -15,6 +15,8 @@
* Harald Welte Add neighbour cache statistics like rtstat
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -36,23 +38,16 @@
#include <linux/random.h>
#include <linux/string.h>
#include <linux/log2.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#define DEBUG
#define NEIGH_DEBUG 1
-
-#define NEIGH_PRINTK(x...) printk(x)
-#define NEIGH_NOPRINTK(x...) do { ; } while(0)
-#define NEIGH_PRINTK0 NEIGH_PRINTK
-#define NEIGH_PRINTK1 NEIGH_NOPRINTK
-#define NEIGH_PRINTK2 NEIGH_NOPRINTK
-
-#if NEIGH_DEBUG >= 1
-#undef NEIGH_PRINTK1
-#define NEIGH_PRINTK1 NEIGH_PRINTK
-#endif
-#if NEIGH_DEBUG >= 2
-#undef NEIGH_PRINTK2
-#define NEIGH_PRINTK2 NEIGH_PRINTK
-#endif
+#define neigh_dbg(level, fmt, ...) \
+do { \
+ if (level <= NEIGH_DEBUG) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
#define PNEIGH_HASHMASK 0xF
@@ -99,7 +94,7 @@ static const struct file_operations neigh_stat_seq_fops;
static DEFINE_RWLOCK(neigh_tbl_lock);
-static int neigh_blackhole(struct sk_buff *skb)
+static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
return -ENETDOWN;
@@ -122,7 +117,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return base ? (net_random() % base) + (base >> 1) : 0;
+ return base ? (prandom_u32() % base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
@@ -138,7 +133,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
- for (i = 0; i <= nht->hash_mask; i++) {
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np;
@@ -211,7 +206,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
- for (i = 0; i <= nht->hash_mask; i++) {
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np = &nht->hash_buckets[i];
@@ -238,13 +233,14 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
we must kill timers etc. and move
it to safe state.
*/
- skb_queue_purge(&n->arp_queue);
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
n->nud_state = NUD_NONE;
- NEIGH_PRINTK2("neigh %p is stray.\n", n);
+ neigh_dbg(2, "neigh %p is stray\n", n);
}
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
@@ -273,7 +269,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
}
EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -288,16 +284,17 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
goto out_entries;
}
- n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+ n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
- skb_queue_head_init(&n->arp_queue);
+ __skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
+ seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
@@ -313,11 +310,18 @@ out_entries:
goto out;
}
-static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
+static void neigh_get_hash_rnd(u32 *x)
{
- size_t size = entries * sizeof(struct neighbour *);
+ get_random_bytes(x, sizeof(*x));
+ *x |= 1;
+}
+
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
+{
+ size_t size = (1 << shift) * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
- struct neighbour **buckets;
+ struct neighbour __rcu **buckets;
+ int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
@@ -325,16 +329,17 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
if (size <= PAGE_SIZE)
buckets = kzalloc(size, GFP_ATOMIC);
else
- buckets = (struct neighbour **)
+ buckets = (struct neighbour __rcu **)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
if (!buckets) {
kfree(ret);
return NULL;
}
- rcu_assign_pointer(ret->hash_buckets, buckets);
- ret->hash_mask = entries - 1;
- get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
+ ret->hash_buckets = buckets;
+ ret->hash_shift = shift;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
@@ -343,8 +348,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
- size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
- struct neighbour **buckets = nht->hash_buckets;
+ size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
+ struct neighbour __rcu **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
kfree(buckets);
@@ -354,21 +359,20 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
}
static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
- unsigned long new_entries)
+ unsigned long new_shift)
{
unsigned int i, hash;
struct neigh_hash_table *new_nht, *old_nht;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
- BUG_ON(!is_power_of_2(new_entries));
old_nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
- new_nht = neigh_hash_alloc(new_entries);
+ new_nht = neigh_hash_alloc(new_shift);
if (!new_nht)
return old_nht;
- for (i = 0; i <= old_nht->hash_mask; i++) {
+ for (i = 0; i < (1 << old_nht->hash_shift); i++) {
struct neighbour *n, *next;
for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
@@ -378,7 +382,7 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
- hash &= new_nht->hash_mask;
+ hash >>= (32 - new_nht->hash_shift);
next = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
@@ -407,7 +411,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
@@ -437,7 +441,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
- hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
+ hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
@@ -456,13 +460,13 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
}
EXPORT_SYMBOL(neigh_lookup_nodev);
-struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev)
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
{
u32 hash_val;
int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
@@ -480,6 +484,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
@@ -487,16 +499,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
- n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
+ n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
- if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
- nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
+ if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
+ nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
@@ -509,20 +521,22 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
- neigh_hold(n1);
+ if (want_ref)
+ neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
n->dead = 0;
- neigh_hold(n);
+ if (want_ref)
+ neigh_hold(n);
rcu_assign_pointer(n->next,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
write_unlock_bh(&tbl->lock);
- NEIGH_PRINTK2("neigh %p is created.\n", n);
+ neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
@@ -532,7 +546,7 @@ out_neigh_release:
neigh_release(n);
goto out;
}
-EXPORT_SYMBOL(neigh_create);
+EXPORT_SYMBOL(__neigh_create);
static u32 pneigh_hash(const void *pkey, int key_len)
{
@@ -677,51 +691,40 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
neigh_parms_destroy(parms);
}
-static void neigh_destroy_rcu(struct rcu_head *head)
-{
- struct neighbour *neigh = container_of(head, struct neighbour, rcu);
-
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
-}
/*
* neighbour must already be out of the table;
*
*/
void neigh_destroy(struct neighbour *neigh)
{
- struct hh_cache *hh;
+ struct net_device *dev = neigh->dev;
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
- printk(KERN_WARNING
- "Destroying alive neighbour %p\n", neigh);
+ pr_warn("Destroying alive neighbour %p\n", neigh);
dump_stack();
return;
}
if (neigh_del_timer(neigh))
- printk(KERN_WARNING "Impossible event.\n");
+ pr_warn("Impossible event\n");
- while ((hh = neigh->hh) != NULL) {
- neigh->hh = hh->hh_next;
- hh->hh_next = NULL;
-
- write_seqlock_bh(&hh->hh_lock);
- hh->hh_output = neigh_blackhole;
- write_sequnlock_bh(&hh->hh_lock);
- hh_cache_put(hh);
- }
+ write_lock_bh(&neigh->lock);
+ __skb_queue_purge(&neigh->arp_queue);
+ write_unlock_bh(&neigh->lock);
+ neigh->arp_queue_len_bytes = 0;
- skb_queue_purge(&neigh->arp_queue);
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(neigh);
- dev_put(neigh->dev);
+ dev_put(dev);
neigh_parms_put(neigh->parms);
- NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
+ neigh_dbg(2, "neigh %p is destroyed\n", neigh);
atomic_dec(&neigh->tbl->entries);
- call_rcu(&neigh->rcu, neigh_destroy_rcu);
+ kfree_rcu(neigh, rcu);
}
EXPORT_SYMBOL(neigh_destroy);
@@ -732,14 +735,9 @@ EXPORT_SYMBOL(neigh_destroy);
*/
static void neigh_suspect(struct neighbour *neigh)
{
- struct hh_cache *hh;
-
- NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->output = neigh->ops->output;
-
- for (hh = neigh->hh; hh; hh = hh->hh_next)
- hh->hh_output = neigh->ops->output;
}
/* Neighbour state is OK;
@@ -749,14 +747,9 @@ static void neigh_suspect(struct neighbour *neigh)
*/
static void neigh_connect(struct neighbour *neigh)
{
- struct hh_cache *hh;
-
- NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
+ neigh_dbg(2, "neigh %p is connected\n", neigh);
neigh->output = neigh->ops->connected_output;
-
- for (hh = neigh->hh; hh; hh = hh->hh_next)
- hh->hh_output = neigh->ops->hh_output;
}
static void neigh_periodic_work(struct work_struct *work)
@@ -782,10 +775,13 @@ static void neigh_periodic_work(struct work_struct *work)
tbl->last_rand = jiffies;
for (p = &tbl->parms; p; p = p->next)
p->reachable_time =
- neigh_rand_reach_time(p->base_reachable_time);
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
- for (i = 0 ; i <= nht->hash_mask; i++) {
+ if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ goto out;
+
+ for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
@@ -805,7 +801,7 @@ static void neigh_periodic_work(struct work_struct *work)
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
- time_after(jiffies, n->used + n->parms->gc_staletime))) {
+ time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
n->dead = 1;
write_unlock(&n->lock);
@@ -824,22 +820,26 @@ next_elt:
write_unlock_bh(&tbl->lock);
cond_resched();
write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
}
- /* Cycle through all hash buckets every base_reachable_time/2 ticks.
- * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
- * base_reachable_time.
+out:
+ /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
+ * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
+ * BASE_REACHABLE_TIME.
*/
- schedule_delayed_work(&tbl->gc_work,
- tbl->parms.base_reachable_time >> 1);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
write_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- return (n->nud_state & NUD_PROBE) ?
- p->ucast_probes :
- p->ucast_probes + p->app_probes + p->mcast_probes;
+ int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES);
+ if (!(n->nud_state & NUD_PROBE))
+ max_probes += NEIGH_VAR(p, MCAST_PROBES);
+ return max_probes;
}
static void neigh_invalidate(struct neighbour *neigh)
@@ -849,7 +849,7 @@ static void neigh_invalidate(struct neighbour *neigh)
struct sk_buff *skb;
NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
- NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
+ neigh_dbg(2, "neigh %p is failed\n", neigh);
neigh->updated = jiffies;
/* It is very thin place. report_unreachable is very complicated
@@ -863,7 +863,21 @@ static void neigh_invalidate(struct neighbour *neigh)
neigh->ops->error_report(neigh, skb);
write_lock(&neigh->lock);
}
- skb_queue_purge(&neigh->arp_queue);
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
+static void neigh_probe(struct neighbour *neigh)
+ __releases(neigh->lock)
+{
+ struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_copy(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ neigh->ops->solicit(neigh, skb);
+ atomic_inc(&neigh->probes);
+ kfree_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
@@ -872,7 +886,7 @@ static void neigh_timer_handler(unsigned long arg)
{
unsigned long now, next;
struct neighbour *neigh = (struct neighbour *)arg;
- unsigned state;
+ unsigned int state;
int notify = 0;
write_lock(&neigh->lock);
@@ -881,27 +895,24 @@ static void neigh_timer_handler(unsigned long arg)
now = jiffies;
next = now + HZ;
- if (!(state & NUD_IN_TIMER)) {
-#ifndef CONFIG_SMP
- printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
-#endif
+ if (!(state & NUD_IN_TIMER))
goto out;
- }
if (state & NUD_REACHABLE) {
if (time_before_eq(now,
neigh->confirmed + neigh->parms->reachable_time)) {
- NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
+ neigh_dbg(2, "neigh %p is still alive\n", neigh);
next = neigh->confirmed + neigh->parms->reachable_time;
} else if (time_before_eq(now,
- neigh->used + neigh->parms->delay_probe_time)) {
- NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+ neigh->used +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_suspect(neigh);
- next = now + neigh->parms->delay_probe_time;
+ next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
} else {
- NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->nud_state = NUD_STALE;
neigh->updated = jiffies;
neigh_suspect(neigh);
@@ -909,23 +920,24 @@ static void neigh_timer_handler(unsigned long arg)
}
} else if (state & NUD_DELAY) {
if (time_before_eq(now,
- neigh->confirmed + neigh->parms->delay_probe_time)) {
- NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh);
+ neigh->confirmed +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is now reachable\n", neigh);
neigh->nud_state = NUD_REACHABLE;
neigh->updated = jiffies;
neigh_connect(neigh);
notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
- NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
+ neigh_dbg(2, "neigh %p is probed\n", neigh);
neigh->nud_state = NUD_PROBE;
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
- next = now + neigh->parms->retrans_time;
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + neigh->parms->retrans_time;
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -933,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_FAILED;
notify = 1;
neigh_invalidate(neigh);
+ goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
@@ -942,14 +955,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh_hold(neigh);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
- struct sk_buff *skb = skb_peek(&neigh->arp_queue);
- /* keep skb alive even if arp_queue overflows */
- if (skb)
- skb = skb_copy(skb, GFP_ATOMIC);
- write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb);
- atomic_inc(&neigh->probes);
- kfree_skb(skb);
+ neigh_probe(neigh);
} else {
out:
write_unlock(&neigh->lock);
@@ -964,7 +970,7 @@ out:
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
- unsigned long now;
+ bool immediate_probe = false;
write_lock_bh(&neigh->lock);
@@ -972,14 +978,19 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
- now = jiffies;
-
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
- if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
- atomic_set(&neigh->probes, neigh->parms->ucast_probes);
+ if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
+ NEIGH_VAR(neigh->parms, APP_PROBES)) {
+ unsigned long next, now = jiffies;
+
+ atomic_set(&neigh->probes,
+ NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh->nud_state = NUD_INCOMPLETE;
- neigh->updated = jiffies;
- neigh_add_timer(neigh, now + 1);
+ neigh->updated = now;
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/2);
+ neigh_add_timer(neigh, next);
+ immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
@@ -989,34 +1000,43 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
- NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
- neigh_add_timer(neigh,
- jiffies + neigh->parms->delay_probe_time);
+ neigh_add_timer(neigh, jiffies +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
- if (skb_queue_len(&neigh->arp_queue) >=
- neigh->parms->queue_len) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
struct sk_buff *buff;
+
buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
- write_unlock_bh(&neigh->lock);
+ if (immediate_probe)
+ neigh_probe(neigh);
+ else
+ write_unlock(&neigh->lock);
+ local_bh_enable();
return rc;
}
EXPORT_SYMBOL(__neigh_event_send);
-static void neigh_update_hhs(const struct neighbour *neigh)
+static void neigh_update_hhs(struct neighbour *neigh)
{
struct hh_cache *hh;
void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1026,7 +1046,8 @@ static void neigh_update_hhs(const struct neighbour *neigh)
update = neigh->dev->header_ops->cache_update;
if (update) {
- for (hh = neigh->hh; hh; hh = hh->hh_next) {
+ hh = &neigh->hh;
+ if (hh->hh_len) {
write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_sequnlock_bh(&hh->hh_lock);
@@ -1149,6 +1170,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh->parms->reachable_time :
0)));
neigh->nud_state = new;
+ notify = 1;
}
if (lladdr != neigh->ha) {
@@ -1158,7 +1180,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
- (neigh->parms->base_reachable_time << 1);
+ (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
notify = 1;
}
if (new == old)
@@ -1174,15 +1196,34 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- struct neighbour *n1 = neigh;
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
write_unlock_bh(&neigh->lock);
- /* On shaper/eql skb->dst->neighbour != neigh :( */
- if (skb_dst(skb) && skb_dst(skb)->neighbour)
- n1 = skb_dst(skb)->neighbour;
- n1->output(skb);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ n1->output(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
write_lock_bh(&neigh->lock);
}
- skb_queue_purge(&neigh->arp_queue);
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
@@ -1199,6 +1240,21 @@ out:
}
EXPORT_SYMBOL(neigh_update);
+/* Update the neigh to listen temporarily for probe responses, even if it is
+ * in a NUD_FAILED state. The caller has to hold neigh->lock for writing.
+ */
+void __neigh_set_probe_once(struct neighbour *neigh)
+{
+ neigh->updated = jiffies;
+ if (!(neigh->nud_state & NUD_FAILED))
+ return;
+ neigh->nud_state = NUD_INCOMPLETE;
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
+ neigh_add_timer(neigh,
+ jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+}
+EXPORT_SYMBOL(__neigh_set_probe_once);
+
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
u8 *lladdr, void *saddr,
struct net_device *dev)
@@ -1212,67 +1268,21 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
}
EXPORT_SYMBOL(neigh_event_ns);
-static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
- __be16 protocol)
-{
- struct hh_cache *hh;
-
- smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
- for (hh = n->hh; hh; hh = hh->hh_next) {
- if (hh->hh_type == protocol) {
- atomic_inc(&hh->hh_refcnt);
- if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
- hh_cache_put(hh);
- return true;
- }
- }
- return false;
-}
-
/* called with read_lock_bh(&n->lock); */
-static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
- __be16 protocol)
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
{
- struct hh_cache *hh;
struct net_device *dev = dst->dev;
-
- if (likely(neigh_hh_lookup(n, dst, protocol)))
- return;
-
- /* slow path */
- hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
- if (!hh)
- return;
-
- seqlock_init(&hh->hh_lock);
- hh->hh_type = protocol;
- atomic_set(&hh->hh_refcnt, 2);
-
- if (dev->header_ops->cache(n, hh)) {
- kfree(hh);
- return;
- }
+ __be16 prot = dst->ops->protocol;
+ struct hh_cache *hh = &n->hh;
write_lock_bh(&n->lock);
- /* must check if another thread already did the insert */
- if (neigh_hh_lookup(n, dst, protocol)) {
- kfree(hh);
- goto end;
- }
-
- if (n->nud_state & NUD_CONNECTED)
- hh->hh_output = n->ops->hh_output;
- else
- hh->hh_output = n->ops->output;
-
- hh->hh_next = n->hh;
- smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
- n->hh = hh;
+ /* Only one thread can come in here and initialize the
+ * hh_cache entry.
+ */
+ if (!hh->hh_len)
+ dev->header_ops->cache(n, hh, prot);
- if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
- hh_cache_put(hh);
-end:
write_unlock_bh(&n->lock);
}
@@ -1281,7 +1291,7 @@ end:
* but resolution is not made yet.
*/
-int neigh_compat_output(struct sk_buff *skb)
+int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -1289,7 +1299,7 @@ int neigh_compat_output(struct sk_buff *skb)
if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
skb->len) < 0 &&
- dev->header_ops->rebuild(skb))
+ dev_rebuild_header(skb))
return 0;
return dev_queue_xmit(skb);
@@ -1298,43 +1308,38 @@ EXPORT_SYMBOL(neigh_compat_output);
/* Slow and careful. */
-int neigh_resolve_output(struct sk_buff *skb)
+int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct neighbour *neigh;
int rc = 0;
- if (!dst || !(neigh = dst->neighbour))
+ if (!dst)
goto discard;
- __skb_pull(skb, skb_network_offset(skb));
-
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
- if (dev->header_ops->cache &&
- !dst->hh &&
- !(dst->flags & DST_NOCACHE))
- neigh_hh_init(neigh, dst, dst->ops->protocol);
+ if (dev->header_ops->cache && !neigh->hh.hh_len)
+ neigh_hh_init(neigh, dst);
do {
+ __skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
- rc = neigh->ops->queue_xmit(skb);
+ rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
discard:
- NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
- dst, dst ? dst->neighbour : NULL);
+ neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh);
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
@@ -1344,24 +1349,21 @@ EXPORT_SYMBOL(neigh_resolve_output);
/* As fast as possible without hh cache */
-int neigh_connected_output(struct sk_buff *skb)
+int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
{
- int err;
- struct dst_entry *dst = skb_dst(skb);
- struct neighbour *neigh = dst->neighbour;
struct net_device *dev = neigh->dev;
unsigned int seq;
-
- __skb_pull(skb, skb_network_offset(skb));
+ int err;
do {
+ __skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
- err = neigh->ops->queue_xmit(skb);
+ err = dev_queue_xmit(skb);
else {
err = -EINVAL;
kfree_skb(skb);
@@ -1370,6 +1372,12 @@ int neigh_connected_output(struct sk_buff *skb)
}
EXPORT_SYMBOL(neigh_connected_output);
+int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_direct_output);
+
static void neigh_proxy_process(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table *)arg;
@@ -1384,11 +1392,15 @@ static void neigh_proxy_process(unsigned long arg)
if (tdif <= 0) {
struct net_device *dev = skb->dev;
+
__skb_unlink(skb, &tbl->proxy_queue);
- if (tbl->proxy_redo && netif_running(dev))
+ if (tbl->proxy_redo && netif_running(dev)) {
+ rcu_read_lock();
tbl->proxy_redo(skb);
- else
+ rcu_read_unlock();
+ } else {
kfree_skb(skb);
+ }
dev_put(dev);
} else if (!sched_next || tdif < sched_next)
@@ -1404,9 +1416,11 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
unsigned long now = jiffies;
- unsigned long sched_next = now + (net_random() % p->proxy_delay);
- if (tbl->proxy_queue.qlen > p->proxy_qlen) {
+ unsigned long sched_next = now + (prandom_u32() %
+ NEIGH_VAR(p, PROXY_DELAY));
+
+ if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
@@ -1434,7 +1448,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
for (p = &tbl->parms; p; p = p->next) {
if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
- (!p->dev && !ifindex))
+ (!p->dev && !ifindex && net_eq(net, &init_net)))
return p;
}
@@ -1444,34 +1458,34 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
struct neigh_table *tbl)
{
- struct neigh_parms *p, *ref;
+ struct neigh_parms *p;
struct net *net = dev_net(dev);
const struct net_device_ops *ops = dev->netdev_ops;
- ref = lookup_neigh_parms(tbl, net, 0);
- if (!ref)
- return NULL;
-
- p = kmemdup(ref, sizeof(*p), GFP_KERNEL);
+ p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl = tbl;
atomic_set(&p->refcnt, 1);
p->reachable_time =
- neigh_rand_reach_time(p->base_reachable_time);
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ dev_hold(dev);
+ p->dev = dev;
+ write_pnet(&p->net, hold_net(net));
+ p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+ release_net(net);
+ dev_put(dev);
kfree(p);
return NULL;
}
- dev_hold(dev);
- p->dev = dev;
- write_pnet(&p->net, hold_net(net));
- p->sysctl_table = NULL;
write_lock_bh(&tbl->lock);
p->next = tbl->parms.next;
tbl->parms.next = p;
write_unlock_bh(&tbl->lock);
+
+ neigh_parms_data_state_cleanall(p);
}
return p;
}
@@ -1504,7 +1518,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
}
}
write_unlock_bh(&tbl->lock);
- NEIGH_PRINTK1("neigh_parms_release: not found\n");
+ neigh_dbg(1, "%s: not found\n", __func__);
}
EXPORT_SYMBOL(neigh_parms_release);
@@ -1516,7 +1530,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms)
static struct lock_class_key neigh_table_proxy_queue_class;
-void neigh_table_init_no_netlink(struct neigh_table *tbl)
+static void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
@@ -1524,13 +1538,8 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
write_pnet(&tbl->parms.net, &init_net);
atomic_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
- neigh_rand_reach_time(tbl->parms.base_reachable_time);
+ neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
- if (!tbl->kmem_cachep)
- tbl->kmem_cachep =
- kmem_cache_create(tbl->id, tbl->entry_size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL);
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
@@ -1541,7 +1550,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
panic("cannot create neighbour proc dir entry");
#endif
- tbl->nht = neigh_hash_alloc(8);
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
@@ -1549,9 +1558,16 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
+ if (!tbl->entry_size)
+ tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
+ tbl->key_len, NEIGH_PRIV_ALIGN);
+ else
+ WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
+
rwlock_init(&tbl->lock);
- INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
- schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
+ INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ tbl->parms.reachable_time);
setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
skb_queue_head_init_class(&tbl->proxy_queue,
&neigh_table_proxy_queue_class);
@@ -1559,7 +1575,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
}
-EXPORT_SYMBOL(neigh_table_init_no_netlink);
void neigh_table_init(struct neigh_table *tbl)
{
@@ -1576,8 +1591,8 @@ void neigh_table_init(struct neigh_table *tbl)
write_unlock(&neigh_tbl_lock);
if (unlikely(tmp)) {
- printk(KERN_ERR "NEIGH: Registering multiple tables for "
- "family %d\n", tbl->family);
+ pr_err("Registering multiple tables for family %d\n",
+ tbl->family);
dump_stack();
}
}
@@ -1593,7 +1608,7 @@ int neigh_table_clear(struct neigh_table *tbl)
pneigh_queue_purge(&tbl->proxy_queue);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
- printk(KERN_CRIT "neighbour leakage\n");
+ pr_crit("neighbour leakage\n");
write_lock(&neigh_tbl_lock);
for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
if (*tp == tbl) {
@@ -1603,7 +1618,8 @@ int neigh_table_clear(struct neigh_table *tbl)
}
write_unlock(&neigh_tbl_lock);
- call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
tbl->nht = NULL;
kfree(tbl->phash_buckets);
@@ -1614,14 +1630,11 @@ int neigh_table_clear(struct neigh_table *tbl)
free_percpu(tbl->stats);
tbl->stats = NULL;
- kmem_cache_destroy(tbl->kmem_cachep);
- tbl->kmem_cachep = NULL;
-
return 0;
}
EXPORT_SYMBOL(neigh_table_clear);
-static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
@@ -1685,7 +1698,7 @@ out:
return err;
}
-static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
@@ -1791,25 +1804,36 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
if (nest == NULL)
return -ENOBUFS;
- if (parms->dev)
- NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
-
- NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
- NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
- NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
- NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
- NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
- NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
- NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
- NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
- parms->base_reachable_time);
- NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
- NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
- NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
- NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
- NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
- NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
-
+ if ((parms->dev &&
+ nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
+ nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
+ /* approximate value for deprecated QUEUE_LEN (in packets) */
+ nla_put_u32(skb, NDTPA_QUEUE_LEN,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
+ nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) ||
+ nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) ||
+ nla_put_u32(skb, NDTPA_UCAST_PROBES,
+ NEIGH_VAR(parms, UCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_PROBES,
+ NEIGH_VAR(parms, MCAST_PROBES)) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
+ nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME)) ||
+ nla_put_msecs(skb, NDTPA_GC_STALETIME,
+ NEIGH_VAR(parms, GC_STALETIME)) ||
+ nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
+ NEIGH_VAR(parms, DELAY_PROBE_TIME)) ||
+ nla_put_msecs(skb, NDTPA_RETRANS_TIME,
+ NEIGH_VAR(parms, RETRANS_TIME)) ||
+ nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
+ NEIGH_VAR(parms, ANYCAST_DELAY)) ||
+ nla_put_msecs(skb, NDTPA_PROXY_DELAY,
+ NEIGH_VAR(parms, PROXY_DELAY)) ||
+ nla_put_msecs(skb, NDTPA_LOCKTIME,
+ NEIGH_VAR(parms, LOCKTIME)))
+ goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
@@ -1834,12 +1858,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
- NLA_PUT_STRING(skb, NDTA_NAME, tbl->id);
- NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
- NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
- NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
- NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
-
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) ||
+ nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
+ nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
+ nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
+ goto nla_put_failure;
{
unsigned long now = jiffies;
unsigned int flush_delta = now - tbl->last_flush;
@@ -1856,11 +1880,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
- ndc.ndtc_hash_rnd = nht->hash_rnd;
- ndc.ndtc_hash_mask = nht->hash_mask;
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
+ ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
rcu_read_unlock_bh();
- NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+ if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
+ goto nla_put_failure;
}
{
@@ -1885,7 +1910,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndst.ndts_forced_gc_runs += st->forced_gc_runs;
}
- NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+ if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
+ goto nla_put_failure;
}
BUG_ON(tbl->parms.dev);
@@ -1958,7 +1984,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
};
-static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct neigh_table *tbl;
@@ -2022,45 +2048,68 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
switch (i) {
case NDTPA_QUEUE_LEN:
- p->queue_len = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN));
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_PROXY_QLEN:
- p->proxy_qlen = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, PROXY_QLEN,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_APP_PROBES:
- p->app_probes = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, APP_PROBES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_UCAST_PROBES:
- p->ucast_probes = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, UCAST_PROBES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_MCAST_PROBES:
- p->mcast_probes = nla_get_u32(tbp[i]);
+ NEIGH_VAR_SET(p, MCAST_PROBES,
+ nla_get_u32(tbp[i]));
break;
case NDTPA_BASE_REACHABLE_TIME:
- p->base_reachable_time = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_GC_STALETIME:
- p->gc_staletime = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, GC_STALETIME,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_DELAY_PROBE_TIME:
- p->delay_probe_time = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_RETRANS_TIME:
- p->retrans_time = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, RETRANS_TIME,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_ANYCAST_DELAY:
- p->anycast_delay = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, ANYCAST_DELAY,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_PROXY_DELAY:
- p->proxy_delay = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, PROXY_DELAY,
+ nla_get_msecs(tbp[i]));
break;
case NDTPA_LOCKTIME:
- p->locktime = nla_get_msecs(tbp[i]);
+ NEIGH_VAR_SET(p, LOCKTIME,
+ nla_get_msecs(tbp[i]));
break;
}
}
}
+ err = -ENOENT;
+ if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+ tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+ !net_eq(net, &init_net))
+ goto errout_tbl_lock;
+
if (tb[NDTA_THRESH1])
tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
@@ -2100,7 +2149,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
- if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
+ if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
NLM_F_MULTI) <= 0)
break;
@@ -2113,7 +2162,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
goto next;
if (neightbl_fill_param_info(skb, tbl, p,
- NETLINK_CB(cb->skb).pid,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
NLM_F_MULTI) <= 0)
@@ -2152,7 +2201,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
ndm->ndm_type = neigh->type;
ndm->ndm_ifindex = neigh->dev->ifindex;
- NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
+ if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
+ goto nla_put_failure;
read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
@@ -2172,8 +2222,39 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
- NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
- NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+ if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
+ nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
+ u32 pid, u32 seq, int type, unsigned int flags,
+ struct neigh_table *tbl)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = tbl->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = pn->flags | NTF_PROXY;
+ ndm->ndm_type = RTN_UNICAST;
+ ndm->ndm_ifindex = pn->dev->ifindex;
+ ndm->ndm_state = NUD_NONE;
+
+ if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
+ goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -2200,9 +2281,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
- for (h = 0; h <= nht->hash_mask; h++) {
- if (h < s_h)
- continue;
+ for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
@@ -2212,7 +2291,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
continue;
if (idx < s_idx)
goto next;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+ if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI) <= 0) {
@@ -2231,22 +2310,77 @@ out:
return rc;
}
+static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct pneigh_entry *n;
+ struct net *net = sock_net(skb->sk);
+ int rc, h, s_h = cb->args[3];
+ int idx, s_idx = idx = cb->args[4];
+
+ read_lock_bh(&tbl->lock);
+
+ for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+ if (dev_net(n->dev) != net)
+ continue;
+ if (idx < s_idx)
+ goto next;
+ if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI, tbl) <= 0) {
+ read_unlock_bh(&tbl->lock);
+ rc = -1;
+ goto out;
+ }
+ next:
+ idx++;
+ }
+ }
+
+ read_unlock_bh(&tbl->lock);
+ rc = skb->len;
+out:
+ cb->args[3] = h;
+ cb->args[4] = idx;
+ return rc;
+
+}
+
static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct neigh_table *tbl;
int t, family, s_t;
+ int proxy = 0;
+ int err;
read_lock(&neigh_tbl_lock);
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+ /* check for full ndmsg structure presence, family member is
+ * the same for both structures
+ */
+ if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+ proxy = 1;
+
s_t = cb->args[0];
- for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+ for (tbl = neigh_tables, t = 0; tbl;
+ tbl = tbl->next, t++) {
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
- if (neigh_dump_table(tbl, skb, cb) < 0)
+ if (proxy)
+ err = pneigh_dump_table(tbl, skb, cb);
+ else
+ err = neigh_dump_table(tbl, skb, cb);
+ if (err < 0)
break;
}
read_unlock(&neigh_tbl_lock);
@@ -2264,7 +2398,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
nht = rcu_dereference_bh(tbl->nht);
read_lock(&tbl->lock); /* avoid resizes */
- for (chain = 0; chain <= nht->hash_mask; chain++) {
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
@@ -2286,7 +2420,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
- for (chain = 0; chain <= nht->hash_mask; chain++) {
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
struct neighbour __rcu **np;
@@ -2323,7 +2457,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
int bucket = state->bucket;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
+ for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
n = rcu_dereference_bh(nht->hash_buckets[bucket]);
while (n) {
@@ -2390,7 +2524,7 @@ next:
if (n)
break;
- if (++state->bucket > nht->hash_mask)
+ if (++state->bucket >= (1 << nht->hash_shift))
break;
n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
@@ -2445,7 +2579,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
- pn = pn->next;
+ do {
+ pn = pn->next;
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
@@ -2625,7 +2762,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- sf->private = PDE(inode)->data;
+ sf->private = PDE_DATA(inode);
}
return ret;
};
@@ -2673,219 +2810,299 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
-#ifdef CONFIG_ARPD
void neigh_app_ns(struct neighbour *n)
{
__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
}
EXPORT_SYMBOL(neigh_app_ns);
-#endif /* CONFIG_ARPD */
#ifdef CONFIG_SYSCTL
+static int zero;
+static int int_max = INT_MAX;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
+
+static int proc_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ struct ctl_table tmp = *ctl;
-#define NEIGH_VARS_MAX 19
+ tmp.extra1 = &zero;
+ tmp.extra2 = &unres_qlen_max;
+ tmp.data = &size;
+
+ size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+ return ret;
+}
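
The proc_unres_qlen() handler above keeps the legacy packet-count sysctl while the kernel now stores the limit in bytes: reads divide by the per-frame truesize, writes multiply it back. A minimal standalone sketch of that round trip follows; the 2048-byte truesize is an assumed placeholder for SKB_TRUESIZE(ETH_FRAME_LEN), whose real value depends on the build.

    /* Illustrative userspace sketch, not kernel code.  ASSUMED_TRUESIZE
     * stands in for SKB_TRUESIZE(ETH_FRAME_LEN). */
    #include <stdio.h>

    #define ASSUMED_TRUESIZE 2048

    static int qlen_bytes_to_packets(int bytes) { return bytes / ASSUMED_TRUESIZE; }
    static int qlen_packets_to_bytes(int pkts)  { return pkts * ASSUMED_TRUESIZE; }

    int main(void)
    {
            int stored = qlen_packets_to_bytes(101);   /* user writes "101" packets */
            printf("stored %d bytes, reported as %d packets\n",
                   stored, qlen_bytes_to_packets(stored));
            return 0;
    }
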
+
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
+ int index)
+{
+ struct net_device *dev;
+ int family = neigh_parms_family(p);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct neigh_parms *dst_p =
+ neigh_get_dev_parms_rcu(dev, family);
+
+ if (dst_p && !test_bit(index, dst_p->data_state))
+ dst_p->data[index] = p->data[index];
+ }
+ rcu_read_unlock();
+}
+
+static void neigh_proc_update(struct ctl_table *ctl, int write)
+{
+ struct net_device *dev = ctl->extra1;
+ struct neigh_parms *p = ctl->extra2;
+ struct net *net = neigh_parms_net(p);
+ int index = (int *) ctl->data - p->data;
+
+ if (!write)
+ return;
+
+ set_bit(index, p->data_state);
+ if (!dev) /* NULL dev means this is default value */
+ neigh_copy_dflt_parms(net, p, index);
+}
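
neigh_proc_update() above implements the default-vs-per-device rule: a write marks the parameter as explicitly set, and a write to the default table (dev == NULL) is copied only into per-device copies whose corresponding data_state bit is still clear. A small standalone sketch of that rule, with illustrative names rather than the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    #define NPARMS 4
    #define NDEVS  2

    struct parms { int data[NPARMS]; bool set[NPARMS]; };

    static void write_default(struct parms *dflt, struct parms dev[], int idx, int val)
    {
            dflt->data[idx] = val;
            dflt->set[idx] = true;
            for (int d = 0; d < NDEVS; d++)
                    if (!dev[d].set[idx])   /* only untouched copies follow the default */
                            dev[d].data[idx] = val;
    }

    int main(void)
    {
            struct parms dflt = {0}, dev[NDEVS] = {0};

            dev[1].data[0] = 7;
            dev[1].set[0] = true;           /* admin already overrode this one */
            write_default(&dflt, dev, 0, 3);
            printf("dev0=%d dev1=%d\n", dev[0].data[0], dev[1].data[0]);  /* 3 7 */
            return 0;
    }
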
+
+static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &int_max;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec);
+
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
+
+static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
+
+static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+#define NEIGH_PARMS_DATA_OFFSET(index) \
+ (&((struct neigh_parms *) 0)->data[index])
+
+#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \
+ [NEIGH_VAR_ ## attr] = { \
+ .procname = name, \
+ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ }
+
+#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax)
+
+#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies)
+
+#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table neigh_vars[NEIGH_VARS_MAX];
- char *dev_name;
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
- {
- .procname = "mcast_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ucast_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "app_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "retrans_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_userhz_jiffies,
- },
- {
- .procname = "base_reachable_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "delay_first_probe_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "gc_stale_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "unres_qlen",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "proxy_qlen",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "anycast_delay",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_userhz_jiffies,
- },
- {
- .procname = "proxy_delay",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_userhz_jiffies,
- },
- {
- .procname = "locktime",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_userhz_jiffies,
- },
- {
- .procname = "retrans_time_ms",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
- {
- .procname = "base_reachable_time_ms",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
- {
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
+ NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
+ [NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
},
- {
+ [NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
},
- {
+ [NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
},
{},
},
};
int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
- char *p_name, proc_handler *handler)
+ proc_handler *handler)
{
+ int i;
struct neigh_sysctl_table *t;
- const char *dev_name_source = NULL;
-
-#define NEIGH_CTL_PATH_ROOT 0
-#define NEIGH_CTL_PATH_PROTO 1
-#define NEIGH_CTL_PATH_NEIGH 2
-#define NEIGH_CTL_PATH_DEV 3
-
- struct ctl_path neigh_path[] = {
- { .procname = "net", },
- { .procname = "proto", },
- { .procname = "neigh", },
- { .procname = "default", },
- { },
- };
+ const char *dev_name_source;
+ char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
+ char *p_name;
t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
if (!t)
goto err;
- t->neigh_vars[0].data = &p->mcast_probes;
- t->neigh_vars[1].data = &p->ucast_probes;
- t->neigh_vars[2].data = &p->app_probes;
- t->neigh_vars[3].data = &p->retrans_time;
- t->neigh_vars[4].data = &p->base_reachable_time;
- t->neigh_vars[5].data = &p->delay_probe_time;
- t->neigh_vars[6].data = &p->gc_staletime;
- t->neigh_vars[7].data = &p->queue_len;
- t->neigh_vars[8].data = &p->proxy_qlen;
- t->neigh_vars[9].data = &p->anycast_delay;
- t->neigh_vars[10].data = &p->proxy_delay;
- t->neigh_vars[11].data = &p->locktime;
- t->neigh_vars[12].data = &p->retrans_time;
- t->neigh_vars[13].data = &p->base_reachable_time;
+ for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
+ t->neigh_vars[i].data += (long) p;
+ t->neigh_vars[i].extra1 = dev;
+ t->neigh_vars[i].extra2 = p;
+ }
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
- memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
+ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+ sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
- dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
- t->neigh_vars[14].data = (int *)(p + 1);
- t->neigh_vars[15].data = (int *)(p + 1) + 1;
- t->neigh_vars[16].data = (int *)(p + 1) + 2;
- t->neigh_vars[17].data = (int *)(p + 1) + 3;
+ struct neigh_table *tbl = p->tbl;
+ dev_name_source = "default";
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
}
-
if (handler) {
/* RetransTime */
- t->neigh_vars[3].proc_handler = handler;
- t->neigh_vars[3].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
/* ReachableTime */
- t->neigh_vars[4].proc_handler = handler;
- t->neigh_vars[4].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
/* RetransTime (in milliseconds)*/
- t->neigh_vars[12].proc_handler = handler;
- t->neigh_vars[12].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
/* ReachableTime (in milliseconds) */
- t->neigh_vars[13].proc_handler = handler;
- t->neigh_vars[13].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
}
- t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
- if (!t->dev_name)
- goto free;
+ /* Don't export sysctls to unprivileged users */
+ if (neigh_parms_net(p)->user_ns != &init_user_ns)
+ t->neigh_vars[0].procname = NULL;
- neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name;
- neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name;
+ switch (neigh_parms_family(p)) {
+ case AF_INET:
+ p_name = "ipv4";
+ break;
+ case AF_INET6:
+ p_name = "ipv6";
+ break;
+ default:
+ BUG();
+ }
+ snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
+ p_name, dev_name_source);
t->sysctl_header =
- register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars);
+ register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free;
p->sysctl_table = t;
return 0;
-free_procname:
- kfree(t->dev_name);
free:
kfree(t);
err:
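
A note on the neigh_sysctl_register() hunk above: the NEIGH_PARMS_DATA_OFFSET entries in the template store only the offset of data[NEIGH_VAR_*] within struct neigh_parms, and the registration loop then rebases each .data pointer by adding the actual parms address (while stashing dev and p in extra1/extra2). The same offset-then-rebase pattern in a self-contained sketch:

    #include <stddef.h>
    #include <stdio.h>

    struct parms { int data[3]; };
    struct entry { const char *name; void *data; };

    int main(void)
    {
            struct parms p = { { 10, 20, 30 } };
            /* template entry: .data carries only the field's offset */
            struct entry e = { "retrans_time",
                               (void *)offsetof(struct parms, data[1]) };

            e.data = (char *)&p + (size_t)e.data;   /* rebase onto the real object */
            printf("%s = %d\n", e.name, *(int *)e.data);   /* prints 20 */
            return 0;
    }
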
@@ -2898,8 +3115,7 @@ void neigh_sysctl_unregister(struct neigh_parms *p)
if (p->sysctl_table) {
struct neigh_sysctl_table *t = p->sysctl_table;
p->sysctl_table = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->dev_name);
+ unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
}
@@ -2909,12 +3125,13 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);
static int __init neigh_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info);
+ rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info);
- rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
+ NULL);
+ rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);
return 0;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
new file mode 100644
index 00000000000..2bf83299600
--- /dev/null
+++ b/net/core/net-procfs.c
@@ -0,0 +1,423 @@
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/wext.h>
+
+#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
+
+extern struct list_head ptype_all __read_mostly;
+extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
+static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct net_device *dev;
+ struct hlist_head *h;
+ unsigned int count = 0, offset = get_offset(*pos);
+
+ h = &net->dev_name_head[get_bucket(*pos)];
+ hlist_for_each_entry_rcu(dev, h, name_hlist) {
+ if (++count == offset)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net_device *dev;
+ unsigned int bucket;
+
+ do {
+ dev = dev_from_same_bucket(seq, pos);
+ if (dev)
+ return dev;
+
+ bucket = get_bucket(*pos) + 1;
+ *pos = set_bucket_offset(bucket, 1);
+ } while (bucket < NETDEV_HASHENTRIES);
+
+ return NULL;
+}
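
The helpers above pack the /proc/net/dev iteration state into a single seq_file position: the upper bits select the name-hash bucket and the lower bits hold a 1-based offset inside it, so an interrupted dump can resume where it stopped. A standalone sketch of the encoding (the 8-bit hash size is an assumption standing in for NETDEV_HASHBITS):

    #include <stdio.h>

    #define HASHBITS     8
    #define BUCKET_SPACE (32 - HASHBITS - 1)

    static unsigned int get_bucket(unsigned int pos) { return pos >> BUCKET_SPACE; }
    static unsigned int get_offset(unsigned int pos) { return pos & ((1u << BUCKET_SPACE) - 1); }
    static unsigned int set_bucket_offset(unsigned int b, unsigned int o)
    {
            return (b << BUCKET_SPACE) | o;
    }

    int main(void)
    {
            unsigned int pos = set_bucket_offset(5, 3);   /* 3rd device in bucket 5 */
            printf("bucket=%u offset=%u\n", get_bucket(pos), get_offset(pos));
            return 0;
    }
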
+
+/*
+ * This is invoked by the /proc filesystem handler to display a device
+ * in detail.
+ */
+static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
+ return NULL;
+
+ return dev_from_bucket(seq, pos);
+}
+
+static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return dev_from_bucket(seq, pos);
+}
+
+static void dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+ "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+ dev->name, stats->rx_bytes, stats->rx_packets,
+ stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors,
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors +
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes, stats->tx_packets,
+ stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors +
+ stats->tx_aborted_errors +
+ stats->tx_window_errors +
+ stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized
+ * /proc/net interface to create /proc/net/dev
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Inter-| Receive "
+ " | Transmit\n"
+ " face |bytes packets errs drop fifo frame "
+ "compressed multicast|bytes packets errs "
+ "drop fifo colls carrier compressed\n");
+ else
+ dev_seq_printf_stats(seq, v);
+ return 0;
+}
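
dev_seq_show() relies on the usual seq_file convention: position 0 produces SEQ_START_TOKEN so the header is printed exactly once, and every later position maps to a record. A minimal emulation of that start/next/show loop (names are illustrative, not the kernel API):

    #include <stdio.h>

    #define START_TOKEN ((void *)1)

    static char *devs[] = { "lo", "eth0" };
    #define NDEVS (sizeof(devs) / sizeof(devs[0]))

    static void *start(long *pos)
    {
            if (*pos == 0)
                    return START_TOKEN;
            return *pos <= (long)NDEVS ? (void *)devs[*pos - 1] : NULL;
    }

    static void *next(long *pos) { ++*pos; return start(pos); }

    static void show(void *v)
    {
            if (v == START_TOKEN)
                    printf("Inter-|   Receive ...\n");   /* header, printed once */
            else
                    printf("%6s: ...\n", (char *)v);
    }

    int main(void)
    {
            long pos = 0;

            for (void *v = start(&pos); v; v = next(&pos))
                    show(v);
            return 0;
    }
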
+
+static struct softnet_data *softnet_get_online(loff_t *pos)
+{
+ struct softnet_data *sd = NULL;
+
+ while (*pos < nr_cpu_ids)
+ if (cpu_online(*pos)) {
+ sd = &per_cpu(softnet_data, *pos);
+ break;
+ } else
+ ++*pos;
+ return sd;
+}
+
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return softnet_get_online(pos);
+}
+
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+ struct softnet_data *sd = v;
+ unsigned int flow_limit_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl)
+ flow_limit_count = fl->count;
+ rcu_read_unlock();
+#endif
+
+ seq_printf(seq,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ sd->processed, sd->dropped, sd->time_squeeze, 0,
+ 0, 0, 0, 0, /* was fastroute */
+ sd->cpu_collision, sd->received_rps, flow_limit_count);
+ return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_seq_show,
+};
+
+static int dev_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static const struct seq_operations softnet_seq_ops = {
+ .start = softnet_seq_start,
+ .next = softnet_seq_next,
+ .stop = softnet_seq_stop,
+ .show = softnet_seq_show,
+};
+
+static int softnet_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &softnet_seq_ops);
+}
+
+static const struct file_operations softnet_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = softnet_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *ptype_get_idx(loff_t pos)
+{
+ struct packet_type *pt = NULL;
+ loff_t i = 0;
+ int t;
+
+ list_for_each_entry_rcu(pt, &ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ for (t = 0; t < PTYPE_HASH_SIZE; t++) {
+ list_for_each_entry_rcu(pt, &ptype_base[t], list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+ return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct packet_type *pt;
+ struct list_head *nxt;
+ int hash;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ptype_get_idx(0);
+
+ pt = v;
+ nxt = pt->list.next;
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (nxt != &ptype_all)
+ goto found;
+ hash = 0;
+ nxt = ptype_base[0].next;
+ } else
+ hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+
+ while (nxt == &ptype_base[hash]) {
+ if (++hash >= PTYPE_HASH_SIZE)
+ return NULL;
+ nxt = ptype_base[hash].next;
+ }
+found:
+ return list_entry(nxt, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+ struct packet_type *pt = v;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Type Device Function\n");
+ else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
+ if (pt->type == htons(ETH_P_ALL))
+ seq_puts(seq, "ALL ");
+ else
+ seq_printf(seq, "%04x", ntohs(pt->type));
+
+ seq_printf(seq, " %-8s %pf\n",
+ pt->dev ? pt->dev->name : "", pt->func);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+ .start = ptype_seq_start,
+ .next = ptype_seq_next,
+ .stop = ptype_seq_stop,
+ .show = ptype_seq_show,
+};
+
+static int ptype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ptype_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ptype_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ptype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+static int __net_init dev_proc_net_init(struct net *net)
+{
+ int rc = -ENOMEM;
+
+ if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
+ goto out;
+ if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+ &softnet_seq_fops))
+ goto out_dev;
+ if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+ goto out_softnet;
+
+ if (wext_proc_init(net))
+ goto out_ptype;
+ rc = 0;
+out:
+ return rc;
+out_ptype:
+ remove_proc_entry("ptype", net->proc_net);
+out_softnet:
+ remove_proc_entry("softnet_stat", net->proc_net);
+out_dev:
+ remove_proc_entry("dev", net->proc_net);
+ goto out;
+}
+
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+ wext_proc_exit(net);
+
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ remove_proc_entry("dev", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_proc_ops = {
+ .init = dev_proc_net_init,
+ .exit = dev_proc_net_exit,
+};
+
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+ struct netdev_hw_addr *ha;
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ netif_addr_lock_bh(dev);
+ netdev_for_each_mc_addr(ha, dev) {
+ int i;
+
+ seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
+ dev->name, ha->refcount, ha->global_use);
+
+ for (i = 0; i < dev->addr_len; i++)
+ seq_printf(seq, "%02x", ha->addr[i]);
+
+ seq_putc(seq, '\n');
+ }
+ netif_addr_unlock_bh(dev);
+ return 0;
+}
+
+static const struct seq_operations dev_mc_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_mc_seq_show,
+};
+
+static int dev_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_mc_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init dev_mc_net_init(struct net *net)
+{
+ if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+ remove_proc_entry("dev_mcast", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+ .init = dev_mc_net_init,
+ .exit = dev_mc_net_exit,
+};
+
+int __init dev_proc_init(void)
+{
+ int ret = register_pernet_subsys(&dev_proc_ops);
+ if (!ret)
+ return register_pernet_subsys(&dev_mc_net_ops);
+ return ret;
+}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 85e8b5326dd..1cac29ebb05 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,9 +18,10 @@
#include <net/sock.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
-#include <linux/wireless.h>
#include <linux/vmalloc.h>
-#include <net/wext.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
#include "net-sysfs.h"
@@ -28,6 +29,7 @@
static const char fmt_hex[] = "%#x\n";
static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
+static const char fmt_udec[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
@@ -58,35 +60,42 @@ static ssize_t format_##field(const struct net_device *net, char *buf) \
{ \
return sprintf(buf, format_string, net->field); \
} \
-static ssize_t show_##field(struct device *dev, \
+static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
{ \
return netdev_show(dev, attr, buf, format_##field); \
-}
+} \
+
+#define NETDEVICE_SHOW_RO(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RO(field)
+#define NETDEVICE_SHOW_RW(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RW(field)
/* use same locking and permission rules as SIF* ioctl's */
static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len,
int (*set)(struct net_device *, unsigned long))
{
- struct net_device *net = to_net_dev(dev);
- char *endp;
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
unsigned long new;
int ret = -EINVAL;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- new = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
goto err;
if (!rtnl_trylock())
return restart_syscall();
- if (dev_isalive(net)) {
- if ((ret = (*set)(net, new)) == 0)
+ if (dev_isalive(netdev)) {
+ if ((ret = (*set)(netdev, new)) == 0)
ret = len;
}
rtnl_unlock();
@@ -94,17 +103,17 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
return ret;
}
-NETDEVICE_SHOW(dev_id, fmt_hex);
-NETDEVICE_SHOW(addr_assign_type, fmt_dec);
-NETDEVICE_SHOW(addr_len, fmt_dec);
-NETDEVICE_SHOW(iflink, fmt_dec);
-NETDEVICE_SHOW(ifindex, fmt_dec);
-NETDEVICE_SHOW(features, fmt_long_hex);
-NETDEVICE_SHOW(type, fmt_dec);
-NETDEVICE_SHOW(link_mode, fmt_dec);
+NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
+NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
+NETDEVICE_SHOW_RO(addr_len, fmt_dec);
+NETDEVICE_SHOW_RO(iflink, fmt_dec);
+NETDEVICE_SHOW_RO(ifindex, fmt_dec);
+NETDEVICE_SHOW_RO(type, fmt_dec);
+NETDEVICE_SHOW_RO(link_mode, fmt_dec);
/* use same locking rules as GIFHWADDR ioctl's */
-static ssize_t show_address(struct device *dev, struct device_attribute *attr,
+static ssize_t address_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct net_device *net = to_net_dev(dev);
@@ -116,17 +125,32 @@ static ssize_t show_address(struct device *dev, struct device_attribute *attr,
read_unlock(&dev_base_lock);
return ret;
}
+static DEVICE_ATTR_RO(address);
-static ssize_t show_broadcast(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t broadcast_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct net_device *net = to_net_dev(dev);
if (dev_isalive(net))
return sysfs_format_mac(buf, net->broadcast, net->addr_len);
return -EINVAL;
}
+static DEVICE_ATTR_RO(broadcast);
+
+static int change_carrier(struct net_device *net, unsigned long new_carrier)
+{
+ if (!netif_running(net))
+ return -EINVAL;
+ return dev_change_carrier(net, (bool) new_carrier);
+}
-static ssize_t show_carrier(struct device *dev,
+static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_carrier);
+}
+
+static ssize_t carrier_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
@@ -135,8 +159,9 @@ static ssize_t show_carrier(struct device *dev,
}
return -EINVAL;
}
+static DEVICE_ATTR_RW(carrier);
-static ssize_t show_speed(struct device *dev,
+static ssize_t speed_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
@@ -145,19 +170,17 @@ static ssize_t show_speed(struct device *dev,
if (!rtnl_trylock())
return restart_syscall();
- if (netif_running(netdev) &&
- netdev->ethtool_ops &&
- netdev->ethtool_ops->get_settings) {
- struct ethtool_cmd cmd = { ETHTOOL_GSET };
-
- if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
+ if (netif_running(netdev)) {
+ struct ethtool_cmd cmd;
+ if (!__ethtool_get_settings(netdev, &cmd))
+ ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
}
rtnl_unlock();
return ret;
}
+static DEVICE_ATTR_RO(speed);
-static ssize_t show_duplex(struct device *dev,
+static ssize_t duplex_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
@@ -166,19 +189,30 @@ static ssize_t show_duplex(struct device *dev,
if (!rtnl_trylock())
return restart_syscall();
- if (netif_running(netdev) &&
- netdev->ethtool_ops &&
- netdev->ethtool_ops->get_settings) {
- struct ethtool_cmd cmd = { ETHTOOL_GSET };
-
- if (!netdev->ethtool_ops->get_settings(netdev, &cmd))
- ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half");
+ if (netif_running(netdev)) {
+ struct ethtool_cmd cmd;
+ if (!__ethtool_get_settings(netdev, &cmd)) {
+ const char *duplex;
+ switch (cmd.duplex) {
+ case DUPLEX_HALF:
+ duplex = "half";
+ break;
+ case DUPLEX_FULL:
+ duplex = "full";
+ break;
+ default:
+ duplex = "unknown";
+ break;
+ }
+ ret = sprintf(buf, "%s\n", duplex);
+ }
}
rtnl_unlock();
return ret;
}
+static DEVICE_ATTR_RO(duplex);
-static ssize_t show_dormant(struct device *dev,
+static ssize_t dormant_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
@@ -188,6 +222,7 @@ static ssize_t show_dormant(struct device *dev,
return -EINVAL;
}
+static DEVICE_ATTR_RO(dormant);
static const char *const operstates[] = {
"unknown",
@@ -199,7 +234,7 @@ static const char *const operstates[] = {
"up"
};
-static ssize_t show_operstate(struct device *dev,
+static ssize_t operstate_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
const struct net_device *netdev = to_net_dev(dev);
@@ -216,35 +251,43 @@ static ssize_t show_operstate(struct device *dev,
return sprintf(buf, "%s\n", operstates[operstate]);
}
+static DEVICE_ATTR_RO(operstate);
+
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ return sprintf(buf, fmt_dec,
+ atomic_read(&netdev->carrier_changes));
+}
+static DEVICE_ATTR_RO(carrier_changes);
/* read-write attributes */
-NETDEVICE_SHOW(mtu, fmt_dec);
static int change_mtu(struct net_device *net, unsigned long new_mtu)
{
return dev_set_mtu(net, (int) new_mtu);
}
-static ssize_t store_mtu(struct device *dev, struct device_attribute *attr,
+static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
return netdev_store(dev, attr, buf, len, change_mtu);
}
-
-NETDEVICE_SHOW(flags, fmt_hex);
+NETDEVICE_SHOW_RW(mtu, fmt_dec);
static int change_flags(struct net_device *net, unsigned long new_flags)
{
- return dev_change_flags(net, (unsigned) new_flags);
+ return dev_change_flags(net, (unsigned int) new_flags);
}
-static ssize_t store_flags(struct device *dev, struct device_attribute *attr,
+static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
return netdev_store(dev, attr, buf, len, change_flags);
}
-
-NETDEVICE_SHOW(tx_queue_len, fmt_ulong);
+NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
{
@@ -252,21 +295,26 @@ static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
return 0;
}
-static ssize_t store_tx_queue_len(struct device *dev,
+static ssize_t tx_queue_len_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
-static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
+static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
size_t count = len;
ssize_t ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
/* ignore trailing newline */
@@ -281,7 +329,7 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
return ret < 0 ? ret : len;
}
-static ssize_t show_ifalias(struct device *dev,
+static ssize_t ifalias_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
const struct net_device *netdev = to_net_dev(dev);
@@ -294,30 +342,70 @@ static ssize_t show_ifalias(struct device *dev,
rtnl_unlock();
return ret;
}
+static DEVICE_ATTR_RW(ifalias);
-static struct device_attribute net_class_attributes[] = {
- __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
- __ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
- __ATTR(dev_id, S_IRUGO, show_dev_id, NULL),
- __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias),
- __ATTR(iflink, S_IRUGO, show_iflink, NULL),
- __ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
- __ATTR(features, S_IRUGO, show_features, NULL),
- __ATTR(type, S_IRUGO, show_type, NULL),
- __ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
- __ATTR(address, S_IRUGO, show_address, NULL),
- __ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
- __ATTR(carrier, S_IRUGO, show_carrier, NULL),
- __ATTR(speed, S_IRUGO, show_speed, NULL),
- __ATTR(duplex, S_IRUGO, show_duplex, NULL),
- __ATTR(dormant, S_IRUGO, show_dormant, NULL),
- __ATTR(operstate, S_IRUGO, show_operstate, NULL),
- __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
- __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
- __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
- store_tx_queue_len),
- {}
+static int change_group(struct net_device *net, unsigned long new_group)
+{
+ dev_set_group(net, (int) new_group);
+ return 0;
+}
+
+static ssize_t group_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_group);
+}
+NETDEVICE_SHOW(group, fmt_dec);
+static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+
+static ssize_t phys_port_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_port_id ppid;
+
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_id);
+
+static struct attribute *net_class_attrs[] = {
+ &dev_attr_netdev_group.attr,
+ &dev_attr_type.attr,
+ &dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
+ &dev_attr_iflink.attr,
+ &dev_attr_ifindex.attr,
+ &dev_attr_addr_assign_type.attr,
+ &dev_attr_addr_len.attr,
+ &dev_attr_link_mode.attr,
+ &dev_attr_address.attr,
+ &dev_attr_broadcast.attr,
+ &dev_attr_speed.attr,
+ &dev_attr_duplex.attr,
+ &dev_attr_dormant.attr,
+ &dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
+ &dev_attr_ifalias.attr,
+ &dev_attr_carrier.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_flags.attr,
+ &dev_attr_tx_queue_len.attr,
+ &dev_attr_phys_port_id.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(net_class);
/* Show a given an attribute in the statistics group */
static ssize_t netstat_show(const struct device *d,
@@ -343,13 +431,13 @@ static ssize_t netstat_show(const struct device *d,
/* generate a read-only statistics attribute */
#define NETSTAT_ENTRY(name) \
-static ssize_t show_##name(struct device *d, \
+static ssize_t name##_show(struct device *d, \
struct device_attribute *attr, char *buf) \
{ \
return netstat_show(d, attr, buf, \
offsetof(struct rtnl_link_stats64, name)); \
} \
-static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+static DEVICE_ATTR_RO(name)
NETSTAT_ENTRY(rx_packets);
NETSTAT_ENTRY(tx_packets);
@@ -408,63 +496,8 @@ static struct attribute_group netstat_group = {
.attrs = netstat_attrs,
};
-#ifdef CONFIG_WIRELESS_EXT_SYSFS
-/* helper function that does all the locking etc for wireless stats */
-static ssize_t wireless_show(struct device *d, char *buf,
- ssize_t (*format)(const struct iw_statistics *,
- char *))
-{
- struct net_device *dev = to_net_dev(d);
- const struct iw_statistics *iw;
- ssize_t ret = -EINVAL;
-
- if (!rtnl_trylock())
- return restart_syscall();
- if (dev_isalive(dev)) {
- iw = get_wireless_stats(dev);
- if (iw)
- ret = (*format)(iw, buf);
- }
- rtnl_unlock();
-
- return ret;
-}
-
-/* show function template for wireless fields */
-#define WIRELESS_SHOW(name, field, format_string) \
-static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \
-{ \
- return sprintf(buf, format_string, iw->field); \
-} \
-static ssize_t show_iw_##name(struct device *d, \
- struct device_attribute *attr, char *buf) \
-{ \
- return wireless_show(d, buf, format_iw_##name); \
-} \
-static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL)
-
-WIRELESS_SHOW(status, status, fmt_hex);
-WIRELESS_SHOW(link, qual.qual, fmt_dec);
-WIRELESS_SHOW(level, qual.level, fmt_dec);
-WIRELESS_SHOW(noise, qual.noise, fmt_dec);
-WIRELESS_SHOW(nwid, discard.nwid, fmt_dec);
-WIRELESS_SHOW(crypt, discard.code, fmt_dec);
-WIRELESS_SHOW(fragment, discard.fragment, fmt_dec);
-WIRELESS_SHOW(misc, discard.misc, fmt_dec);
-WIRELESS_SHOW(retries, discard.retries, fmt_dec);
-WIRELESS_SHOW(beacon, miss.beacon, fmt_dec);
-
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
static struct attribute *wireless_attrs[] = {
- &dev_attr_status.attr,
- &dev_attr_link.attr,
- &dev_attr_level.attr,
- &dev_attr_noise.attr,
- &dev_attr_nwid.attr,
- &dev_attr_crypt.attr,
- &dev_attr_fragment.attr,
- &dev_attr_retries.attr,
- &dev_attr_misc.attr,
- &dev_attr_beacon.attr,
NULL
};
@@ -473,19 +506,12 @@ static struct attribute_group wireless_group = {
.attrs = wireless_attrs,
};
#endif
+
+#else /* CONFIG_SYSFS */
+#define net_class_groups NULL
#endif /* CONFIG_SYSFS */
-#ifdef CONFIG_RPS
-/*
- * RX queue sysfs structures and functions.
- */
-struct rx_queue_attribute {
- struct attribute attr;
- ssize_t (*show)(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attr, char *buf);
- ssize_t (*store)(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attr, const char *buf, size_t len);
-};
+#ifdef CONFIG_SYSFS
#define to_rx_queue_attr(_attr) container_of(_attr, \
struct rx_queue_attribute, attr)
@@ -520,6 +546,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {
.store = rx_queue_attr_store,
};
+#ifdef CONFIG_RPS
static ssize_t show_rps_map(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attribute, char *buf)
{
@@ -550,13 +577,6 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,
return len;
}
-static void rps_map_release(struct rcu_head *rcu)
-{
- struct rps_map *map = container_of(rcu, struct rps_map, rcu);
-
- kfree(map);
-}
-
static ssize_t store_rps_map(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attribute,
const char *buf, size_t len)
@@ -578,7 +598,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
return err;
}
- map = kzalloc(max_t(unsigned,
+ map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
if (!map) {
@@ -603,9 +623,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
rcu_assign_pointer(queue->rps_map, map);
spin_unlock(&rps_map_lock);
- if (old_map)
- call_rcu(&old_map->rcu, rps_map_release);
-
+ if (map)
+ static_key_slow_inc(&rps_needed);
+ if (old_map) {
+ kfree_rcu(old_map, rcu);
+ static_key_slow_dec(&rps_needed);
+ }
free_cpumask_var(mask);
return len;
}
@@ -615,65 +638,68 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
char *buf)
{
struct rps_dev_flow_table *flow_table;
- unsigned int val = 0;
+ unsigned long val = 0;
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = flow_table->mask + 1;
+ val = (unsigned long)flow_table->mask + 1;
rcu_read_unlock();
- return sprintf(buf, "%u\n", val);
-}
-
-static void rps_dev_flow_table_release_work(struct work_struct *work)
-{
- struct rps_dev_flow_table *table = container_of(work,
- struct rps_dev_flow_table, free_work);
-
- vfree(table);
+ return sprintf(buf, "%lu\n", val);
}
static void rps_dev_flow_table_release(struct rcu_head *rcu)
{
struct rps_dev_flow_table *table = container_of(rcu,
struct rps_dev_flow_table, rcu);
-
- INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
- schedule_work(&table->free_work);
+ vfree(table);
}
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attr,
const char *buf, size_t len)
{
- unsigned int count;
- char *endp;
+ unsigned long mask, count;
struct rps_dev_flow_table *table, *old_table;
static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- count = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
if (count) {
- int i;
-
- if (count > 1<<30) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check
+ * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
/* Enforce a limit to prevent overflow */
return -EINVAL;
}
- count = roundup_pow_of_two(count);
- table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
if (!table)
return -ENOMEM;
- table->mask = count - 1;
- for (i = 0; i < count; i++)
- table->flows[i].cpu = RPS_NO_CPU;
+ table->mask = mask;
+ for (count = 0; count <= mask; count++)
+ table->flows[count].cpu = RPS_NO_CPU;
} else
table = NULL;
@@ -696,40 +722,58 @@ static struct rx_queue_attribute rps_cpus_attribute =
static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
&rps_cpus_attribute.attr,
&rps_dev_flow_table_cnt_attribute.attr,
+#endif
NULL
};
static void rx_queue_release(struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
struct rps_map *map;
struct rps_dev_flow_table *flow_table;
- map = rcu_dereference_raw(queue->rps_map);
+ map = rcu_dereference_protected(queue->rps_map, 1);
if (map) {
RCU_INIT_POINTER(queue->rps_map, NULL);
- call_rcu(&map->rcu, rps_map_release);
+ kfree_rcu(map, rcu);
}
- flow_table = rcu_dereference_raw(queue->rps_flow_table);
+ flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
if (flow_table) {
RCU_INIT_POINTER(queue->rps_flow_table, NULL);
call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
}
+#endif
memset(kobj, 0, sizeof(*kobj));
dev_put(queue->dev);
}
+static const void *rx_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
static struct kobj_type rx_queue_ktype = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
.default_attrs = rx_queue_default_attrs,
+ .namespace = rx_queue_namespace
};
static int rx_queue_add_kobject(struct net_device *net, int index)
@@ -741,25 +785,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
kobj->kset = net->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
- if (error) {
- kobject_put(kobj);
- return error;
+ if (error)
+ goto exit;
+
+ if (net->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+ if (error)
+ goto exit;
}
kobject_uevent(kobj, KOBJ_ADD);
dev_hold(queue->dev);
return error;
+exit:
+ kobject_put(kobj);
+ return error;
}
-#endif /* CONFIG_RPS */
+#endif /* CONFIG_SYSFS */
int
net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
int i;
int error = 0;
+#ifndef CONFIG_RPS
+ if (!net->sysfs_rx_queue_group)
+ return 0;
+#endif
for (i = old_num; i < new_num; i++) {
error = rx_queue_add_kobject(net, i);
if (error) {
@@ -768,8 +823,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
}
- while (--i >= new_num)
+ while (--i >= new_num) {
+ if (net->sysfs_rx_queue_group)
+ sysfs_remove_group(&net->_rx[i].kobj,
+ net->sysfs_rx_queue_group);
kobject_put(&net->_rx[i].kobj);
+ }
return error;
#else
@@ -777,7 +836,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
#endif
}
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
*/
@@ -823,15 +882,139 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
-static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
{
- struct net_device *dev = queue->dev;
- int i;
+ unsigned long trans_timeout;
- for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
- break;
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, "%lu", trans_timeout);
+}
+
+static struct netdev_queue_attribute queue_trans_timeout =
+ __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
+ value = DQL_MAX_LIMIT;
+ else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
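
bql_set() above accepts either a decimal byte count or the literal string "max" for the byte-queue-limit files. A userspace sketch of that parsing rule; the maximum below is an assumed placeholder for DQL_MAX_LIMIT:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define ASSUMED_DQL_MAX_LIMIT 0x7fffffffU

    static int parse_bql(const char *buf, unsigned int *pvalue)
    {
            char *end;
            unsigned long v;

            if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) {
                    *pvalue = ASSUMED_DQL_MAX_LIMIT;
                    return 0;
            }
            v = strtoul(buf, &end, 10);
            if (end == buf || v > ASSUMED_DQL_MAX_LIMIT)
                    return -1;
            *pvalue = (unsigned int)v;
            return 0;
    }

    int main(void)
    {
            unsigned int limit;

            if (!parse_bql("max\n", &limit))
                    printf("limit=%u\n", limit);
            return 0;
    }
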
+
+static ssize_t bql_show_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute =
+ __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
+ bql_set_hold_time);
+
+static ssize_t bql_show_inflight(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute =
+ __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ char *buf) \
+{ \
+ return bql_show(buf, queue->dql.FIELD); \
+} \
+ \
+static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return bql_set(buf, len, &queue->dql.FIELD); \
+} \
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
+ __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
+ bql_set_ ## NAME);
+
+BQL_ATTR(limit, limit)
+BQL_ATTR(limit_max, max_limit)
+BQL_ATTR(limit_min, min_limit)
+
+static struct attribute *dql_attrs[] = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ NULL
+};
+
+static struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
+};
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int i;
+
+ i = queue - dev->_tx;
BUG_ON(i >= dev->num_tx_queues);
return i;
@@ -883,37 +1066,14 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
return len;
}
-static void xps_map_release(struct rcu_head *rcu)
-{
- struct xps_map *map = container_of(rcu, struct xps_map, rcu);
-
- kfree(map);
-}
-
-static void xps_dev_maps_release(struct rcu_head *rcu)
-{
- struct xps_dev_maps *dev_maps =
- container_of(rcu, struct xps_dev_maps, rcu);
-
- kfree(dev_maps);
-}
-
-static DEFINE_MUTEX(xps_map_mutex);
-#define xmap_dereference(P) \
- rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
-
static ssize_t store_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
const char *buf, size_t len)
{
struct net_device *dev = queue->dev;
- cpumask_var_t mask;
- int err, i, cpu, pos, map_len, alloc_len, need_set;
unsigned long index;
- struct xps_map *map, *new_map;
- struct xps_dev_maps *dev_maps, *new_dev_maps;
- int nonempty = 0;
- int numa_node = -2;
+ cpumask_var_t mask;
+ int err;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -929,169 +1089,50 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
return err;
}
- new_dev_maps = kzalloc(max_t(unsigned,
- XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL);
- if (!new_dev_maps) {
- free_cpumask_var(mask);
- return -ENOMEM;
- }
-
- mutex_lock(&xps_map_mutex);
-
- dev_maps = xmap_dereference(dev->xps_maps);
-
- for_each_possible_cpu(cpu) {
- map = dev_maps ?
- xmap_dereference(dev_maps->cpu_map[cpu]) : NULL;
- new_map = map;
- if (map) {
- for (pos = 0; pos < map->len; pos++)
- if (map->queues[pos] == index)
- break;
- map_len = map->len;
- alloc_len = map->alloc_len;
- } else
- pos = map_len = alloc_len = 0;
-
- need_set = cpu_isset(cpu, *mask) && cpu_online(cpu);
-#ifdef CONFIG_NUMA
- if (need_set) {
- if (numa_node == -2)
- numa_node = cpu_to_node(cpu);
- else if (numa_node != cpu_to_node(cpu))
- numa_node = -1;
- }
-#endif
- if (need_set && pos >= map_len) {
- /* Need to add queue to this CPU's map */
- if (map_len >= alloc_len) {
- alloc_len = alloc_len ?
- 2 * alloc_len : XPS_MIN_MAP_ALLOC;
- new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len),
- GFP_KERNEL,
- cpu_to_node(cpu));
- if (!new_map)
- goto error;
- new_map->alloc_len = alloc_len;
- for (i = 0; i < map_len; i++)
- new_map->queues[i] = map->queues[i];
- new_map->len = map_len;
- }
- new_map->queues[new_map->len++] = index;
- } else if (!need_set && pos < map_len) {
- /* Need to remove queue from this CPU's map */
- if (map_len > 1)
- new_map->queues[pos] =
- new_map->queues[--new_map->len];
- else
- new_map = NULL;
- }
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map);
- }
-
- /* Cleanup old maps */
- for_each_possible_cpu(cpu) {
- map = dev_maps ?
- xmap_dereference(dev_maps->cpu_map[cpu]) : NULL;
- if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map)
- call_rcu(&map->rcu, xps_map_release);
- if (new_dev_maps->cpu_map[cpu])
- nonempty = 1;
- }
-
- if (nonempty)
- rcu_assign_pointer(dev->xps_maps, new_dev_maps);
- else {
- kfree(new_dev_maps);
- rcu_assign_pointer(dev->xps_maps, NULL);
- }
-
- if (dev_maps)
- call_rcu(&dev_maps->rcu, xps_dev_maps_release);
-
- netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : -1);
-
- mutex_unlock(&xps_map_mutex);
+ err = netif_set_xps_queue(dev, mask, index);
free_cpumask_var(mask);
- return len;
-error:
- mutex_unlock(&xps_map_mutex);
-
- if (new_dev_maps)
- for_each_possible_cpu(i)
- kfree(rcu_dereference_protected(
- new_dev_maps->cpu_map[i],
- 1));
- kfree(new_dev_maps);
- free_cpumask_var(mask);
- return -ENOMEM;
+ return err ? : len;
}
static struct netdev_queue_attribute xps_cpus_attribute =
__ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] = {
+ &queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+#endif
NULL
};
static void netdev_queue_release(struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
- struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- unsigned long index;
- int i, pos, nonempty = 0;
-
- index = get_netdev_queue_index(queue);
- mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
-
- if (dev_maps) {
- for_each_possible_cpu(i) {
- map = xmap_dereference(dev_maps->cpu_map[i]);
- if (!map)
- continue;
-
- for (pos = 0; pos < map->len; pos++)
- if (map->queues[pos] == index)
- break;
-
- if (pos < map->len) {
- if (map->len > 1)
- map->queues[pos] =
- map->queues[--map->len];
- else {
- RCU_INIT_POINTER(dev_maps->cpu_map[i],
- NULL);
- call_rcu(&map->rcu, xps_map_release);
- map = NULL;
- }
- }
- if (map)
- nonempty = 1;
- }
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
- if (!nonempty) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
- call_rcu(&dev_maps->rcu, xps_dev_maps_release);
- }
- }
+static const void *netdev_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
- mutex_unlock(&xps_map_mutex);
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
- memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ return ns;
}
static struct kobj_type netdev_queue_ktype = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
.default_attrs = netdev_queue_default_attrs,
+ .namespace = netdev_queue_namespace,
};
static int netdev_queue_add_kobject(struct net_device *net, int index)
@@ -1103,22 +1144,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)
kobj->kset = net->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto exit;
+
+#ifdef CONFIG_BQL
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto exit;
+#endif
kobject_uevent(kobj, KOBJ_ADD);
dev_hold(queue->dev);
+ return 0;
+exit:
+ kobject_put(kobj);
return error;
}
-#endif /* CONFIG_XPS */
+#endif /* CONFIG_SYSFS */
int
netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
int i;
int error = 0;
@@ -1130,27 +1178,30 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
}
- while (--i >= new_num)
- kobject_put(&net->_tx[i].kobj);
+ while (--i >= new_num) {
+ struct netdev_queue *queue = net->_tx + i;
+
+#ifdef CONFIG_BQL
+ sysfs_remove_group(&queue->kobj, &dql_group);
+#endif
+ kobject_put(&queue->kobj);
+ }
return error;
#else
return 0;
-#endif
+#endif /* CONFIG_SYSFS */
}
static int register_queue_kobjects(struct net_device *net)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
net->queues_kset = kset_create_and_add("queues",
NULL, &net->dev.kobj);
if (!net->queues_kset)
return -ENOMEM;
-#endif
-
-#ifdef CONFIG_RPS
real_rx = net->real_num_rx_queues;
#endif
real_tx = net->real_num_tx_queues;
@@ -1177,21 +1228,33 @@ static void remove_queue_kobjects(struct net_device *net)
{
int real_rx = 0, real_tx = 0;
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
real_rx = net->real_num_rx_queues;
#endif
real_tx = net->real_num_tx_queues;
net_rx_queue_update_kobjects(net, real_rx, 0);
netdev_queue_update_kobjects(net, real_tx, 0);
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
kset_unregister(net->queues_kset);
#endif
}
-static const void *net_current_ns(void)
+static bool net_current_may_mount(void)
+{
+ struct net *net = current->nsproxy->net_ns;
+
+ return ns_capable(net->user_ns, CAP_SYS_ADMIN);
+}
+
+static void *net_grab_current_ns(void)
{
- return current->nsproxy->net_ns;
+ struct net *ns = current->nsproxy->net_ns;
+#ifdef CONFIG_NET_NS
+ if (ns)
+ atomic_inc(&ns->passive);
+#endif
+ return ns;
}
static const void *net_initial_ns(void)
@@ -1206,23 +1269,14 @@ static const void *net_netlink_ns(struct sock *sk)
struct kobj_ns_type_operations net_ns_type_operations = {
.type = KOBJ_NS_TYPE_NET,
- .current_ns = net_current_ns,
+ .current_may_mount = net_current_may_mount,
+ .grab_current_ns = net_grab_current_ns,
.netlink_ns = net_netlink_ns,
.initial_ns = net_initial_ns,
+ .drop_ns = net_drop_ns,
};
EXPORT_SYMBOL_GPL(net_ns_type_operations);
-static void net_kobj_ns_exit(struct net *net)
-{
- kobj_ns_exit(KOBJ_NS_TYPE_NET, net);
-}
-
-static struct pernet_operations kobj_net_ops = {
- .exit = net_kobj_ns_exit,
-};
-
-
-#ifdef CONFIG_HOTPLUG
static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
{
struct net_device *dev = to_net_dev(d);
@@ -1241,7 +1295,6 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
exit:
return retval;
}
-#endif
/*
* netdev_release -- destroy and free a dead device.
@@ -1254,7 +1307,7 @@ static void netdev_release(struct device *d)
BUG_ON(dev->reg_state != NETREG_RELEASED);
kfree(dev->ifalias);
- kfree((char *)dev - dev->padded);
+ netdev_freemem(dev);
}
static const void *net_namespace(struct device *d)
@@ -1267,12 +1320,8 @@ static const void *net_namespace(struct device *d)
static struct class net_class = {
.name = "net",
.dev_release = netdev_release,
-#ifdef CONFIG_SYSFS
- .dev_attrs = net_class_attributes,
-#endif /* CONFIG_SYSFS */
-#ifdef CONFIG_HOTPLUG
+ .dev_groups = net_class_groups,
.dev_uevent = netdev_uevent,
-#endif
.ns_type = &net_ns_type_operations,
.namespace = net_namespace,
};
@@ -1288,6 +1337,8 @@ void netdev_unregister_kobject(struct net_device * net)
remove_queue_kobjects(net);
+ pm_runtime_set_memalloc_noio(dev, false);
+
device_del(dev);
}
@@ -1311,10 +1362,11 @@ int netdev_register_kobject(struct net_device *net)
groups++;
*groups++ = &netstat_group;
-#ifdef CONFIG_WIRELESS_EXT_SYSFS
+
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
if (net->ieee80211_ptr)
*groups++ = &wireless_group;
-#ifdef CONFIG_WIRELESS_EXT
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
else if (net->wireless_handlers)
*groups++ = &wireless_group;
#endif
@@ -1331,24 +1383,27 @@ int netdev_register_kobject(struct net_device *net)
return error;
}
+ pm_runtime_set_memalloc_noio(dev, true);
+
return error;
}
-int netdev_class_create_file(struct class_attribute *class_attr)
+int netdev_class_create_file_ns(struct class_attribute *class_attr,
+ const void *ns)
{
- return class_create_file(&net_class, class_attr);
+ return class_create_file_ns(&net_class, class_attr, ns);
}
-EXPORT_SYMBOL(netdev_class_create_file);
+EXPORT_SYMBOL(netdev_class_create_file_ns);
-void netdev_class_remove_file(struct class_attribute *class_attr)
+void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+ const void *ns)
{
- class_remove_file(&net_class, class_attr);
+ class_remove_file_ns(&net_class, class_attr, ns);
}
-EXPORT_SYMBOL(netdev_class_remove_file);
+EXPORT_SYMBOL(netdev_class_remove_file_ns);
-int netdev_kobject_init(void)
+int __init netdev_kobject_init(void)
{
kobj_ns_type_register(&net_ns_type_operations);
- register_pernet_subsys(&kobj_net_ops);
return class_register(&net_class);
}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index bd7751ec1c4..2745a1b51e0 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -1,7 +1,7 @@
#ifndef __NET_SYSFS_H__
#define __NET_SYSFS_H__
-int netdev_kobject_init(void);
+int __init netdev_kobject_init(void);
int netdev_register_kobject(struct net_device *);
void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 7f1bb2aba03..ba3c0120786 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -11,6 +11,7 @@
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
+#include <linux/export.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
@@ -28,6 +29,8 @@
#include <trace/events/skb.h>
#include <trace/events/net.h>
#include <trace/events/napi.h>
+#include <trace/events/sock.h>
+#include <trace/events/udp.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3f860261c5e..85b62691f4f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
@@ -8,6 +10,11 @@
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/file.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
@@ -17,22 +24,30 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
-static DEFINE_MUTEX(net_mutex);
+DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
-struct net init_net;
+struct net init_net = {
+ .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+};
EXPORT_SYMBOL(init_net);
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
-static void net_generic_release(struct rcu_head *rcu)
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+
+static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
+ size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
- ng = container_of(rcu, struct net_generic, rcu);
- kfree(ng);
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->len = max_gen_ptrs;
+
+ return ng;
}
static int net_assign_generic(struct net *net, int id, void *data)
@@ -48,8 +63,7 @@ static int net_assign_generic(struct net *net, int id, void *data)
if (old_ng->len >= id)
goto assign;
- ng = kzalloc(sizeof(struct net_generic) +
- id * sizeof(void *), GFP_KERNEL);
+ ng = net_alloc_generic();
if (ng == NULL)
return -ENOMEM;
@@ -64,11 +78,10 @@ static int net_assign_generic(struct net *net, int id, void *data)
* the old copy for kfree after a grace period.
*/
- ng->len = id;
memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
rcu_assign_pointer(net->gen, ng);
- call_rcu(&old_ng->rcu, net_generic_release);
+ kfree_rcu(old_ng, rcu);
assign:
ng->ptr[id - 1] = data;
return 0;
@@ -76,21 +89,29 @@ assign:
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
- int err;
+ int err = -ENOMEM;
+ void *data = NULL;
+
if (ops->id && ops->size) {
- void *data = kzalloc(ops->size, GFP_KERNEL);
+ data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
- return -ENOMEM;
+ goto out;
err = net_assign_generic(net, *ops->id, data);
- if (err) {
- kfree(data);
- return err;
- }
+ if (err)
+ goto cleanup;
}
+ err = 0;
if (ops->init)
- return ops->init(net);
- return 0;
+ err = ops->init(net);
+ if (!err)
+ return 0;
+
+cleanup:
+ kfree(data);
+
+out:
+ return err;
}
static void ops_free(const struct pernet_operations *ops, struct net *net)
@@ -126,7 +147,7 @@ static void ops_free_list(const struct pernet_operations *ops,
/*
* setup_net runs the initializers for the network namespace object.
*/
-static __net_init int setup_net(struct net *net)
+static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
/* Must be called with net_mutex held */
const struct pernet_operations *ops, *saved_ops;
@@ -134,6 +155,9 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
atomic_set(&net->count, 1);
+ atomic_set(&net->passive, 1);
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&net->use_count, 0);
@@ -164,18 +188,6 @@ out_undo:
goto out;
}
-static struct net_generic *net_alloc_generic(void)
-{
- struct net_generic *ng;
- size_t generic_size = sizeof(struct net_generic) +
- INITIAL_NET_GEN_PTRS * sizeof(void *);
-
- ng = kzalloc(generic_size, GFP_KERNEL);
- if (ng)
- ng->len = INITIAL_NET_GEN_PTRS;
-
- return ng;
-}
#ifdef CONFIG_NET_NS
static struct kmem_cache *net_cachep;
@@ -207,8 +219,8 @@ static void net_free(struct net *net)
{
#ifdef NETNS_REFCNT_DEBUG
if (unlikely(atomic_read(&net->use_count) != 0)) {
- printk(KERN_EMERG "network namespace not free! Usage: %d\n",
- atomic_read(&net->use_count));
+ pr_emerg("network namespace not free! Usage: %d\n",
+ atomic_read(&net->use_count));
return;
}
#endif
@@ -216,16 +228,30 @@ static void net_free(struct net *net)
kmem_cache_free(net_cachep, net);
}
-static struct net *net_create(void)
+void net_drop_ns(void *p)
+{
+ struct net *ns = p;
+ if (ns && atomic_dec_and_test(&ns->passive))
+ net_free(ns);
+}
+
+struct net *copy_net_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct net *old_net)
{
struct net *net;
int rv;
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+
net = net_alloc();
if (!net)
return ERR_PTR(-ENOMEM);
+
+ get_user_ns(user_ns);
+
mutex_lock(&net_mutex);
- rv = setup_net(net);
+ rv = setup_net(net, user_ns);
if (rv == 0) {
rtnl_lock();
list_add_tail_rcu(&net->list, &net_namespace_list);
@@ -233,19 +259,13 @@ static struct net *net_create(void)
}
mutex_unlock(&net_mutex);
if (rv < 0) {
- net_free(net);
+ put_user_ns(user_ns);
+ net_drop_ns(net);
return ERR_PTR(rv);
}
return net;
}
-struct net *copy_net_ns(unsigned long flags, struct net *old_net)
-{
- if (!(flags & CLONE_NEWNET))
- return get_net(old_net);
- return net_create();
-}
-
static DEFINE_SPINLOCK(cleanup_list_lock);
static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
@@ -253,7 +273,7 @@ static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp;
- LIST_HEAD(net_kill_list);
+ struct list_head net_kill_list;
LIST_HEAD(net_exit_list);
/* Atomically snapshot the list of namespaces to cleanup */
@@ -296,7 +316,8 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
- net_free(net);
+ put_user_ns(net->user_ns);
+ net_drop_ns(net);
}
}
static DECLARE_WORK(net_cleanup_work, cleanup_net);
@@ -314,12 +335,30 @@ void __put_net(struct net *net)
}
EXPORT_SYMBOL_GPL(__put_net);
+struct net *get_net_ns_by_fd(int fd)
+{
+ struct proc_ns *ei;
+ struct file *file;
+ struct net *net;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ ei = get_proc_ns(file_inode(file));
+ if (ei->ns_ops == &netns_operations)
+ net = get_net(ei->ns);
+ else
+ net = ERR_PTR(-EINVAL);
+
+ fput(file);
+ return net;
+}
+
#else
-struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+struct net *get_net_ns_by_fd(int fd)
{
- if (flags & CLONE_NEWNET)
- return ERR_PTR(-EINVAL);
- return old_net;
+ return ERR_PTR(-EINVAL);
}
#endif
@@ -343,6 +382,21 @@ struct net *get_net_ns_by_pid(pid_t pid)
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
+static __net_init int net_ns_net_init(struct net *net)
+{
+ return proc_alloc_inum(&net->proc_inum);
+}
+
+static __net_exit void net_ns_net_exit(struct net *net)
+{
+ proc_free_inum(net->proc_inum);
+}
+
+static struct pernet_operations __net_initdata net_ns_ops = {
+ .init = net_ns_net_init,
+ .exit = net_ns_net_exit,
+};
+
static int __init net_ns_init(void)
{
struct net_generic *ng;
@@ -365,7 +419,7 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
mutex_lock(&net_mutex);
- if (setup_net(&init_net))
+ if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
rtnl_lock();
@@ -374,6 +428,8 @@ static int __init net_ns_init(void)
mutex_unlock(&net_mutex);
+ register_pernet_subsys(&net_ns_ops);
+
return 0;
}
@@ -423,12 +479,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
- int err = 0;
- err = ops_init(ops, &init_net);
- if (err)
- ops_free(ops, &init_net);
- return err;
-
+ return ops_init(ops, &init_net);
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
@@ -458,6 +509,7 @@ again:
}
return error;
}
+ max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
}
error = __register_pernet_operations(list, ops);
if (error) {
@@ -573,3 +625,52 @@ void unregister_pernet_device(struct pernet_operations *ops)
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+static void *netns_get(struct task_struct *task)
+{
+ struct net *net = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ rcu_read_unlock();
+
+ return net;
+}
+
+static void netns_put(void *ns)
+{
+ put_net(ns);
+}
+
+static int netns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct net *net = ns;
+
+ if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ put_net(nsproxy->net_ns);
+ nsproxy->net_ns = get_net(net);
+ return 0;
+}
+
+static unsigned int netns_inum(void *ns)
+{
+ struct net *net = ns;
+ return net->proc_inum;
+}
+
+const struct proc_ns_operations netns_operations = {
+ .name = "net",
+ .type = CLONE_NEWNET,
+ .get = netns_get,
+ .put = netns_put,
+ .install = netns_install,
+ .inum = netns_inum,
+};
+#endif
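
Editor's note (illustration only, not part of the patch): the net_namespace.c changes above add get_net_ns_by_fd() and the netns proc_ns_operations (netns_get/netns_put/netns_install/netns_inum), which are what service a setns(2) call on a /proc/<pid>/ns/net descriptor. A minimal userspace sketch of the counterpart call follows; join_netns_of() and its error handling are assumptions for illustration.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* Join another task's network namespace. The caller needs
	 * CAP_SYS_ADMIN in both the target namespace's user namespace and
	 * its own, matching the checks in netns_install() above.
	 */
	int join_netns_of(pid_t pid)
	{
		char path[64];
		int fd, ret;

		snprintf(path, sizeof(path), "/proc/%d/ns/net", (int)pid);
		fd = open(path, O_RDONLY);
		if (fd < 0)
			return -1;
		ret = setns(fd, CLONE_NEWNET);	/* ends up in netns_install() */
		close(fd);
		return ret;
	}
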
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
new file mode 100644
index 00000000000..30d903b19c6
--- /dev/null
+++ b/net/core/netclassid_cgroup.c
@@ -0,0 +1,111 @@
+/*
+ * net/core/netclassid_cgroup.c Classid Cgroupfs Handling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fdtable.h>
+#include <net/cls_cgroup.h>
+#include <net/sock.h>
+
+static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
+}
+
+struct cgroup_cls_state *task_cls_state(struct task_struct *p)
+{
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
+}
+EXPORT_SYMBOL_GPL(task_cls_state);
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_cls_state *cs;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ return &cs->css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
+
+ if (parent)
+ cs->classid = parent->classid;
+
+ return 0;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_cls_state(css));
+}
+
+static int update_classid(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+
+ if (sock)
+ sock->sk->sk_classid = (u32)(unsigned long)v;
+
+ return 0;
+}
+
+static void cgrp_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ void *v = (void *)(unsigned long)cs->classid;
+ struct task_struct *p;
+
+ cgroup_taskset_for_each(p, tset) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_classid, v);
+ task_unlock(p);
+ }
+}
+
+static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css_cls_state(css)->classid;
+}
+
+static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 value)
+{
+ css_cls_state(css)->classid = (u32) value;
+
+ return 0;
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "classid",
+ .read_u64 = read_classid,
+ .write_u64 = write_classid,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_cls_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = cgrp_attach,
+ .base_cftypes = ss_files,
+};
diff --git a/net/core/netevent.c b/net/core/netevent.c
index 865f0ceb81f..f17ccd291d3 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -15,6 +15,7 @@
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
+#include <linux/export.h>
#include <net/netevent.h>
static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ee38acb6d46..e33937fb32a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -9,7 +9,10 @@
* Copyright (C) 2002 Red Hat, Inc.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/moduleparam.h>
+#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
@@ -23,8 +26,13 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/if_vlan.h>
#include <net/tcp.h>
#include <net/udp.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#include <net/ip6_checksum.h>
#include <asm/unaligned.h>
#include <trace/events/napi.h>
@@ -35,26 +43,63 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
-#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
static struct sk_buff_head skb_pool;
-static atomic_t trapped;
+DEFINE_STATIC_SRCU(netpoll_srcu);
#define USEC_PER_POLL 50
-#define NETPOLL_RX_ENABLED 1
-#define NETPOLL_RX_DROP 2
-#define MAX_SKB_SIZE \
- (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
- sizeof(struct iphdr) + sizeof(struct ethhdr))
+#define MAX_SKB_SIZE \
+ (sizeof(struct ethhdr) + \
+ sizeof(struct iphdr) + \
+ sizeof(struct udphdr) + \
+ MAX_UDP_CHUNK)
static void zap_completion_queue(void);
-static void arp_reply(struct sk_buff *skb);
+static void netpoll_async_cleanup(struct work_struct *work);
static unsigned int carrier_timeout = 4;
module_param(carrier_timeout, uint, 0644);
+#define np_info(np, fmt, ...) \
+ pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_err(np, fmt, ...) \
+ pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_notice(np, fmt, ...) \
+ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
+
+static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ skb->vlan_tci = 0;
+ }
+
+ status = ops->ndo_start_xmit(skb, dev);
+ if (status == NETDEV_TX_OK)
+ txq_trans_update(txq);
+
+out:
+ return status;
+}
+
static void queue_process(struct work_struct *work)
{
struct netpoll_info *npinfo =
@@ -64,51 +109,31 @@ static void queue_process(struct work_struct *work)
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
- const struct net_device_ops *ops = dev->netdev_ops;
struct netdev_queue *txq;
if (!netif_device_present(dev) || !netif_running(dev)) {
- __kfree_skb(skb);
+ kfree_skb(skb);
continue;
}
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
local_irq_save(flags);
- __netif_tx_lock(txq, smp_processor_id());
- if (netif_tx_queue_frozen_or_stopped(txq) ||
- ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (netif_xmit_frozen_or_stopped(txq) ||
+ netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
- __netif_tx_unlock(txq);
+ HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
schedule_delayed_work(&npinfo->tx_work, HZ/10);
return;
}
- __netif_tx_unlock(txq);
+ HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
}
}
-static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
- unsigned short ulen, __be32 saddr, __be32 daddr)
-{
- __wsum psum;
-
- if (uh->check == 0 || skb_csum_unnecessary(skb))
- return 0;
-
- psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
-
- if (skb->ip_summed == CHECKSUM_COMPLETE &&
- !csum_fold(csum_add(psum, skb->csum)))
- return 0;
-
- skb->csum = psum;
-
- return __skb_checksum_complete(skb);
-}
-
/*
* Check whether delayed processing was scheduled for our NIC. If so,
* we attempt to grab the poll lock and use ->poll() to pump the card.
@@ -119,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
* trylock here and interrupts are already disabled in the softirq
* case. Further, we test the poll_owner to avoid recursion on UP
* systems where the lock doesn't exist.
- *
- * In cases where there is bi-directional communications, reading only
- * one message at a time can lead to packets being dropped by the
- * network adapter, forcing superfluous retries and possibly timeouts.
- * Thus, we set our budget to greater than 1.
*/
-static int poll_one_napi(struct netpoll_info *npinfo,
- struct napi_struct *napi, int budget)
+static int poll_one_napi(struct napi_struct *napi, int budget)
{
int work;
@@ -137,74 +156,87 @@ static int poll_one_napi(struct netpoll_info *npinfo,
if (!test_bit(NAPI_STATE_SCHED, &napi->state))
return budget;
- npinfo->rx_flags |= NETPOLL_RX_DROP;
- atomic_inc(&trapped);
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
- atomic_dec(&trapped);
- npinfo->rx_flags &= ~NETPOLL_RX_DROP;
return budget - work;
}
-static void poll_napi(struct net_device *dev)
+static void poll_napi(struct net_device *dev, int budget)
{
struct napi_struct *napi;
- int budget = 16;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner != smp_processor_id() &&
spin_trylock(&napi->poll_lock)) {
- budget = poll_one_napi(dev->npinfo, napi, budget);
+ budget = poll_one_napi(napi, budget);
spin_unlock(&napi->poll_lock);
-
- if (!budget)
- break;
}
}
}
-static void service_arp_queue(struct netpoll_info *npi)
-{
- if (npi) {
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&npi->arp_tx)))
- arp_reply(skb);
- }
-}
-
-void netpoll_poll_dev(struct net_device *dev)
+static void netpoll_poll_dev(struct net_device *dev)
{
const struct net_device_ops *ops;
+ struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ int budget = 0;
- if (!dev || !netif_running(dev))
+ /* Don't do any rx activity if the dev_lock mutex is held
+ * the dev_open/close paths use this to block netpoll activity
+ * while changing device state
+ */
+ if (down_trylock(&ni->dev_lock))
return;
+ if (!netif_running(dev)) {
+ up(&ni->dev_lock);
+ return;
+ }
+
ops = dev->netdev_ops;
- if (!ops->ndo_poll_controller)
+ if (!ops->ndo_poll_controller) {
+ up(&ni->dev_lock);
return;
+ }
/* Process pending work on NIC */
ops->ndo_poll_controller(dev);
- poll_napi(dev);
+ poll_napi(dev, budget);
- service_arp_queue(dev->npinfo);
+ up(&ni->dev_lock);
zap_completion_queue();
}
-EXPORT_SYMBOL(netpoll_poll_dev);
-void netpoll_poll(struct netpoll *np)
+void netpoll_poll_disable(struct net_device *dev)
{
- netpoll_poll_dev(np->dev);
+ struct netpoll_info *ni;
+ int idx;
+ might_sleep();
+ idx = srcu_read_lock(&netpoll_srcu);
+ ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ if (ni)
+ down(&ni->dev_lock);
+ srcu_read_unlock(&netpoll_srcu, idx);
}
-EXPORT_SYMBOL(netpoll_poll);
+EXPORT_SYMBOL(netpoll_poll_disable);
+
+void netpoll_poll_enable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ rcu_read_lock();
+ ni = rcu_dereference(dev->npinfo);
+ if (ni)
+ up(&ni->dev_lock);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netpoll_poll_enable);
static void refill_skbs(void)
{
@@ -238,7 +270,7 @@ static void zap_completion_queue(void)
while (clist != NULL) {
struct sk_buff *skb = clist;
clist = clist->next;
- if (skb->destructor) {
+ if (!skb_irq_freeable(skb)) {
atomic_inc(&skb->users);
dev_kfree_skb_any(skb); /* put this one back */
} else {
@@ -265,7 +297,7 @@ repeat:
if (!skb) {
if (++count < 10) {
- netpoll_poll(np);
+ netpoll_poll_dev(np->dev);
goto repeat;
}
return NULL;
@@ -287,40 +319,37 @@ static int netpoll_owner_active(struct net_device *dev)
return 0;
}
+/* call with IRQ disabled */
void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
struct net_device *dev)
{
int status = NETDEV_TX_BUSY;
unsigned long tries;
- const struct net_device_ops *ops = dev->netdev_ops;
/* It is up to the caller to keep npinfo alive. */
- struct netpoll_info *npinfo = np->dev->npinfo;
+ struct netpoll_info *npinfo;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ npinfo = rcu_dereference_bh(np->dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
- __kfree_skb(skb);
+ dev_kfree_skb_irq(skb);
return;
}
/* don't get messages out of order, and no recursion */
if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
struct netdev_queue *txq;
- unsigned long flags;
- txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+ txq = netdev_pick_tx(dev, skb, NULL);
- local_irq_save(flags);
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
- if (__netif_tx_trylock(txq)) {
- if (!netif_tx_queue_stopped(txq)) {
- dev->priv_flags |= IFF_IN_NETPOLL;
- status = ops->ndo_start_xmit(skb, dev);
- dev->priv_flags &= ~IFF_IN_NETPOLL;
- if (status == NETDEV_TX_OK)
- txq_trans_update(txq);
- }
- __netif_tx_unlock(txq);
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
+
+ HARD_TX_UNLOCK(dev, txq);
if (status == NETDEV_TX_OK)
break;
@@ -328,16 +357,15 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
}
/* tickle device maybe there is some cleanup */
- netpoll_poll(np);
+ netpoll_poll_dev(np->dev);
udelay(USEC_PER_POLL);
}
WARN_ONCE(!irqs_disabled(),
- "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n",
- dev->name, ops->ndo_start_xmit);
+ "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
+ dev->name, dev->netdev_ops->ndo_start_xmit);
- local_irq_restore(flags);
}
if (status != NETDEV_TX_OK) {
@@ -349,22 +377,29 @@ EXPORT_SYMBOL(netpoll_send_skb_on_dev);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
- int total_len, eth_len, ip_len, udp_len;
+ int total_len, ip_len, udp_len;
struct sk_buff *skb;
struct udphdr *udph;
struct iphdr *iph;
struct ethhdr *eth;
+ static atomic_t ip_ident;
+ struct ipv6hdr *ip6h;
udp_len = len + sizeof(*udph);
- ip_len = eth_len = udp_len + sizeof(*iph);
- total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
+ if (np->ipv6)
+ ip_len = udp_len + sizeof(*ip6h);
+ else
+ ip_len = udp_len + sizeof(*iph);
+
+ total_len = ip_len + LL_RESERVED_SPACE(np->dev);
- skb = find_skb(np, total_len, total_len - len);
+ skb = find_skb(np, total_len + np->dev->needed_tailroom,
+ total_len - len);
if (!skb)
return;
skb_copy_to_linear_data(skb, msg, len);
- skb->len += len;
+ skb_put(skb, len);
skb_push(skb, sizeof(*udph));
skb_reset_transport_header(skb);
@@ -372,36 +407,68 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
udph->source = htons(np->local_port);
udph->dest = htons(np->remote_port);
udph->len = htons(udp_len);
- udph->check = 0;
- udph->check = csum_tcpudp_magic(np->local_ip,
- np->remote_ip,
- udp_len, IPPROTO_UDP,
- csum_partial(udph, udp_len, 0));
- if (udph->check == 0)
- udph->check = CSUM_MANGLED_0;
-
- skb_push(skb, sizeof(*iph));
- skb_reset_network_header(skb);
- iph = ip_hdr(skb);
-
- /* iph->version = 4; iph->ihl = 5; */
- put_unaligned(0x45, (unsigned char *)iph);
- iph->tos = 0;
- put_unaligned(htons(ip_len), &(iph->tot_len));
- iph->id = 0;
- iph->frag_off = 0;
- iph->ttl = 64;
- iph->protocol = IPPROTO_UDP;
- iph->check = 0;
- put_unaligned(np->local_ip, &(iph->saddr));
- put_unaligned(np->remote_ip, &(iph->daddr));
- iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-
- eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb->protocol = eth->h_proto = htons(ETH_P_IP);
- memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN);
- memcpy(eth->h_dest, np->remote_mac, ETH_ALEN);
+
+ if (np->ipv6) {
+ udph->check = 0;
+ udph->check = csum_ipv6_magic(&np->local_ip.in6,
+ &np->remote_ip.in6,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+
+ /* ip6h->version = 6; ip6h->priority = 0; */
+ put_unaligned(0x60, (unsigned char *)ip6h);
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+
+ ip6h->payload_len = htons(sizeof(struct udphdr) + len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 32;
+ ip6h->saddr = np->local_ip.in6;
+ ip6h->daddr = np->remote_ip.in6;
+
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
+ } else {
+ udph->check = 0;
+ udph->check = csum_tcpudp_magic(np->local_ip.ip,
+ np->remote_ip.ip,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ /* iph->version = 4; iph->ihl = 5; */
+ put_unaligned(0x45, (unsigned char *)iph);
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &(iph->tot_len));
+ iph->id = htons(atomic_inc_return(&ip_ident));
+ iph->frag_off = 0;
+ iph->ttl = 64;
+ iph->protocol = IPPROTO_UDP;
+ iph->check = 0;
+ put_unaligned(np->local_ip.ip, &(iph->saddr));
+ put_unaligned(np->remote_ip.ip, &(iph->daddr));
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IP);
+ }
+
+ ether_addr_copy(eth->h_source, np->dev->dev_addr);
+ ether_addr_copy(eth->h_dest, np->remote_mac);
skb->dev = np->dev;
@@ -409,251 +476,69 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
}
EXPORT_SYMBOL(netpoll_send_udp);
-static void arp_reply(struct sk_buff *skb)
+void netpoll_print_options(struct netpoll *np)
{
- struct netpoll_info *npinfo = skb->dev->npinfo;
- struct arphdr *arp;
- unsigned char *arp_ptr;
- int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
- __be32 sip, tip;
- unsigned char *sha;
- struct sk_buff *send_skb;
- struct netpoll *np, *tmp;
- unsigned long flags;
- int hits = 0;
-
- if (list_empty(&npinfo->rx_np))
- return;
-
- /* Before checking the packet, we do some early
- inspection whether this is interesting at all */
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (np->dev == skb->dev)
- hits++;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
- /* No netpoll struct is using this dev */
- if (!hits)
- return;
-
- /* No arp on this interface */
- if (skb->dev->flags & IFF_NOARP)
- return;
-
- if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
- return;
-
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- arp = arp_hdr(skb);
-
- if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
- arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
- arp->ar_pro != htons(ETH_P_IP) ||
- arp->ar_op != htons(ARPOP_REQUEST))
- return;
-
- arp_ptr = (unsigned char *)(arp+1);
- /* save the location of the src hw addr */
- sha = arp_ptr;
- arp_ptr += skb->dev->addr_len;
- memcpy(&sip, arp_ptr, 4);
- arp_ptr += 4;
- /* If we actually cared about dst hw addr,
- it would get copied here */
- arp_ptr += skb->dev->addr_len;
- memcpy(&tip, arp_ptr, 4);
-
- /* Should we ignore arp? */
- if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
- return;
-
- size = arp_hdr_len(skb->dev);
-
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (tip != np->local_ip)
- continue;
-
- send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
- LL_RESERVED_SPACE(np->dev));
- if (!send_skb)
- continue;
-
- skb_reset_network_header(send_skb);
- arp = (struct arphdr *) skb_put(send_skb, size);
- send_skb->dev = skb->dev;
- send_skb->protocol = htons(ETH_P_ARP);
-
- /* Fill the device header for the ARP frame */
- if (dev_hard_header(send_skb, skb->dev, ptype,
- sha, np->dev->dev_addr,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- continue;
- }
-
- /*
- * Fill out the arp protocol part.
- *
- * we only support ethernet device type,
- * which (according to RFC 1390) should
- * always equal 1 (Ethernet).
- */
-
- arp->ar_hrd = htons(np->dev->type);
- arp->ar_pro = htons(ETH_P_IP);
- arp->ar_hln = np->dev->addr_len;
- arp->ar_pln = 4;
- arp->ar_op = htons(type);
-
- arp_ptr = (unsigned char *)(arp + 1);
- memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &tip, 4);
- arp_ptr += 4;
- memcpy(arp_ptr, sha, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &sip, 4);
-
- netpoll_send_skb(np, send_skb);
-
- /* If there are several rx_hooks for the same address,
- we're fine by sending a single reply */
- break;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ np_info(np, "local port %d\n", np->local_port);
+ if (np->ipv6)
+ np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
+ else
+ np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
+ np_info(np, "interface '%s'\n", np->dev_name);
+ np_info(np, "remote port %d\n", np->remote_port);
+ if (np->ipv6)
+ np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
+ else
+ np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
+ np_info(np, "remote ethernet address %pM\n", np->remote_mac);
}
+EXPORT_SYMBOL(netpoll_print_options);
-int __netpoll_rx(struct sk_buff *skb)
+static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
{
- int proto, len, ulen;
- int hits = 0;
- struct iphdr *iph;
- struct udphdr *uh;
- struct netpoll_info *npinfo = skb->dev->npinfo;
- struct netpoll *np, *tmp;
+ const char *end;
- if (list_empty(&npinfo->rx_np))
- goto out;
-
- if (skb->dev->type != ARPHRD_ETHER)
- goto out;
-
- /* check if netpoll clients need ARP */
- if (skb->protocol == htons(ETH_P_ARP) &&
- atomic_read(&trapped)) {
- skb_queue_tail(&npinfo->arp_tx, skb);
- return 1;
+ if (!strchr(str, ':') &&
+ in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
+ if (!*end)
+ return 0;
}
-
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto != ETH_P_IP)
- goto out;
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto out;
- if (skb_shared(skb))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
- if (iph->ihl < 5 || iph->version != 4)
- goto out;
- if (!pskb_may_pull(skb, iph->ihl*4))
- goto out;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto out;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < iph->ihl*4)
- goto out;
-
- /*
- * Our transport medium may have padded the buffer out.
- * Now We trim to the true length of the frame.
- */
- if (pskb_trim_rcsum(skb, len))
- goto out;
-
- if (iph->protocol != IPPROTO_UDP)
- goto out;
-
- len -= iph->ihl*4;
- uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
- ulen = ntohs(uh->len);
-
- if (ulen != len)
- goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
- goto out;
-
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (np->local_ip && np->local_ip != iph->daddr)
- continue;
- if (np->remote_ip && np->remote_ip != iph->saddr)
- continue;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- continue;
-
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
- hits++;
- }
-
- if (!hits)
- goto out;
-
- kfree_skb(skb);
- return 1;
-
-out:
- if (atomic_read(&trapped)) {
- kfree_skb(skb);
- return 1;
+ if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!*end)
+ return 1;
+#else
+ return -1;
+#endif
}
-
- return 0;
-}
-
-void netpoll_print_options(struct netpoll *np)
-{
- printk(KERN_INFO "%s: local port %d\n",
- np->name, np->local_port);
- printk(KERN_INFO "%s: local IP %pI4\n",
- np->name, &np->local_ip);
- printk(KERN_INFO "%s: interface '%s'\n",
- np->name, np->dev_name);
- printk(KERN_INFO "%s: remote port %d\n",
- np->name, np->remote_port);
- printk(KERN_INFO "%s: remote IP %pI4\n",
- np->name, &np->remote_ip);
- printk(KERN_INFO "%s: remote ethernet address %pM\n",
- np->name, np->remote_mac);
+ return -1;
}
-EXPORT_SYMBOL(netpoll_print_options);
int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
+ int ipv6;
+ bool ipversion_set = false;
if (*cur != '@') {
if ((delim = strchr(cur, '@')) == NULL)
goto parse_failed;
*delim = 0;
- np->local_port = simple_strtol(cur, NULL, 10);
+ if (kstrtou16(cur, 10, &np->local_port))
+ goto parse_failed;
cur = delim;
}
cur++;
if (*cur != '/') {
+ ipversion_set = true;
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
*delim = 0;
- np->local_ip = in_aton(cur);
+ ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
cur = delim;
}
cur++;
@@ -674,9 +559,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
goto parse_failed;
*delim = 0;
if (*cur == ' ' || *cur == '\t')
- printk(KERN_INFO "%s: warning: whitespace"
- "is not allowed\n", np->name);
- np->remote_port = simple_strtol(cur, NULL, 10);
+ np_info(np, "warning: whitespace is not allowed\n");
+ if (kstrtou16(cur, 10, &np->remote_port))
+ goto parse_failed;
cur = delim;
}
cur++;
@@ -685,37 +570,19 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
*delim = 0;
- np->remote_ip = in_aton(cur);
+ ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else if (ipversion_set && np->ipv6 != (bool)ipv6)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
cur = delim + 1;
if (*cur != 0) {
/* MAC address */
- if ((delim = strchr(cur, ':')) == NULL)
+ if (!mac_pton(cur, np->remote_mac))
goto parse_failed;
- *delim = 0;
- np->remote_mac[0] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[1] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[2] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[3] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim = 0;
- np->remote_mac[4] = simple_strtol(cur, NULL, 16);
- cur = delim + 1;
- np->remote_mac[5] = simple_strtol(cur, NULL, 16);
}
netpoll_print_options(np);
@@ -723,24 +590,25 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
return 0;
parse_failed:
- printk(KERN_INFO "%s: couldn't parse config at '%s'!\n",
- np->name, cur);
+ np_info(np, "couldn't parse config at '%s'!\n", cur);
return -1;
}
EXPORT_SYMBOL(netpoll_parse_options);
-int __netpoll_setup(struct netpoll *np)
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
- struct net_device *ndev = np->dev;
struct netpoll_info *npinfo;
const struct net_device_ops *ops;
- unsigned long flags;
int err;
+ np->dev = ndev;
+ strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+ INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
+
if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
!ndev->netdev_ops->ndo_poll_controller) {
- printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
- np->name, np->dev_name);
+ np_err(np, "%s doesn't support polling, aborting\n",
+ np->dev_name);
err = -ENOTSUPP;
goto out;
}
@@ -752,11 +620,7 @@ int __netpoll_setup(struct netpoll *np)
goto out;
}
- npinfo->rx_flags = 0;
- INIT_LIST_HEAD(&npinfo->rx_np);
-
- spin_lock_init(&npinfo->rx_lock);
- skb_queue_head_init(&npinfo->arp_tx);
+ sema_init(&npinfo->dev_lock, 1);
skb_queue_head_init(&npinfo->txq);
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
@@ -769,19 +633,12 @@ int __netpoll_setup(struct netpoll *np)
goto free_npinfo;
}
} else {
- npinfo = ndev->npinfo;
+ npinfo = rtnl_dereference(ndev->npinfo);
atomic_inc(&npinfo->refcnt);
}
npinfo->netpoll = np;
- if (np->rx_hook) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_flags |= NETPOLL_RX_ENABLED;
- list_add_tail(&np->rx, &npinfo->rx_np);
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
-
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -800,37 +657,42 @@ int netpoll_setup(struct netpoll *np)
struct in_device *in_dev;
int err;
- if (np->dev_name)
- ndev = dev_get_by_name(&init_net, np->dev_name);
+ rtnl_lock();
+ if (np->dev_name) {
+ struct net *net = current->nsproxy->net_ns;
+ ndev = __dev_get_by_name(net, np->dev_name);
+ }
if (!ndev) {
- printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
- np->name, np->dev_name);
- return -ENODEV;
+ np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ err = -ENODEV;
+ goto unlock;
+ }
+ dev_hold(ndev);
+
+ if (netdev_master_upper_dev_get(ndev)) {
+ np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ err = -EBUSY;
+ goto put;
}
if (!netif_running(ndev)) {
unsigned long atmost, atleast;
- printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
- np->name, np->dev_name);
+ np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
- rtnl_lock();
err = dev_open(ndev);
- rtnl_unlock();
if (err) {
- printk(KERN_ERR "%s: failed to open %s\n",
- np->name, ndev->name);
+ np_err(np, "failed to open %s\n", ndev->name);
goto put;
}
+ rtnl_unlock();
atleast = jiffies + HZ/10;
atmost = jiffies + carrier_timeout * HZ;
while (!netif_carrier_ok(ndev)) {
if (time_after(jiffies, atmost)) {
- printk(KERN_NOTICE
- "%s: timeout waiting for carrier\n",
- np->name);
+ np_notice(np, "timeout waiting for carrier\n");
break;
}
msleep(1);
@@ -842,46 +704,73 @@ int netpoll_setup(struct netpoll *np)
*/
if (time_before(jiffies, atleast)) {
- printk(KERN_NOTICE "%s: carrier detect appears"
- " untrustworthy, waiting 4 seconds\n",
- np->name);
+ np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
msleep(4000);
}
+ rtnl_lock();
}
- if (!np->local_ip) {
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(ndev);
+ if (!np->local_ip.ip) {
+ if (!np->ipv6) {
+ in_dev = __in_dev_get_rtnl(ndev);
+
+ if (!in_dev || !in_dev->ifa_list) {
+ np_err(np, "no IP address for %s, aborting\n",
+ np->dev_name);
+ err = -EDESTADDRREQ;
+ goto put;
+ }
+
+ np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np_info(np, "local IP %pI4\n", &np->local_ip.ip);
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *idev;
- if (!in_dev || !in_dev->ifa_list) {
- rcu_read_unlock();
- printk(KERN_ERR "%s: no IP address for %s, aborting\n",
- np->name, np->dev_name);
err = -EDESTADDRREQ;
+ idev = __in6_dev_get(ndev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ np->local_ip.in6 = ifp->addr;
+ err = 0;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ if (err) {
+ np_err(np, "no IPv6 address for %s, aborting\n",
+ np->dev_name);
+ goto put;
+ } else
+ np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
+#else
+ np_err(np, "IPv6 is not supported %s, aborting\n",
+ np->dev_name);
+ err = -EINVAL;
goto put;
+#endif
}
-
- np->local_ip = in_dev->ifa_list->ifa_local;
- rcu_read_unlock();
- printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip);
}
- np->dev = ndev;
-
/* fill up the skb queue */
refill_skbs();
- rtnl_lock();
- err = __netpoll_setup(np);
- rtnl_unlock();
-
+ err = __netpoll_setup(np, ndev);
if (err)
goto put;
+ rtnl_unlock();
return 0;
put:
dev_put(ndev);
+unlock:
+ rtnl_unlock();
return err;
}
EXPORT_SYMBOL(netpoll_setup);
@@ -893,22 +782,36 @@ static int __init netpoll_init(void)
}
core_initcall(netpoll_init);
+static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
+{
+ struct netpoll_info *npinfo =
+ container_of(rcu_head, struct netpoll_info, rcu);
+
+ skb_queue_purge(&npinfo->txq);
+
+ /* we can't call cancel_delayed_work_sync here, as we are in softirq */
+ cancel_delayed_work(&npinfo->tx_work);
+
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ /* now cancel it again */
+ cancel_delayed_work(&npinfo->tx_work);
+ kfree(npinfo);
+}
+
void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- unsigned long flags;
- npinfo = np->dev->npinfo;
+ /* rtnl_dereference would be preferable here but
+ * rcu_cleanup_netpoll path can put us in here safely without
+ * holding the rtnl, so plain rcu_dereference it is
+ */
+ npinfo = rtnl_dereference(np->dev->npinfo);
if (!npinfo)
return;
- if (!list_empty(&npinfo->rx_np)) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_del(&np->rx);
- if (list_empty(&npinfo->rx_np))
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
+ synchronize_srcu(&netpoll_srcu);
if (atomic_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
@@ -917,47 +820,37 @@ void __netpoll_cleanup(struct netpoll *np)
if (ops->ndo_netpoll_cleanup)
ops->ndo_netpoll_cleanup(np->dev);
- rcu_assign_pointer(np->dev->npinfo, NULL);
-
- /* avoid racing with NAPI reading npinfo */
- synchronize_rcu_bh();
-
- skb_queue_purge(&npinfo->arp_tx);
- skb_queue_purge(&npinfo->txq);
- cancel_rearming_delayed_work(&npinfo->tx_work);
-
- /* clean after last, unfinished work */
- __skb_queue_purge(&npinfo->txq);
- kfree(npinfo);
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
}
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
-void netpoll_cleanup(struct netpoll *np)
+static void netpoll_async_cleanup(struct work_struct *work)
{
- if (!np->dev)
- return;
+ struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
rtnl_lock();
__netpoll_cleanup(np);
rtnl_unlock();
-
- dev_put(np->dev);
- np->dev = NULL;
+ kfree(np);
}
-EXPORT_SYMBOL(netpoll_cleanup);
-int netpoll_trap(void)
+void __netpoll_free_async(struct netpoll *np)
{
- return atomic_read(&trapped);
+ schedule_work(&np->cleanup_work);
}
-EXPORT_SYMBOL(netpoll_trap);
+EXPORT_SYMBOL_GPL(__netpoll_free_async);
-void netpoll_set_trap(int trap)
+void netpoll_cleanup(struct netpoll *np)
{
- if (trap)
- atomic_inc(&trapped);
- else
- atomic_dec(&trapped);
+ rtnl_lock();
+ if (!np->dev)
+ goto out;
+ __netpoll_cleanup(np);
+ dev_put(np->dev);
+ np->dev = NULL;
+out:
+ rtnl_unlock();
}
-EXPORT_SYMBOL(netpoll_set_trap);
+EXPORT_SYMBOL(netpoll_cleanup);
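
Editor's note (illustration only, not part of the patch): netpoll_parse_options() above walks a netconsole-style configuration string of the form local_port@local_ip/dev,remote_port@remote_ip/remote_mac, and with this patch either side may carry an IPv4 or IPv6 address. A minimal caller sketch; the literal addresses, port numbers, and helper name are made up for illustration.

	static int example_netpoll_parse(struct netpoll *np)
	{
		/* the parser writes NUL bytes in place, so the string must be writable */
		char opt[] = "6665@10.0.0.1/eth0,6666@10.0.0.2/00:11:22:33:44:55";

		/* fills np->local_port, np->local_ip, np->remote_mac, np->ipv6, ... */
		return netpoll_parse_options(np, opt);
	}
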
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 00000000000..2f385b9bccc
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,288 @@
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+#include <linux/fdtable.h>
+
+#define PRIOMAP_MIN_SZ 128
+
+/*
+ * Extend @dev->priomap so that it's large enough to accommodate
+ * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
+ * return. Must be called under rtnl lock.
+ */
+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
+{
+ struct netprio_map *old, *new;
+ size_t new_sz, new_len;
+
+ /* is the existing priomap large enough? */
+ old = rtnl_dereference(dev->priomap);
+ if (old && old->priomap_len > target_idx)
+ return 0;
+
+ /*
+ * Determine the new size. Let's keep it power-of-two. We start
+ * from PRIOMAP_MIN_SZ and double it until it's large enough to
+ * accommodate @target_idx.
+ */
+ new_sz = PRIOMAP_MIN_SZ;
+ while (true) {
+ new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
+ sizeof(new->priomap[0]);
+ if (new_len > target_idx)
+ break;
+ new_sz *= 2;
+ /* overflowed? */
+ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
+ return -ENOSPC;
+ }
+
+ /* allocate & copy */
+ new = kzalloc(new_sz, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (old)
+ memcpy(new->priomap, old->priomap,
+ old->priomap_len * sizeof(old->priomap[0]));
+
+ new->priomap_len = new_len;
+
+ /* install the new priomap */
+ rcu_assign_pointer(dev->priomap, new);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/**
+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ *
+ * Should be called under RCU read or rtnl lock.
+ */
+static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
+{
+ struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
+ int id = css->cgroup->id;
+
+ if (map && id < map->priomap_len)
+ return map->priomap[id];
+ return 0;
+}
+
+/**
+ * netprio_set_prio - set netprio on a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ * @prio: prio to set
+ *
+ * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
+ * lock and may fail under memory pressure for non-zero @prio.
+ */
+static int netprio_set_prio(struct cgroup_subsys_state *css,
+ struct net_device *dev, u32 prio)
+{
+ struct netprio_map *map;
+ int id = css->cgroup->id;
+ int ret;
+
+ /* avoid extending priomap for zero writes */
+ map = rtnl_dereference(dev->priomap);
+ if (!prio && (!map || map->priomap_len <= id))
+ return 0;
+
+ ret = extend_netdev_table(dev, id);
+ if (ret)
+ return ret;
+
+ map = rtnl_dereference(dev->priomap);
+ map->priomap[id] = prio;
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kzalloc(sizeof(*css), GFP_KERNEL);
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *parent_css = css->parent;
+ struct net_device *dev;
+ int ret = 0;
+
+ if (!parent_css)
+ return 0;
+
+ rtnl_lock();
+ /*
+ * Inherit prios from the parent. As all prios are set during
+ * onlining, there is no need to clear them on offline.
+ */
+ for_each_netdev(&init_net, dev) {
+ u32 prio = netprio_prio(parent_css, dev);
+
+ ret = netprio_set_prio(css, dev, prio);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+ return ret;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css->cgroup->id;
+}
+
+static int read_priomap(struct seq_file *sf, void *v)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev)
+ seq_printf(sf, "%s %u\n", dev->name,
+ netprio_prio(seq_css(sf), dev));
+ rcu_read_unlock();
+ return 0;
+}
+
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char devname[IFNAMSIZ + 1];
+ struct net_device *dev;
+ u32 prio;
+ int ret;
+
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ return -EINVAL;
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev)
+ return -ENODEV;
+
+ rtnl_lock();
+
+ ret = netprio_set_prio(of_css(of), dev, prio);
+
+ rtnl_unlock();
+ dev_put(dev);
+ return ret ?: nbytes;
+}
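
write_priomap() parses one "<ifname> <prio>" pair per write. A user-space sketch of driving that file follows; the cgroup v1 mount point and the net_prio.ifpriomap file name are assumed from the usual layout rather than defined by this patch, and the helper name is made up:

    #include <stdio.h>

    /* Hypothetical helper: set the priority for one device in one cgroup. */
    static int set_ifprio(const char *cgroup_dir, const char *ifname, unsigned int prio)
    {
            char path[256];
            FILE *f;

            snprintf(path, sizeof(path), "%s/net_prio.ifpriomap", cgroup_dir);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            fprintf(f, "%s %u\n", ifname, prio);    /* the format write_priomap() parses */
            return fclose(f);
    }

    int main(void)
    {
            /* example: sockets in this cgroup get priority 5 on eth0 */
            return set_ifprio("/sys/fs/cgroup/net_prio/mygroup", "eth0", 5);
    }
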
+
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+ if (sock)
+ sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+ return 0;
+}
+
+static void net_prio_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *p;
+ void *v = (void *)(unsigned long)css->cgroup->id;
+
+ cgroup_taskset_for_each(p, tset) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_netprio, v);
+ task_unlock(p);
+ }
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ .name = "ifpriomap",
+ .seq_show = read_priomap,
+ .write = write_priomap,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_prio_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = net_prio_attach,
+ .base_cftypes = ss_files,
+};
+
+static int netprio_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netprio_map *old;
+
+ /*
+ * Note this is called with rtnl_lock held so we have update side
+ * protection on our rcu assignments
+ */
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+ register_netdevice_notifier(&netprio_device_notifier);
+ return 0;
+}
+
+subsys_initcall(init_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 2953b2abc97..fc17a9d309a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -156,13 +156,17 @@
#include <linux/wait.h>
#include <linux/etherdevice.h>
#include <linux/kthread.h>
+#include <linux/prefetch.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#ifdef CONFIG_XFRM
#include <net/xfrm.h>
#endif
+#include <net/netns/generic.h>
#include <asm/byteorder.h>
#include <linux/rcupdate.h>
#include <linux/bitops.h>
@@ -196,6 +200,7 @@
#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */
#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
#define F_NODE (1<<15) /* Node memory alloc*/
+#define F_UDPCSUM (1<<16) /* Include UDP checksum */
/* Thread control flag bits */
#define T_STOP (1<<0) /* Stop run */
@@ -211,7 +216,6 @@
#define PKTGEN_MAGIC 0xbe9be955
#define PG_PROC_DIR "pktgen"
#define PGCTRL "pgctrl"
-static struct proc_dir_entry *pg_proc_dir;
#define MAX_CFLOWS 65536
@@ -247,10 +251,11 @@ struct pktgen_dev {
int removal_mark; /* non-zero => the device is marked for
* removal by worker thread */
- int min_pkt_size; /* = ETH_ZLEN; */
- int max_pkt_size; /* = ETH_ZLEN; */
+ int min_pkt_size;
+ int max_pkt_size;
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
+ struct page *page;
u64 delay; /* nano-seconds */
__u64 count; /* Default No packets to send */
@@ -318,7 +323,7 @@ struct pktgen_dev {
(see RFC 3260, sec. 4) */
/* MPLS */
- unsigned nr_labels; /* Depth of stack, 0 = no MPLS */
+ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
__be32 labels[MAX_MPLS_LABELS];
/* VLAN/SVLAN (802.1Q/Q-in-Q) */
@@ -371,10 +376,10 @@ struct pktgen_dev {
*/
char odevname[32];
struct flow_state *flows;
- unsigned cflows; /* Concurrent flows (config) */
- unsigned lflow; /* Flow length (config) */
- unsigned nflows; /* accumulated flows (stats) */
- unsigned curfl; /* current sequenced flow (state)*/
+ unsigned int cflows; /* Concurrent flows (config) */
+ unsigned int lflow; /* Flow length (config) */
+ unsigned int nflows; /* accumulated flows (stats) */
+ unsigned int curfl; /* current sequenced flow (state)*/
u16 queue_map_min;
u16 queue_map_max;
@@ -384,6 +389,9 @@ struct pktgen_dev {
#ifdef CONFIG_XFRM
__u8 ipsmode; /* IPSEC mode (config) */
__u8 ipsproto; /* IPSEC type (config) */
+ __u32 spi;
+ struct dst_entry dst;
+ struct dst_ops dstops;
#endif
char result[512];
};
@@ -395,7 +403,15 @@ struct pktgen_hdr {
__be32 tv_usec;
};
-static bool pktgen_exiting __read_mostly;
+
+static int pg_net_id __read_mostly;
+
+struct pktgen_net {
+ struct net *net;
+ struct proc_dir_entry *proc_dir;
+ struct list_head pktgen_threads;
+ bool pktgen_exiting;
+};
struct pktgen_thread {
spinlock_t if_lock; /* for list of devices */
@@ -412,25 +428,12 @@ struct pktgen_thread {
wait_queue_head_t queue;
struct completion start_done;
+ struct pktgen_net *net;
};
#define REMOVE 1
#define FIND 0
-static inline ktime_t ktime_now(void)
-{
- struct timespec ts;
- ktime_get_ts(&ts);
-
- return timespec_to_ktime(ts);
-}
-
-/* This works even if 32 bit because of careful byte order choice */
-static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2)
-{
- return cmp1.tv64 < cmp2.tv64;
-}
-
static const char version[] =
"Packet Generator for packet performance testing. "
"Version: " VERSION "\n";
@@ -440,16 +443,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
const char *ifname, bool exact);
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
-static void pktgen_run_all_threads(void);
-static void pktgen_reset_all_threads(void);
-static void pktgen_stop_all_threads_ifs(void);
+static void pktgen_run_all_threads(struct pktgen_net *pn);
+static void pktgen_reset_all_threads(struct pktgen_net *pn);
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
-static unsigned int scan_ip6(const char *s, char ip[16]);
-static unsigned int fmt_ip6(char *s, const char ip[16]);
-
/* Module parameters, defaults. */
static int pg_count_d __read_mostly = 1000;
static int pg_delay_d __read_mostly;
@@ -457,7 +457,6 @@ static int pg_clone_skb_d __read_mostly;
static int debug __read_mostly;
static DEFINE_MUTEX(pktgen_thread_lock);
-static LIST_HEAD(pktgen_threads);
static struct notifier_block pktgen_notifier_block = {
.notifier_call = pktgen_device_event,
@@ -477,44 +476,41 @@ static int pgctrl_show(struct seq_file *seq, void *v)
static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- int err = 0;
char data[128];
+ struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
- if (!capable(CAP_NET_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (count == 0)
+ return -EINVAL;
if (count > sizeof(data))
count = sizeof(data);
- if (copy_from_user(data, buf, count)) {
- err = -EFAULT;
- goto out;
- }
- data[count - 1] = 0; /* Make string */
+ if (copy_from_user(data, buf, count))
+ return -EFAULT;
+
+ data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
if (!strcmp(data, "stop"))
- pktgen_stop_all_threads_ifs();
+ pktgen_stop_all_threads_ifs(pn);
else if (!strcmp(data, "start"))
- pktgen_run_all_threads();
+ pktgen_run_all_threads(pn);
else if (!strcmp(data, "reset"))
- pktgen_reset_all_threads();
+ pktgen_reset_all_threads(pn);
else
pr_warning("Unknown command: %s\n", data);
- err = count;
-
-out:
- return err;
+ return count;
}
static int pgctrl_open(struct inode *inode, struct file *file)
{
- return single_open(file, pgctrl_show, PDE(inode)->data);
+ return single_open(file, pgctrl_show, PDE_DATA(inode));
}
static const struct file_operations pktgen_fops = {
@@ -555,21 +551,13 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
pkt_dev->skb_priority);
if (pkt_dev->flags & F_IPV6) {
- char b1[128], b2[128], b3[128];
- fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr);
- fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
- fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
- seq_printf(seq,
- " saddr: %s min_saddr: %s max_saddr: %s\n", b1,
- b2, b3);
-
- fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr);
- fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
- fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
seq_printf(seq,
- " daddr: %s min_daddr: %s max_daddr: %s\n", b1,
- b2, b3);
-
+ " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n"
+ " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n",
+ &pkt_dev->in6_saddr,
+ &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
+ &pkt_dev->in6_daddr,
+ &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
} else {
seq_printf(seq,
" dst_min: %s dst_max: %s\n",
@@ -585,7 +573,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
is_zero_ether_addr(pkt_dev->src_mac) ?
pkt_dev->odev->dev_addr : pkt_dev->src_mac);
- seq_printf(seq, "dst_mac: ");
+ seq_puts(seq, "dst_mac: ");
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
@@ -599,8 +587,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
if (pkt_dev->nr_labels) {
- unsigned i;
- seq_printf(seq, " mpls: ");
+ unsigned int i;
+ seq_puts(seq, " mpls: ");
for (i = 0; i < pkt_dev->nr_labels; i++)
seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
i == pkt_dev->nr_labels-1 ? "\n" : ", ");
@@ -625,66 +613,72 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
- seq_printf(seq, " Flags: ");
+ seq_puts(seq, " Flags: ");
if (pkt_dev->flags & F_IPV6)
- seq_printf(seq, "IPV6 ");
+ seq_puts(seq, "IPV6 ");
if (pkt_dev->flags & F_IPSRC_RND)
- seq_printf(seq, "IPSRC_RND ");
+ seq_puts(seq, "IPSRC_RND ");
if (pkt_dev->flags & F_IPDST_RND)
- seq_printf(seq, "IPDST_RND ");
+ seq_puts(seq, "IPDST_RND ");
if (pkt_dev->flags & F_TXSIZE_RND)
- seq_printf(seq, "TXSIZE_RND ");
+ seq_puts(seq, "TXSIZE_RND ");
if (pkt_dev->flags & F_UDPSRC_RND)
- seq_printf(seq, "UDPSRC_RND ");
+ seq_puts(seq, "UDPSRC_RND ");
if (pkt_dev->flags & F_UDPDST_RND)
- seq_printf(seq, "UDPDST_RND ");
+ seq_puts(seq, "UDPDST_RND ");
+
+ if (pkt_dev->flags & F_UDPCSUM)
+ seq_puts(seq, "UDPCSUM ");
if (pkt_dev->flags & F_MPLS_RND)
- seq_printf(seq, "MPLS_RND ");
+ seq_puts(seq, "MPLS_RND ");
if (pkt_dev->flags & F_QUEUE_MAP_RND)
- seq_printf(seq, "QUEUE_MAP_RND ");
+ seq_puts(seq, "QUEUE_MAP_RND ");
if (pkt_dev->flags & F_QUEUE_MAP_CPU)
- seq_printf(seq, "QUEUE_MAP_CPU ");
+ seq_puts(seq, "QUEUE_MAP_CPU ");
if (pkt_dev->cflows) {
if (pkt_dev->flags & F_FLOW_SEQ)
- seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/
+ seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/
else
- seq_printf(seq, "FLOW_RND ");
+ seq_puts(seq, "FLOW_RND ");
}
#ifdef CONFIG_XFRM
- if (pkt_dev->flags & F_IPSEC_ON)
- seq_printf(seq, "IPSEC ");
+ if (pkt_dev->flags & F_IPSEC_ON) {
+ seq_puts(seq, "IPSEC ");
+ if (pkt_dev->spi)
+ seq_printf(seq, "spi:%u", pkt_dev->spi);
+ }
#endif
if (pkt_dev->flags & F_MACSRC_RND)
- seq_printf(seq, "MACSRC_RND ");
+ seq_puts(seq, "MACSRC_RND ");
if (pkt_dev->flags & F_MACDST_RND)
- seq_printf(seq, "MACDST_RND ");
+ seq_puts(seq, "MACDST_RND ");
if (pkt_dev->flags & F_VID_RND)
- seq_printf(seq, "VID_RND ");
+ seq_puts(seq, "VID_RND ");
if (pkt_dev->flags & F_SVID_RND)
- seq_printf(seq, "SVID_RND ");
+ seq_puts(seq, "SVID_RND ");
if (pkt_dev->flags & F_NODE)
- seq_printf(seq, "NODE_ALLOC ");
+ seq_puts(seq, "NODE_ALLOC ");
seq_puts(seq, "\n");
/* not really stopped, more like last-running-at */
- stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at;
+ stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;
idle = pkt_dev->idle_acc;
do_div(idle, NSEC_PER_USEC);
@@ -705,13 +699,12 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
pkt_dev->cur_src_mac_offset);
if (pkt_dev->flags & F_IPV6) {
- char b1[128], b2[128];
- fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr);
- fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr);
- seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
+ seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n",
+ &pkt_dev->cur_in6_saddr,
+ &pkt_dev->cur_in6_daddr);
} else
- seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
- pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+ seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n",
+ &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);
seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
@@ -723,7 +716,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->result[0])
seq_printf(seq, "Result: %s\n", pkt_dev->result);
else
- seq_printf(seq, "Result: Idle\n");
+ seq_puts(seq, "Result: Idle\n");
return 0;
}
@@ -775,8 +768,8 @@ done:
return i;
}
-static unsigned long num_arg(const char __user * user_buffer,
- unsigned long maxlen, unsigned long *num)
+static long num_arg(const char __user *user_buffer, unsigned long maxlen,
+ unsigned long *num)
{
int i;
*num = 0;
@@ -820,7 +813,7 @@ done_str:
static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
{
- unsigned n = 0;
+ unsigned int n = 0;
char c;
ssize_t i = 0;
int len;
@@ -899,8 +892,8 @@ static ssize_t pktgen_if_write(struct file *file,
if (copy_from_user(tb, user_buffer, copy))
return -EFAULT;
tb[copy] = 0;
- printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name,
- (unsigned long)count, tb);
+ pr_debug("%s,%lu buffer -:%s:-\n",
+ name, (unsigned long)count, tb);
}
if (!strcmp(name, "min_pkt_size")) {
@@ -1078,7 +1071,9 @@ static ssize_t pktgen_if_write(struct file *file,
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
return len;
-
+ if ((value > 0) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ return -ENOTSUPP;
i += len;
pkt_dev->clone_skb = value;
@@ -1134,6 +1129,10 @@ static ssize_t pktgen_if_write(struct file *file,
if (node_possible(value)) {
pkt_dev->node = value;
sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
}
else
sprintf(pg_result, "ERROR: node not possible");
@@ -1237,12 +1236,24 @@ static ssize_t pktgen_if_write(struct file *file,
else if (strcmp(f, "!NODE_ALLOC") == 0)
pkt_dev->flags &= ~F_NODE;
+ else if (strcmp(f, "UDPCSUM") == 0)
+ pkt_dev->flags |= F_UDPCSUM;
+
+ else if (strcmp(f, "!UDPCSUM") == 0)
+ pkt_dev->flags &= ~F_UDPCSUM;
+
else {
sprintf(pg_result,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
f,
"IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n");
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
+ "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
+ "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+#ifdef CONFIG_XFRM
+ "IPSEC, "
+#endif
+ "NODE_ALLOC\n");
return count;
}
sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
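
The new UDPCSUM flag is toggled through the same per-device proc file as the other flags. A small user-space sketch, assuming the standard /proc/net/pktgen/<ifname> path; the helper below is hypothetical:

    #include <stdio.h>

    /* Hypothetical helper: write one pktgen command, e.g. "flag UDPCSUM" or
     * "flag !UDPCSUM", to a device's pktgen proc file. */
    static int pktgen_cmd(const char *ifname, const char *cmd)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path), "/proc/net/pktgen/%s", ifname);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            fprintf(f, "%s\n", cmd);
            return fclose(f);
    }

    int main(void)
    {
            return pktgen_cmd("eth0", "flag UDPCSUM");
    }
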
@@ -1263,8 +1274,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
if (debug)
- printk(KERN_DEBUG "pktgen: dst_min set to: %s\n",
- pkt_dev->dst_min);
+ pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
i += len;
sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
return count;
@@ -1286,8 +1296,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
if (debug)
- printk(KERN_DEBUG "pktgen: dst_max set to: %s\n",
- pkt_dev->dst_max);
+ pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
i += len;
sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
return count;
@@ -1303,13 +1312,13 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
if (debug)
- printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf);
+ pr_debug("dst6 set to: %s\n", buf);
i += len;
sprintf(pg_result, "OK: dst6=%s", buf);
@@ -1326,13 +1335,12 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
- &pkt_dev->min_in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
if (debug)
- printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf);
+ pr_debug("dst6_min set to: %s\n", buf);
i += len;
sprintf(pg_result, "OK: dst6_min=%s", buf);
@@ -1349,11 +1357,11 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
if (debug)
- printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf);
+ pr_debug("dst6_max set to: %s\n", buf);
i += len;
sprintf(pg_result, "OK: dst6_max=%s", buf);
@@ -1370,13 +1378,13 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
- scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
- fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
if (debug)
- printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf);
+ pr_debug("src6 set to: %s\n", buf);
i += len;
sprintf(pg_result, "OK: src6=%s", buf);
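
The open-coded scan_ip6()/fmt_ip6() pair is replaced by in6_pton() plus the %pI6c printk format. A user-space sketch of the equivalent round trip with the standard inet_pton()/inet_ntop() calls (illustrative only; the kernel helpers have different signatures):

    #include <arpa/inet.h>
    #include <stdio.h>

    int main(void)
    {
            struct in6_addr addr;
            char out[INET6_ADDRSTRLEN];

            /* parse a textual IPv6 address into its 16 raw bytes */
            if (inet_pton(AF_INET6, "fec0::1", &addr) != 1)
                    return 1;

            /* print it back in the compressed form that %pI6c produces */
            if (!inet_ntop(AF_INET6, &addr, out, sizeof(out)))
                    return 1;

            printf("dst6 set to: %s\n", out);
            return 0;
    }
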
@@ -1397,8 +1405,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
if (debug)
- printk(KERN_DEBUG "pktgen: src_min set to: %s\n",
- pkt_dev->src_min);
+ pr_debug("src_min set to: %s\n", pkt_dev->src_min);
i += len;
sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
return count;
@@ -1418,18 +1425,12 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
if (debug)
- printk(KERN_DEBUG "pktgen: src_max set to: %s\n",
- pkt_dev->src_max);
+ pr_debug("src_max set to: %s\n", pkt_dev->src_max);
i += len;
sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
return count;
}
if (!strcmp(name, "dst_mac")) {
- char *v = valstr;
- unsigned char old_dmac[ETH_ALEN];
- unsigned char *m = pkt_dev->dst_mac;
- memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN);
-
len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
if (len < 0)
return len;
@@ -1437,35 +1438,16 @@ static ssize_t pktgen_if_write(struct file *file,
memset(valstr, 0, sizeof(valstr));
if (copy_from_user(valstr, &user_buffer[i], len))
return -EFAULT;
- i += len;
-
- for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) {
- int value;
-
- value = hex_to_bin(*v);
- if (value >= 0)
- *m = *m * 16 + value;
-
- if (*v == ':') {
- m++;
- *m = 0;
- }
- }
+ if (!mac_pton(valstr, pkt_dev->dst_mac))
+ return -EINVAL;
/* Set up Dest MAC */
- if (compare_ether_addr(old_dmac, pkt_dev->dst_mac))
- memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
+ ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac);
- sprintf(pg_result, "OK: dstmac");
+ sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
return count;
}
if (!strcmp(name, "src_mac")) {
- char *v = valstr;
- unsigned char old_smac[ETH_ALEN];
- unsigned char *m = pkt_dev->src_mac;
-
- memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN);
-
len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
if (len < 0)
return len;
@@ -1473,26 +1455,13 @@ static ssize_t pktgen_if_write(struct file *file,
memset(valstr, 0, sizeof(valstr));
if (copy_from_user(valstr, &user_buffer[i], len))
return -EFAULT;
- i += len;
-
- for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) {
- int value;
-
- value = hex_to_bin(*v);
- if (value >= 0)
- *m = *m * 16 + value;
-
- if (*v == ':') {
- m++;
- *m = 0;
- }
- }
+ if (!mac_pton(valstr, pkt_dev->src_mac))
+ return -EINVAL;
/* Set up Src MAC */
- if (compare_ether_addr(old_smac, pkt_dev->src_mac))
- memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN);
+ ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac);
- sprintf(pg_result, "OK: srcmac");
+ sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
return count;
}
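
mac_pton() replaces the hand-rolled hex scanner for dst_mac/src_mac. A minimal user-space stand-in for the core parse, assuming the usual colon-separated form (the real helper validates the string more strictly):

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for mac_pton(): parse six colon-separated hex bytes. */
    static bool parse_mac(const char *s, unsigned char mac[6])
    {
            return sscanf(s, "%2hhx:%2hhx:%2hhx:%2hhx:%2hhx:%2hhx",
                          &mac[0], &mac[1], &mac[2],
                          &mac[3], &mac[4], &mac[5]) == 6;
    }

    int main(void)
    {
            unsigned char mac[6];

            if (!parse_mac("00:11:22:33:44:55", mac))
                    return 1;
            printf("dstmac %02x:%02x:%02x:%02x:%02x:%02x\n",
                   mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
            return 0;
    }
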
@@ -1515,7 +1484,18 @@ static ssize_t pktgen_if_write(struct file *file,
sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
return count;
}
+#ifdef CONFIG_XFRM
+ if (!strcmp(name, "spi")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+ i += len;
+ pkt_dev->spi = value;
+ sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
+ return count;
+ }
+#endif
if (!strcmp(name, "flowlen")) {
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
@@ -1550,7 +1530,7 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "mpls")) {
- unsigned n, cnt;
+ unsigned int n, cnt;
len = get_labels(&user_buffer[i], pkt_dev);
if (len < 0)
@@ -1567,7 +1547,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->svlan_id = 0xffff;
if (debug)
- printk(KERN_DEBUG "pktgen: VLAN/SVLAN auto turned off\n");
+ pr_debug("VLAN/SVLAN auto turned off\n");
}
return count;
}
@@ -1582,10 +1562,10 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->vlan_id = value; /* turn on VLAN */
if (debug)
- printk(KERN_DEBUG "pktgen: VLAN turned on\n");
+ pr_debug("VLAN turned on\n");
if (debug && pkt_dev->nr_labels)
- printk(KERN_DEBUG "pktgen: MPLS auto turned off\n");
+ pr_debug("MPLS auto turned off\n");
pkt_dev->nr_labels = 0; /* turn off MPLS */
sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
@@ -1594,7 +1574,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->svlan_id = 0xffff;
if (debug)
- printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n");
+ pr_debug("VLAN/SVLAN turned off\n");
}
return count;
}
@@ -1639,10 +1619,10 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->svlan_id = value; /* turn on SVLAN */
if (debug)
- printk(KERN_DEBUG "pktgen: SVLAN turned on\n");
+ pr_debug("SVLAN turned on\n");
if (debug && pkt_dev->nr_labels)
- printk(KERN_DEBUG "pktgen: MPLS auto turned off\n");
+ pr_debug("MPLS auto turned off\n");
pkt_dev->nr_labels = 0; /* turn off MPLS */
sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
@@ -1651,7 +1631,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->svlan_id = 0xffff;
if (debug)
- printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n");
+ pr_debug("VLAN/SVLAN turned off\n");
}
return count;
}
@@ -1736,7 +1716,7 @@ static ssize_t pktgen_if_write(struct file *file,
static int pktgen_if_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_if_show, PDE(inode)->data);
+ return single_open(file, pktgen_if_show, PDE_DATA(inode));
}
static const struct file_operations pktgen_if_fops = {
@@ -1755,14 +1735,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
BUG_ON(!t);
- seq_printf(seq, "Running: ");
+ seq_puts(seq, "Running: ");
if_lock(t);
list_for_each_entry(pkt_dev, &t->if_list, list)
if (pkt_dev->running)
seq_printf(seq, "%s ", pkt_dev->odevname);
- seq_printf(seq, "\nStopped: ");
+ seq_puts(seq, "\nStopped: ");
list_for_each_entry(pkt_dev, &t->if_list, list)
if (!pkt_dev->running)
@@ -1771,7 +1751,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
if (t->result[0])
seq_printf(seq, "\nResult: %s\n", t->result);
else
- seq_printf(seq, "\nResult: NA\n");
+ seq_puts(seq, "\nResult: NA\n");
if_unlock(t);
@@ -1819,8 +1799,7 @@ static ssize_t pktgen_thread_write(struct file *file,
i += len;
if (debug)
- printk(KERN_DEBUG "pktgen: t=%s, count=%lu\n",
- name, (unsigned long)count);
+ pr_debug("t=%s, count=%lu\n", name, (unsigned long)count);
if (!t) {
pr_err("ERROR: No thread\n");
@@ -1842,10 +1821,13 @@ static ssize_t pktgen_thread_write(struct file *file,
return -EFAULT;
i += len;
mutex_lock(&pktgen_thread_lock);
- pktgen_add_device(t, f);
+ ret = pktgen_add_device(t, f);
mutex_unlock(&pktgen_thread_lock);
- ret = count;
- sprintf(pg_result, "OK: add_device=%s", f);
+ if (!ret) {
+ ret = count;
+ sprintf(pg_result, "OK: add_device=%s", f);
+ } else
+ sprintf(pg_result, "ERROR: can not add device %s", f);
goto out;
}
@@ -1872,7 +1854,7 @@ out:
static int pktgen_thread_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_thread_show, PDE(inode)->data);
+ return single_open(file, pktgen_thread_show, PDE_DATA(inode));
}
static const struct file_operations pktgen_thread_fops = {
@@ -1885,13 +1867,14 @@ static const struct file_operations pktgen_thread_fops = {
};
/* Think find or remove for NN */
-static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)
+static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
+ const char *ifname, int remove)
{
struct pktgen_thread *t;
struct pktgen_dev *pkt_dev = NULL;
bool exact = (remove == FIND);
- list_for_each_entry(t, &pktgen_threads, th_list) {
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
pkt_dev = pktgen_find_dev(t, ifname, exact);
if (pkt_dev) {
if (remove) {
@@ -1909,7 +1892,7 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)
/*
* mark a device for removal
*/
-static void pktgen_mark_device(const char *ifname)
+static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
{
struct pktgen_dev *pkt_dev = NULL;
const int max_tries = 10, msec_per_try = 125;
@@ -1920,7 +1903,7 @@ static void pktgen_mark_device(const char *ifname)
while (1) {
- pkt_dev = __pktgen_NN_threads(ifname, REMOVE);
+ pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);
if (pkt_dev == NULL)
break; /* success */
@@ -1941,21 +1924,21 @@ static void pktgen_mark_device(const char *ifname)
mutex_unlock(&pktgen_thread_lock);
}
-static void pktgen_change_name(struct net_device *dev)
+static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)
{
struct pktgen_thread *t;
- list_for_each_entry(t, &pktgen_threads, th_list) {
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
struct pktgen_dev *pkt_dev;
list_for_each_entry(pkt_dev, &t->if_list, list) {
if (pkt_dev->odev != dev)
continue;
- remove_proc_entry(pkt_dev->entry->name, pg_proc_dir);
+ proc_remove(pkt_dev->entry);
pkt_dev->entry = proc_create_data(dev->name, 0600,
- pg_proc_dir,
+ pn->proc_dir,
&pktgen_if_fops,
pkt_dev);
if (!pkt_dev->entry)
@@ -1969,9 +1952,10 @@ static void pktgen_change_name(struct net_device *dev)
static int pktgen_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id);
- if (!net_eq(dev_net(dev), &init_net))
+ if (pn->pktgen_exiting)
return NOTIFY_DONE;
/* It is OK that we do not hold the group lock right now,
@@ -1980,18 +1964,19 @@ static int pktgen_device_event(struct notifier_block *unused,
switch (event) {
case NETDEV_CHANGENAME:
- pktgen_change_name(dev);
+ pktgen_change_name(pn, dev);
break;
case NETDEV_UNREGISTER:
- pktgen_mark_device(dev->name);
+ pktgen_mark_device(pn, dev->name);
break;
}
return NOTIFY_DONE;
}
-static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,
+static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev,
const char *ifname)
{
char b[IFNAMSIZ+5];
@@ -2005,13 +1990,14 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,
}
b[i] = 0;
- return dev_get_by_name(&init_net, b);
+ return dev_get_by_name(pn->net, b);
}
/* Associate pktgen_dev with a device. */
-static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)
+static int pktgen_setup_dev(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev, const char *ifname)
{
struct net_device *odev;
int err;
@@ -2022,7 +2008,7 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)
pkt_dev->odev = NULL;
}
- odev = pktgen_dev_get_by_name(pkt_dev, ifname);
+ odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);
if (!odev) {
pr_err("no such netdevice: \"%s\"\n", ifname);
return -ENODEV;
@@ -2064,36 +2050,34 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
pkt_dev->odevname);
- pkt_dev->queue_map_min = ntxq - 1;
+ pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
}
if (pkt_dev->queue_map_max >= ntxq) {
pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
pkt_dev->odevname);
- pkt_dev->queue_map_max = ntxq - 1;
+ pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
}
/* Default to the interface's mac if not explicitly set. */
if (is_zero_ether_addr(pkt_dev->src_mac))
- memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN);
+ ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);
/* Set up Dest MAC */
- memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
-
- /* Set up pkt size */
- pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);
if (pkt_dev->flags & F_IPV6) {
- /*
- * Skip this automatic address setting until locks or functions
- * gets exported
- */
-
-#ifdef NOTNOW
int i, set = 0, err = 1;
struct inet6_dev *idev;
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
for (i = 0; i < IN6_ADDR_HSIZE; i++)
if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
set = 1;
@@ -2114,13 +2098,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
- for (ifp = idev->addr_list; ifp;
- ifp = ifp->if_next) {
- if (ifp->scope == IFA_LINK &&
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if ((ifp->scope & IFA_LINK) &&
!(ifp->flags & IFA_F_TENTATIVE)) {
- ipv6_addr_copy(&pkt_dev->
- cur_in6_saddr,
- &ifp->addr);
+ pkt_dev->cur_in6_saddr = ifp->addr;
err = 0;
break;
}
@@ -2131,8 +2112,14 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
if (err)
pr_err("ERROR: IPv6 link address not available\n");
}
-#endif
} else {
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
pkt_dev->saddr_min = 0;
pkt_dev->saddr_max = 0;
if (strlen(pkt_dev->src_min) == 0) {
@@ -2158,6 +2145,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
}
/* Initialize current values. */
+ pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size)
+ pkt_dev->max_pkt_size = pkt_dev->min_pkt_size;
+
pkt_dev->cur_dst_mac_offset = 0;
pkt_dev->cur_src_mac_offset = 0;
pkt_dev->cur_saddr = pkt_dev->saddr_min;
@@ -2183,10 +2174,13 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
return;
}
- start_time = ktime_now();
- if (remaining < 100000)
- ndelay(remaining); /* really small just spin */
- else {
+ start_time = ktime_get();
+ if (remaining < 100000) {
+ /* for small delays (<100us), just loop until limit is reached */
+ do {
+ end_time = ktime_get();
+ } while (ktime_compare(end_time, spin_until) < 0);
+ } else {
/* see do_nanosleep */
hrtimer_init_sleeper(&t, current);
do {
@@ -2201,8 +2195,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
hrtimer_cancel(&t.timer);
} while (t.task && pkt_dev->running && !signal_pending(current));
__set_current_state(TASK_RUNNING);
+ end_time = ktime_get();
}
- end_time = ktime_now();
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
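
The reworked spin() busy-polls the clock for waits under 100 us and uses an hrtimer sleep otherwise. A user-space sketch of the same threshold decision, with CLOCK_MONOTONIC and nanosleep() standing in for ktime_get() and the hrtimer path:

    #include <stdint.h>
    #include <time.h>

    static uint64_t now_ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    /* Wait until 'until_ns': busy-poll short delays, sleep the long ones. */
    static void wait_until(uint64_t until_ns)
    {
            uint64_t now = now_ns();
            uint64_t remaining;

            if (now >= until_ns)
                    return;
            remaining = until_ns - now;
            if (remaining < 100000) {               /* < 100 us: spin on the clock */
                    while (now_ns() < until_ns)
                            ;
            } else {                                /* otherwise actually sleep */
                    struct timespec ts = {
                            .tv_sec  = remaining / 1000000000ull,
                            .tv_nsec = remaining % 1000000000ull,
                    };
                    nanosleep(&ts, NULL);
            }
    }

    int main(void)
    {
            wait_until(now_ns() + 50000);           /* e.g. a ~50 us wait */
            return 0;
    }
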
@@ -2235,7 +2229,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
pkt_dev->curfl = 0; /*reset */
}
} else {
- flow = random32() % pkt_dev->cflows;
+ flow = prandom_u32() % pkt_dev->cflows;
pkt_dev->curfl = flow;
if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
@@ -2256,14 +2250,23 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
{
struct xfrm_state *x = pkt_dev->flows[flow].x;
+ struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
if (!x) {
- /*slow path: we dont already have xfrm_state*/
- x = xfrm_stateonly_find(&init_net, DUMMY_MARK,
- (xfrm_address_t *)&pkt_dev->cur_daddr,
- (xfrm_address_t *)&pkt_dev->cur_saddr,
- AF_INET,
- pkt_dev->ipsmode,
- pkt_dev->ipsproto, 0);
+
+ if (pkt_dev->spi) {
+ /* We need to find the right SA as quickly as possible.
+ * Search with minimal criteria to achieve this.
+ */
+ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
+ } else {
+ /* slow path: we don't already have an xfrm_state */
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+ (xfrm_address_t *)&pkt_dev->cur_daddr,
+ (xfrm_address_t *)&pkt_dev->cur_saddr,
+ AF_INET,
+ pkt_dev->ipsmode,
+ pkt_dev->ipsproto, 0);
+ }
if (x) {
pkt_dev->flows[flow].x = x;
set_pkt_overhead(pkt_dev);
@@ -2282,7 +2285,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
- t = random32() %
+ t = prandom_u32() %
(pkt_dev->queue_map_max -
pkt_dev->queue_map_min + 1)
+ pkt_dev->queue_map_min;
@@ -2314,7 +2317,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACSRC_RND)
- mc = random32() % pkt_dev->src_mac_count;
+ mc = prandom_u32() % pkt_dev->src_mac_count;
else {
mc = pkt_dev->cur_src_mac_offset++;
if (pkt_dev->cur_src_mac_offset >=
@@ -2340,7 +2343,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACDST_RND)
- mc = random32() % pkt_dev->dst_mac_count;
+ mc = prandom_u32() % pkt_dev->dst_mac_count;
else {
mc = pkt_dev->cur_dst_mac_offset++;
@@ -2363,25 +2366,25 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
}
if (pkt_dev->flags & F_MPLS_RND) {
- unsigned i;
+ unsigned int i;
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
- ((__force __be32)random32() &
+ ((__force __be32)prandom_u32() &
htonl(0x000fffff));
}
if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
- pkt_dev->vlan_id = random32() & (4096-1);
+ pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
}
if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
- pkt_dev->svlan_id = random32() & (4096 - 1);
+ pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
}
if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src = random32() %
+ pkt_dev->cur_udp_src = prandom_u32() %
(pkt_dev->udp_src_max - pkt_dev->udp_src_min)
+ pkt_dev->udp_src_min;
@@ -2394,7 +2397,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst = random32() %
+ pkt_dev->cur_udp_dst = prandom_u32() %
(pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
+ pkt_dev->udp_dst_min;
} else {
@@ -2411,7 +2414,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
if (pkt_dev->flags & F_IPSRC_RND)
- t = random32() % (imx - imn) + imn;
+ t = prandom_u32() % (imx - imn) + imn;
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
@@ -2432,17 +2435,15 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__be32 s;
if (pkt_dev->flags & F_IPDST_RND) {
- t = random32() % (imx - imn) + imn;
- s = htonl(t);
-
- while (ipv4_is_loopback(s) ||
- ipv4_is_multicast(s) ||
- ipv4_is_lbcast(s) ||
- ipv4_is_zeronet(s) ||
- ipv4_is_local_multicast(s)) {
- t = random32() % (imx - imn) + imn;
+ do {
+ t = prandom_u32() %
+ (imx - imn) + imn;
s = htonl(t);
- }
+ } while (ipv4_is_loopback(s) ||
+ ipv4_is_multicast(s) ||
+ ipv4_is_lbcast(s) ||
+ ipv4_is_zeronet(s) ||
+ ipv4_is_local_multicast(s));
pkt_dev->cur_daddr = s;
} else {
t = ntohl(pkt_dev->cur_daddr);
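
The random-destination path is now a do/while rejection loop: draw an address in [imn, imx) and redraw while it lands in a reserved range. A user-space sketch of that filter with the reserved ranges written out (rand() stands in for prandom_u32()):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int is_reserved(uint32_t a)              /* address in host byte order */
    {
            return (a >> 24) == 127 ||              /* loopback 127/8 */
                   (a >> 28) == 0xe ||              /* multicast 224/4, incl. 224.0.0/24 */
                   a == 0xffffffffu ||              /* limited broadcast */
                   (a >> 24) == 0;                  /* zeronet 0/8 */
    }

    int main(void)
    {
            uint32_t imn = 0x0a000001, imx = 0x0afffffe;    /* 10.0.0.1 .. 10.255.255.254 */
            uint32_t t;

            do {
                    t = (uint32_t)rand() % (imx - imn) + imn;
            } while (is_reserved(t));

            printf("picked %u.%u.%u.%u\n",
                   t >> 24, (t >> 16) & 0xff, (t >> 8) & 0xff, t & 0xff);
            return 0;
    }
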
@@ -2466,18 +2467,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
}
} else { /* IPV6 * */
- if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ;
- else {
+ if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {
int i;
/* Only random destinations yet */
for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- (((__force __be32)random32() |
+ (((__force __be32)prandom_u32() |
pkt_dev->min_in6_daddr.s6_addr32[i]) &
pkt_dev->max_in6_daddr.s6_addr32[i]);
}
@@ -2487,7 +2484,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
if (pkt_dev->flags & F_TXSIZE_RND) {
- t = random32() %
+ t = prandom_u32() %
(pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
+ pkt_dev->min_pkt_size;
} else {
@@ -2505,33 +2502,47 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
#ifdef CONFIG_XFRM
+static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
+
+ [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */
+};
+
static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
{
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int err = 0;
- struct iphdr *iph;
+ struct net *net = dev_net(pkt_dev->odev);
if (!x)
return 0;
/* XXX: we dont support tunnel mode for now until
* we resolve the dst issue */
- if (x->props.mode != XFRM_MODE_TRANSPORT)
+ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
return 0;
- spin_lock(&x->lock);
- iph = ip_hdr(skb);
+ /* But when the user specifies a valid SPI, the transformation
+ * supports both transport/tunnel mode and ESP/AH types.
+ */
+ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
+ skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+ rcu_read_lock_bh();
err = x->outer_mode->output(x, skb);
- if (err)
+ rcu_read_unlock_bh();
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
goto error;
+ }
err = x->type->output(x, skb);
- if (err)
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
goto error;
-
+ }
+ spin_lock_bh(&x->lock);
x->curlft.bytes += skb->len;
x->curlft.packets++;
+ spin_unlock_bh(&x->lock);
error:
- spin_unlock(&x->lock);
return err;
}
@@ -2559,6 +2570,8 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
if (x) {
int ret;
__u8 *eth;
+ struct iphdr *iph;
+
nhead = x->props.header_len - skb_headroom(skb);
if (nhead > 0) {
ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
@@ -2580,6 +2593,11 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
eth = (__u8 *) skb_push(skb, ETH_HLEN);
memcpy(eth, pkt_dev->hh, 12);
*(u16 *) &eth[12] = protocol;
+
+ /* Update IPv4 header len as well as checksum value */
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(skb->len - ETH_HLEN);
+ ip_send_check(iph);
}
}
return 1;
@@ -2591,7 +2609,7 @@ err:
static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < pkt_dev->nr_labels; i++)
*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
@@ -2605,6 +2623,95 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,
return htons(id | (cfi << 12) | (prio << 13));
}
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timeval timestamp;
+ struct pktgen_hdr *pgh;
+
+ pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ memset(skb_put(skb, datalen), 0, datalen);
+ } else {
+ int frags = pkt_dev->nfrags;
+ int i, len;
+ int frag_len;
+
+
+ if (frags > MAX_SKB_FRAGS)
+ frags = MAX_SKB_FRAGS;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ memset(skb_put(skb, len), 0, len);
+ datalen = frags * PAGE_SIZE;
+ }
+
+ i = 0;
+ frag_len = (datalen/frags) < PAGE_SIZE ?
+ (datalen/frags) : PAGE_SIZE;
+ while (datalen > 0) {
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ get_page(pkt_dev->page);
+ skb_frag_set_page(skb, i, pkt_dev->page);
+ skb_shinfo(skb)->frags[i].page_offset = 0;
+ /* last fragment, fill rest of data */
+ if (i == (frags - 1))
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i],
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+ else
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+ }
+
+ /* Stamp the time, and sequence number,
+ * convert them to network byte order
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
+
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+}
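
pktgen_finalize_skb() stamps each packet with a magic value, sequence number and timestamp in network byte order at the start of the UDP payload, so a receiver can detect loss and measure latency. A receiver-side decoding sketch; the wire struct below mirrors the fields written here and its layout is an assumption for illustration (see struct pktgen_hdr for the authoritative definition):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PKTGEN_MAGIC 0xbe9be955

    /* Mirrors the fields pktgen_finalize_skb() writes at the start of the UDP
     * payload; a sketch only, not the kernel struct itself. */
    struct pktgen_hdr_wire {
            uint32_t pgh_magic;
            uint32_t seq_num;
            uint32_t tv_sec;
            uint32_t tv_usec;
    };

    static int decode(const struct pktgen_hdr_wire *pgh)
    {
            if (ntohl(pgh->pgh_magic) != PKTGEN_MAGIC)
                    return -1;                      /* not a pktgen packet */
            printf("seq=%u sent at %u.%06u\n",
                   ntohl(pgh->seq_num), ntohl(pgh->tv_sec), ntohl(pgh->tv_usec));
            return 0;
    }

    int main(void)
    {
            struct pktgen_hdr_wire pgh = {
                    .pgh_magic = htonl(PKTGEN_MAGIC),
                    .seq_num   = htonl(42),
                    .tv_sec    = htonl(1),
                    .tv_usec   = htonl(500000),
            };
            return decode(&pgh);
    }
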
+
+static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
+ struct pktgen_dev *pkt_dev,
+ unsigned int extralen)
+{
+ struct sk_buff *skb = NULL;
+ unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen +
+ pkt_dev->pkt_overhead;
+
+ if (pkt_dev->flags & F_NODE) {
+ int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
+
+ skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node);
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+ }
+ } else {
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ }
+
+ return skb;
+}
+
static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
@@ -2613,7 +2720,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct udphdr *udph;
int datalen, iplen;
struct iphdr *iph;
- struct pktgen_hdr *pgh = NULL;
__be16 protocol = htons(ETH_P_IP);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -2636,31 +2742,13 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
datalen = (odev->hard_header_len + 16) & ~0xf;
- if (pkt_dev->flags & F_NODE) {
- int node;
-
- if (pkt_dev->node >= 0)
- node = pkt_dev->node;
- else
- node = numa_node_id();
-
- skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64
- + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node);
- if (likely(skb)) {
- skb_reserve(skb, NET_SKB_PAD);
- skb->dev = odev;
- }
- }
- else
- skb = __netdev_alloc_skb(odev,
- pkt_dev->cur_pkt_size + 64
- + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT);
-
+ skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
+ prefetchw(skb->data);
skb_reserve(skb, datalen);
/* Reserve for ethernet and IP header */
@@ -2686,28 +2774,28 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IP);
}
- skb->network_header = skb->tail;
- skb->transport_header = skb->network_header + sizeof(struct iphdr);
- skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr));
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, skb->len);
+ iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
skb_set_queue_mapping(skb, queue_map);
skb->priority = pkt_dev->skb_priority;
- iph = ip_hdr(skb);
- udph = udp_hdr(skb);
-
memcpy(eth, pkt_dev->hh, 12);
*(__be16 *) & eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
pkt_dev->pkt_overhead;
- if (datalen < sizeof(struct pktgen_hdr))
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
datalen = sizeof(struct pktgen_hdr);
udph->source = htons(pkt_dev->cur_udp_src);
udph->dest = htons(pkt_dev->cur_udp_dst);
udph->len = htons(datalen + 8); /* DATA + udphdr */
- udph->check = 0; /* No checksum */
+ udph->check = 0;
iph->ihl = 5;
iph->version = 4;
@@ -2721,83 +2809,29 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
iph->frag_off = 0;
iplen = 20 + 8 + datalen;
iph->tot_len = htons(iplen);
- iph->check = 0;
- iph->check = ip_fast_csum((void *)iph, iph->ihl);
+ ip_send_check(iph);
skb->protocol = protocol;
- skb->mac_header = (skb->network_header - ETH_HLEN -
- pkt_dev->pkt_overhead);
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- if (pkt_dev->nfrags <= 0) {
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr));
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & NETIF_F_V4_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+ udp4_hwcsum(skb, udph->source, udph->dest);
} else {
- int frags = pkt_dev->nfrags;
- int i, len;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- len = datalen - frags * PAGE_SIZE;
- memset(skb_put(skb, len), 0, len);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
+ __wsum csum = udp_csum(skb);
- skb_shinfo(skb)->frags[i - 1].size -= rem;
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_tcpudp_magic(udph->source, udph->dest,
+ datalen + 8, IPPROTO_UDP, csum);
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
}
- /* Stamp the time, and sequence number,
- * convert them to network byte order
- */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
#ifdef CONFIG_XFRM
if (!process_ipsec(pkt_dev, skb, protocol))
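
With F_UDPCSUM set, the checksum is either left to hardware (CHECKSUM_PARTIAL plus udp4_hwcsum()) or computed in software over the IPv4 pseudo-header, with a zero result mapped to 0xFFFF because zero means "no checksum" for UDP over IPv4 (RFC 768). A self-contained user-space sketch of the software case over hypothetical addresses and payload:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Add a byte buffer into the 16-bit ones'-complement sum used by IP/UDP. */
    static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
    {
            while (len > 1) {
                    sum += (p[0] << 8) | p[1];
                    p += 2;
                    len -= 2;
            }
            if (len)
                    sum += p[0] << 8;
            return sum;
    }

    static uint16_t csum_fold(uint32_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    int main(void)
    {
            /* hypothetical addresses, ports and payload */
            uint32_t saddr = 0x0a000001, daddr = 0x0a000002;    /* 10.0.0.1 -> 10.0.0.2 */
            uint8_t udp[16] = { 0x30, 0x39, 0x00, 0x35, 0x00, 0x10, 0x00, 0x00,
                                'p', 'k', 't', 'g', 'e', 'n', '!', '\n' };
            uint8_t pseudo[12];
            uint32_t sum = 0;
            uint16_t check;

            /* IPv4 pseudo-header: saddr, daddr, zero, protocol 17 (UDP), UDP length */
            pseudo[0] = saddr >> 24; pseudo[1] = saddr >> 16;
            pseudo[2] = saddr >> 8;  pseudo[3] = saddr;
            pseudo[4] = daddr >> 24; pseudo[5] = daddr >> 16;
            pseudo[6] = daddr >> 8;  pseudo[7] = daddr;
            pseudo[8] = 0;  pseudo[9] = 17;
            pseudo[10] = 0; pseudo[11] = sizeof(udp);

            sum = csum_add(sum, pseudo, sizeof(pseudo));
            sum = csum_add(sum, udp, sizeof(udp));  /* checksum field is still 0 here */
            check = csum_fold(sum);
            if (check == 0)
                    check = 0xffff;         /* 0 means "no checksum"; cf. CSUM_MANGLED_0 */
            printf("udp check = 0x%04x\n", check);
            return 0;
    }
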
@@ -2807,179 +2841,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
return skb;
}
-/*
- * scan_ip6, fmt_ip taken from dietlibc-0.21
- * Author Felix von Leitner <felix-dietlibc@fefe.de>
- *
- * Slightly modified for kernel.
- * Should be candidate for net/ipv4/utils.c
- * --ro
- */
-
-static unsigned int scan_ip6(const char *s, char ip[16])
-{
- unsigned int i;
- unsigned int len = 0;
- unsigned long u;
- char suffix[16];
- unsigned int prefixlen = 0;
- unsigned int suffixlen = 0;
- __be32 tmp;
- char *pos;
-
- for (i = 0; i < 16; i++)
- ip[i] = 0;
-
- for (;;) {
- if (*s == ':') {
- len++;
- if (s[1] == ':') { /* Found "::", skip to part 2 */
- s += 2;
- len++;
- break;
- }
- s++;
- }
-
- u = simple_strtoul(s, &pos, 16);
- i = pos - s;
- if (!i)
- return 0;
- if (prefixlen == 12 && s[i] == '.') {
-
- /* the last 4 bytes may be written as IPv4 address */
-
- tmp = in_aton(s);
- memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp));
- return i + len;
- }
- ip[prefixlen++] = (u >> 8);
- ip[prefixlen++] = (u & 255);
- s += i;
- len += i;
- if (prefixlen == 16)
- return len;
- }
-
-/* part 2, after "::" */
- for (;;) {
- if (*s == ':') {
- if (suffixlen == 0)
- break;
- s++;
- len++;
- } else if (suffixlen != 0)
- break;
-
- u = simple_strtol(s, &pos, 16);
- i = pos - s;
- if (!i) {
- if (*s)
- len--;
- break;
- }
- if (suffixlen + prefixlen <= 12 && s[i] == '.') {
- tmp = in_aton(s);
- memcpy((struct in_addr *)(suffix + suffixlen), &tmp,
- sizeof(tmp));
- suffixlen += 4;
- len += strlen(s);
- break;
- }
- suffix[suffixlen++] = (u >> 8);
- suffix[suffixlen++] = (u & 255);
- s += i;
- len += i;
- if (prefixlen + suffixlen == 16)
- break;
- }
- for (i = 0; i < suffixlen; i++)
- ip[16 - suffixlen + i] = suffix[i];
- return len;
-}
-
-static char tohex(char hexdigit)
-{
- return hexdigit > 9 ? hexdigit + 'a' - 10 : hexdigit + '0';
-}
-
-static int fmt_xlong(char *s, unsigned int i)
-{
- char *bak = s;
- *s = tohex((i >> 12) & 0xf);
- if (s != bak || *s != '0')
- ++s;
- *s = tohex((i >> 8) & 0xf);
- if (s != bak || *s != '0')
- ++s;
- *s = tohex((i >> 4) & 0xf);
- if (s != bak || *s != '0')
- ++s;
- *s = tohex(i & 0xf);
- return s - bak + 1;
-}
-
-static unsigned int fmt_ip6(char *s, const char ip[16])
-{
- unsigned int len;
- unsigned int i;
- unsigned int temp;
- unsigned int compressing;
- int j;
-
- len = 0;
- compressing = 0;
- for (j = 0; j < 16; j += 2) {
-
-#ifdef V4MAPPEDPREFIX
- if (j == 12 && !memcmp(ip, V4mappedprefix, 12)) {
- inet_ntoa_r(*(struct in_addr *)(ip + 12), s);
- temp = strlen(s);
- return len + temp;
- }
-#endif
- temp = ((unsigned long)(unsigned char)ip[j] << 8) +
- (unsigned long)(unsigned char)ip[j + 1];
- if (temp == 0) {
- if (!compressing) {
- compressing = 1;
- if (j == 0) {
- *s++ = ':';
- ++len;
- }
- }
- } else {
- if (compressing) {
- compressing = 0;
- *s++ = ':';
- ++len;
- }
- i = fmt_xlong(s, temp);
- len += i;
- s += i;
- if (j < 14) {
- *s++ = ':';
- ++len;
- }
- }
- }
- if (compressing) {
- *s++ = ':';
- ++len;
- }
- *s = 0;
- return len;
-}
-
static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
struct sk_buff *skb = NULL;
__u8 *eth;
struct udphdr *udph;
- int datalen;
+ int datalen, udplen;
struct ipv6hdr *iph;
- struct pktgen_hdr *pgh = NULL;
__be16 protocol = htons(ETH_P_IPV6);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -3000,14 +2869,13 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- skb = __netdev_alloc_skb(odev,
- pkt_dev->cur_pkt_size + 64
- + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT);
+ skb = pktgen_alloc_skb(odev, pkt_dev, 16);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
+ prefetchw(skb->data);
skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
@@ -3033,13 +2901,14 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IPV6);
}
- skb->network_header = skb->tail;
- skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
- skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr));
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, skb->len);
+ iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
skb_set_queue_mapping(skb, queue_map);
skb->priority = pkt_dev->skb_priority;
- iph = ipv6_hdr(skb);
- udph = udp_hdr(skb);
memcpy(eth, pkt_dev->hh, 12);
*(__be16 *) &eth[12] = protocol;
@@ -3049,16 +2918,16 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
pkt_dev->pkt_overhead;
- if (datalen < sizeof(struct pktgen_hdr)) {
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
datalen = sizeof(struct pktgen_hdr);
- if (net_ratelimit())
- pr_info("increased datalen to %d\n", datalen);
+ net_info_ratelimited("increased datalen to %d\n", datalen);
}
+ udplen = datalen + sizeof(struct udphdr);
udph->source = htons(pkt_dev->cur_udp_src);
udph->dest = htons(pkt_dev->cur_udp_dst);
- udph->len = htons(datalen + sizeof(struct udphdr));
- udph->check = 0; /* No checksum */
+ udph->len = htons(udplen);
+ udph->check = 0;
*(__be32 *) iph = htonl(0x60000000); /* Version + flow */
@@ -3069,87 +2938,34 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
iph->hop_limit = 32;
- iph->payload_len = htons(sizeof(struct udphdr) + datalen);
+ iph->payload_len = htons(udplen);
iph->nexthdr = IPPROTO_UDP;
- ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
- ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
- skb->mac_header = (skb->network_header - ETH_HLEN -
- pkt_dev->pkt_overhead);
skb->protocol = protocol;
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
- int frags = pkt_dev->nfrags;
- int i;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- skb_put(skb, datalen - frags * PAGE_SIZE);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & NETIF_F_V6_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
+ } else {
+ __wsum csum = udp_csum(skb);
- skb_shinfo(skb)->frags[i - 1].size -= rem;
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum);
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
}
- /* Stamp the time, and sequence number,
- * convert them to network byte order
- * should we update cloned packets too ?
- */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
- /* pkt_dev->seq_num++; FF: you really mean this? */
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
return skb;
}
@@ -3193,8 +3009,7 @@ static void pktgen_run(struct pktgen_thread *t)
pktgen_clear_counters(pkt_dev);
pkt_dev->running = 1; /* Cranke yeself! */
pkt_dev->skb = NULL;
- pkt_dev->started_at =
- pkt_dev->next_tx = ktime_now();
+ pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
set_pkt_overhead(pkt_dev);
@@ -3208,7 +3023,7 @@ static void pktgen_run(struct pktgen_thread *t)
t->control &= ~(T_STOP);
}
-static void pktgen_stop_all_threads_ifs(void)
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
{
struct pktgen_thread *t;
@@ -3216,7 +3031,7 @@ static void pktgen_stop_all_threads_ifs(void)
mutex_lock(&pktgen_thread_lock);
- list_for_each_entry(t, &pktgen_threads, th_list)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= T_STOP;
mutex_unlock(&pktgen_thread_lock);
@@ -3252,28 +3067,28 @@ signal:
return 0;
}
-static int pktgen_wait_all_threads_run(void)
+static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
{
struct pktgen_thread *t;
int sig = 1;
mutex_lock(&pktgen_thread_lock);
- list_for_each_entry(t, &pktgen_threads, th_list) {
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
sig = pktgen_wait_thread_run(t);
if (sig == 0)
break;
}
if (sig == 0)
- list_for_each_entry(t, &pktgen_threads, th_list)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_STOP);
mutex_unlock(&pktgen_thread_lock);
return sig;
}
-static void pktgen_run_all_threads(void)
+static void pktgen_run_all_threads(struct pktgen_net *pn)
{
struct pktgen_thread *t;
@@ -3281,7 +3096,7 @@ static void pktgen_run_all_threads(void)
mutex_lock(&pktgen_thread_lock);
- list_for_each_entry(t, &pktgen_threads, th_list)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_RUN);
mutex_unlock(&pktgen_thread_lock);
@@ -3289,10 +3104,10 @@ static void pktgen_run_all_threads(void)
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
- pktgen_wait_all_threads_run();
+ pktgen_wait_all_threads_run(pn);
}
-static void pktgen_reset_all_threads(void)
+static void pktgen_reset_all_threads(struct pktgen_net *pn)
{
struct pktgen_thread *t;
@@ -3300,7 +3115,7 @@ static void pktgen_reset_all_threads(void)
mutex_lock(&pktgen_thread_lock);
- list_for_each_entry(t, &pktgen_threads, th_list)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_REMDEVALL);
mutex_unlock(&pktgen_thread_lock);
@@ -3308,7 +3123,7 @@ static void pktgen_reset_all_threads(void)
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
- pktgen_wait_all_threads_run();
+ pktgen_wait_all_threads_run(pn);
}
static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
@@ -3319,7 +3134,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
pkt_dev->started_at);
ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
- p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n",
+ p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
(unsigned long long)ktime_to_us(elapsed),
(unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
(unsigned long long)ktime_to_us(idle),
@@ -3353,7 +3168,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
- pkt_dev->stopped_at = ktime_now();
+ pkt_dev->stopped_at = ktime_get();
pkt_dev->running = 0;
show_results(pkt_dev, nr_frags);
@@ -3372,7 +3187,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
continue;
if (best == NULL)
best = pkt_dev;
- else if (ktime_lt(pkt_dev->next_tx, best->next_tx))
+ else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
best = pkt_dev;
}
if_unlock(t);
@@ -3450,21 +3265,19 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
static void pktgen_rem_thread(struct pktgen_thread *t)
{
/* Remove from the thread list */
-
- remove_proc_entry(t->tsk->comm, pg_proc_dir);
-
+ remove_proc_entry(t->tsk->comm, t->net->proc_dir);
}
static void pktgen_resched(struct pktgen_dev *pkt_dev)
{
- ktime_t idle_start = ktime_now();
+ ktime_t idle_start = ktime_get();
schedule();
- pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
}
static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
{
- ktime_t idle_start = ktime_now();
+ ktime_t idle_start = ktime_get();
while (atomic_read(&(pkt_dev->skb->users)) != 1) {
if (signal_pending(current))
@@ -3475,7 +3288,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
else
cpu_relax();
}
- pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
}
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
@@ -3497,7 +3310,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
* "never transmit"
*/
if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
- pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX);
+ pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);
return;
}
@@ -3525,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
queue_map = skb_get_queue_mapping(pkt_dev->skb);
txq = netdev_get_tx_queue(odev, queue_map);
- __netif_tx_lock_bh(txq);
+ local_bh_disable();
- if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) {
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
+
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
@@ -3550,9 +3365,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->errors++;
break;
default: /* Drivers are not supposed to return other values! */
- if (net_ratelimit())
- pr_info("pktgen: %s xmit error: %d\n",
- pkt_dev->odevname, ret);
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
pkt_dev->errors++;
/* fallthru */
case NETDEV_TX_LOCKED:
@@ -3562,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
}
unlock:
- __netif_tx_unlock_bh(txq);
+ HARD_TX_UNLOCK(odev, txq);
+
+ local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
@@ -3599,7 +3415,7 @@ static int pktgen_thread_worker(void *arg)
pkt_dev = next_to_run(t);
if (unlikely(!pkt_dev && t->control == 0)) {
- if (pktgen_exiting)
+ if (t->net->pktgen_exiting)
break;
wait_event_interruptible_timeout(t->queue,
t->control != 0,
@@ -3721,7 +3537,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
/* We don't allow a device to be on several threads */
- pkt_dev = __pktgen_NN_threads(ifname, FIND);
+ pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);
if (pkt_dev) {
pr_err("ERROR: interface already used\n");
return -EBUSY;
@@ -3732,19 +3548,15 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
return -ENOMEM;
strcpy(pkt_dev->odevname, ifname);
- pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
+ pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
node);
if (pkt_dev->flows == NULL) {
kfree(pkt_dev);
return -ENOMEM;
}
- memset(pkt_dev->flows, 0, MAX_CFLOWS * sizeof(struct flow_state));
pkt_dev->removal_mark = 0;
- pkt_dev->min_pkt_size = ETH_ZLEN;
- pkt_dev->max_pkt_size = ETH_ZLEN;
pkt_dev->nfrags = 0;
- pkt_dev->clone_skb = pg_clone_skb_d;
pkt_dev->delay = pg_delay_d;
pkt_dev->count = pg_count_d;
pkt_dev->sofar = 0;
@@ -3752,7 +3564,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->udp_src_max = 9;
pkt_dev->udp_dst_min = 9;
pkt_dev->udp_dst_max = 9;
-
pkt_dev->vlan_p = 0;
pkt_dev->vlan_cfi = 0;
pkt_dev->vlan_id = 0xffff;
@@ -3761,11 +3572,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_id = 0xffff;
pkt_dev->node = -1;
- err = pktgen_setup_dev(pkt_dev, ifname);
+ err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
goto out1;
+ if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+ pkt_dev->clone_skb = pg_clone_skb_d;
- pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir,
+ pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
&pktgen_if_fops, pkt_dev);
if (!pkt_dev->entry) {
pr_err("cannot create %s/%s procfs entry\n",
@@ -3776,6 +3589,17 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
#ifdef CONFIG_XFRM
pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
pkt_dev->ipsproto = IPPROTO_ESP;
+
+	/* xfrm tunnel mode needs an additional dst to extract the outer
+	 * IP header protocol/ttl/id fields, so create a phony one here
+	 * instead of looking up a valid rt, which would definitely hurt
+	 * performance in this situation.
+ */
+ pkt_dev->dstops.family = AF_INET;
+ pkt_dev->dst.dev = pkt_dev->odev;
+ dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
+ pkt_dev->dst.child = &pkt_dev->dst;
+ pkt_dev->dst.ops = &pkt_dev->dstops;
#endif
return add_dev_to_thread(t, pkt_dev);
@@ -3790,7 +3614,7 @@ out1:
return err;
}
-static int __init pktgen_create_thread(int cpu)
+static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
{
struct pktgen_thread *t;
struct proc_dir_entry *pe;
@@ -3808,10 +3632,13 @@ static int __init pktgen_create_thread(int cpu)
INIT_LIST_HEAD(&t->if_list);
- list_add_tail(&t->th_list, &pktgen_threads);
+ list_add_tail(&t->th_list, &pn->pktgen_threads);
init_completion(&t->start_done);
- p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu);
+ p = kthread_create_on_node(pktgen_thread_worker,
+ t,
+ cpu_to_node(cpu),
+ "kpktgend_%d", cpu);
if (IS_ERR(p)) {
pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
@@ -3821,7 +3648,7 @@ static int __init pktgen_create_thread(int cpu)
kthread_bind(p, cpu);
t->tsk = p;
- pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir,
+ pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
&pktgen_thread_fops, t);
if (!pe) {
pr_err("cannot create %s/%s procfs entry\n",
@@ -3832,6 +3659,7 @@ static int __init pktgen_create_thread(int cpu)
return -EINVAL;
}
+ t->net = pn;
wake_up_process(p);
wait_for_completion(&t->start_done);
@@ -3857,7 +3685,6 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t,
static int pktgen_remove_device(struct pktgen_thread *t,
struct pktgen_dev *pkt_dev)
{
-
pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
if (pkt_dev->running) {
@@ -3877,77 +3704,114 @@ static int pktgen_remove_device(struct pktgen_thread *t,
_rem_dev_from_if_list(t, pkt_dev);
if (pkt_dev->entry)
- remove_proc_entry(pkt_dev->entry->name, pg_proc_dir);
+ proc_remove(pkt_dev->entry);
#ifdef CONFIG_XFRM
free_SAs(pkt_dev);
#endif
vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
kfree(pkt_dev);
return 0;
}
-static int __init pg_init(void)
+static int __net_init pg_net_init(struct net *net)
{
- int cpu;
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
struct proc_dir_entry *pe;
-
- pr_info("%s", version);
-
- pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net);
- if (!pg_proc_dir)
+ int cpu, ret = 0;
+
+ pn->net = net;
+ INIT_LIST_HEAD(&pn->pktgen_threads);
+ pn->pktgen_exiting = false;
+ pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
+ if (!pn->proc_dir) {
+ pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
return -ENODEV;
-
- pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops);
+ }
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);
if (pe == NULL) {
- pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL);
- proc_net_remove(&init_net, PG_PROC_DIR);
- return -EINVAL;
+ pr_err("cannot create %s procfs entry\n", PGCTRL);
+ ret = -EINVAL;
+ goto remove;
}
- /* Register us to receive netdevice events */
- register_netdevice_notifier(&pktgen_notifier_block);
-
for_each_online_cpu(cpu) {
int err;
- err = pktgen_create_thread(cpu);
+ err = pktgen_create_thread(cpu, pn);
if (err)
- pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n",
+ pr_warn("Cannot create thread for cpu %d (%d)\n",
cpu, err);
}
- if (list_empty(&pktgen_threads)) {
- pr_err("ERROR: Initialization failed for all threads\n");
- unregister_netdevice_notifier(&pktgen_notifier_block);
- remove_proc_entry(PGCTRL, pg_proc_dir);
- proc_net_remove(&init_net, PG_PROC_DIR);
- return -ENODEV;
+ if (list_empty(&pn->pktgen_threads)) {
+ pr_err("Initialization failed for all threads\n");
+ ret = -ENODEV;
+ goto remove_entry;
}
return 0;
+
+remove_entry:
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+remove:
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+ return ret;
}
-static void __exit pg_cleanup(void)
+static void __net_exit pg_net_exit(struct net *net)
{
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
struct pktgen_thread *t;
struct list_head *q, *n;
+ LIST_HEAD(list);
/* Stop all interfaces & threads */
- pktgen_exiting = true;
+ pn->pktgen_exiting = true;
- list_for_each_safe(q, n, &pktgen_threads) {
+ mutex_lock(&pktgen_thread_lock);
+ list_splice_init(&pn->pktgen_threads, &list);
+ mutex_unlock(&pktgen_thread_lock);
+
+ list_for_each_safe(q, n, &list) {
t = list_entry(q, struct pktgen_thread, th_list);
+ list_del(&t->th_list);
kthread_stop(t->tsk);
kfree(t);
}
- /* Un-register us from receiving netdevice events */
- unregister_netdevice_notifier(&pktgen_notifier_block);
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+}
+
+static struct pernet_operations pg_net_ops = {
+ .init = pg_net_init,
+ .exit = pg_net_exit,
+ .id = &pg_net_id,
+ .size = sizeof(struct pktgen_net),
+};
+
+static int __init pg_init(void)
+{
+ int ret = 0;
+
+ pr_info("%s", version);
+ ret = register_pernet_subsys(&pg_net_ops);
+ if (ret)
+ return ret;
+ ret = register_netdevice_notifier(&pktgen_notifier_block);
+ if (ret)
+ unregister_pernet_subsys(&pg_net_ops);
- /* Clean up proc file system */
- remove_proc_entry(PGCTRL, pg_proc_dir);
- proc_net_remove(&init_net, PG_PROC_DIR);
+ return ret;
+}
+
+static void __exit pg_cleanup(void)
+{
+ unregister_netdevice_notifier(&pktgen_notifier_block);
+ unregister_pernet_subsys(&pg_net_ops);
}
module_init(pg_init);
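For context, the pg_net_init()/pg_net_exit() pair above follows the stock pernet_operations idiom: with .id and .size set, the core allocates the per-namespace area and the subsystem retrieves it with net_generic(). A compressed sketch of that idiom with placeholder example_* names (not part of this commit):

#include <linux/module.h>
#include <linux/list.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int example_net_id __read_mostly;

struct example_net {
	struct list_head items;			/* per-namespace state */
};

static int __net_init example_net_init(struct net *net)
{
	struct example_net *en = net_generic(net, example_net_id);

	INIT_LIST_HEAD(&en->items);		/* runs once per namespace */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	struct example_net *en = net_generic(net, example_net_id);

	WARN_ON(!list_empty(&en->items));	/* everything must be torn down */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
	.id   = &example_net_id,
	.size = sizeof(struct example_net),	/* allocated by the core per net */
};

static int __init example_init(void)
{
	return register_pernet_subsys(&example_net_ops);	/* as pg_init() does */
}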
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 00000000000..d3027a73fd4
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,141 @@
+/* PTP classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x40 ; PTP_CLASS_V2_VLAN
+ * ret a ; return PTP class
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x30 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct sk_filter *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return SK_RUN_FILTER(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+void __init ptp_classifier_init(void)
+{
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 15, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 12, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000040 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000030 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog = {
+ .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
+ };
+
+ BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog));
+}
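For orientation, the opcode array above is plain classic BPF, the same sock_filter/sock_fprog format that userspace can attach to a socket with SO_ATTACH_FILTER. A small standalone sketch (not part of this commit) that keeps only IPv4/UDP packets to port 319, the PTP event port the classifier also tests:

#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/filter.h>
#include <linux/if_ether.h>

int main(void)
{
	/* cBPF: accept IPv4/UDP to destination port 319, drop everything else */
	struct sock_filter insns[] = {
		{ 0x28, 0, 0, 12 },		/* ldh [12]          ; ethertype     */
		{ 0x15, 0, 6, ETH_P_IP },	/* jneq #0x800, drop                 */
		{ 0x30, 0, 0, 23 },		/* ldb [23]          ; IP protocol   */
		{ 0x15, 0, 4, 17 },		/* jneq #17, drop    ; IPPROTO_UDP   */
		{ 0xb1, 0, 0, 14 },		/* ldxb 4*([14]&0xf) ; IP header len */
		{ 0x48, 0, 0, 16 },		/* ldh [x + 16]      ; UDP dst port  */
		{ 0x15, 0, 1, 319 },		/* jneq #319, drop                   */
		{ 0x06, 0, 0, 0xffff },		/* ret #0xffff       ; accept        */
		{ 0x06, 0, 0, 0 },		/* drop: ret #0                      */
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));	/* needs CAP_NET_RAW */

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				 &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	/* recv() on fd now only returns frames matching the filter */
	return 0;
}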
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 41d99435f62..467f326126e 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <net/request_sock.h>
@@ -26,10 +27,11 @@
* but then some measure against one socket starving all other sockets
* would be needed.
*
- * It was 128 by default. Experiments with real servers show, that
+ * Its minimum value is 128. Experiments with real servers show that
* it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * of problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of the machine.
  * Note: Don't forget somaxconn, which may limit backlog too.
*/
int sysctl_max_syn_backlog = 256;
@@ -46,9 +48,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size > PAGE_SIZE)
- lopt = __vmalloc(lopt_size,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ lopt = vzalloc(lopt_size);
else
lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL)
@@ -131,3 +131,94 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
kfree(lopt);
}
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * Only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and drops a reference to
+ * the listener socket. treq->listener is used by the listener so it is
+ * protected by the fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = tcp_rsk(req)->listener;
+ struct fastopen_queue *fastopenq =
+ inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ tcp_sk(sk)->fastopen_rsk = NULL;
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->listener = NULL;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ sock_put(lsk);
+ reqsk_free(req);
+ return;
+ }
+	/* Wait for 60 seconds before removing a req that has triggered RST.
+	 * This is a simple defense against a TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+ sock_put(lsk);
+}
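For context, fastopenq and its max_qlen exist only once a listener has enabled TCP Fast Open via the TCP_FASTOPEN socket option; a minimal server-side sketch (assuming the server bit of net.ipv4.tcp_fastopen is set; port and queue length are purely illustrative):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int qlen = 16;	/* becomes the per-listener fastopenq->max_qlen */

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}
	/* Enable TFO on the listener; SYN+data may then create child sockets
	 * whose request_socks follow the lifetime rules described above.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("TCP_FASTOPEN");
	if (listen(fd, 128) < 0) {
		perror("listen");
		return 1;
	}
	return 0;
}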
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 750db57f3bb..1063996f831 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -35,10 +35,11 @@
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/if_addr.h>
+#include <linux/if_bridge.h>
#include <linux/pci.h>
+#include <linux/etherdevice.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -56,6 +57,7 @@
struct rtnl_link {
rtnl_doit_func doit;
rtnl_dumpit_func dumpit;
+ rtnl_calcit_func calcit;
};
static DEFINE_MUTEX(rtnl_mutex);
@@ -126,7 +128,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
if (tab == NULL || tab[msgindex].doit == NULL)
tab = rtnl_msg_handlers[PF_UNSPEC];
- return tab ? tab[msgindex].doit : NULL;
+ return tab[msgindex].doit;
}
static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
@@ -141,7 +143,22 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
if (tab == NULL || tab[msgindex].dumpit == NULL)
tab = rtnl_msg_handlers[PF_UNSPEC];
- return tab ? tab[msgindex].dumpit : NULL;
+ return tab[msgindex].dumpit;
+}
+
+static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
+{
+ struct rtnl_link *tab;
+
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].calcit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].calcit;
}
/**
@@ -150,6 +167,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @calcit: Function pointer to calc size of dump message
*
* Registers the specified function pointers (at least one of them has
* to be non-NULL) to be called whenever a request message for the
@@ -162,7 +180,8 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
* Returns 0 on success or a negative error code.
*/
int __rtnl_register(int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ rtnl_calcit_func calcit)
{
struct rtnl_link *tab;
int msgindex;
@@ -185,6 +204,9 @@ int __rtnl_register(int protocol, int msgtype,
if (dumpit)
tab[msgindex].dumpit = dumpit;
+ if (calcit)
+ tab[msgindex].calcit = calcit;
+
return 0;
}
EXPORT_SYMBOL_GPL(__rtnl_register);
@@ -196,12 +218,13 @@ EXPORT_SYMBOL_GPL(__rtnl_register);
* as failure of this function is very unlikely, it can only happen due
* to lack of memory when allocating the chain to store all message
* handlers for a protocol. Meant for use in init functions where lack
- * of memory implies no sense in continueing.
+ * of memory implies no sense in continuing.
*/
void rtnl_register(int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ rtnl_calcit_func calcit)
{
- if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0)
+ if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)
panic("Unable to register rtnetlink message handler, "
"protocol = %d, message type = %d\n",
protocol, msgtype);
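For orientation, with the new calcit slot every caller of rtnl_register()/__rtnl_register() now supplies up to three callbacks (doit, dumpit, calcit; at least one must be non-NULL). A hypothetical registration using placeholder foo_* handlers, with signatures as used by this file (the names and the chosen family are not part of this commit):

#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/rtnetlink.h>

static int foo_getlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	return 0;				/* handle a single GETLINK request */
}

static int foo_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;			/* fill skb with NLM_F_MULTI entries */
}

static u16 foo_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	return NLMSG_DEFAULT_SIZE;		/* minimum dump buffer to allocate */
}

static int __init foo_rtnl_init(void)
{
	rtnl_register(PF_BRIDGE, RTM_GETLINK,
		      foo_getlink, foo_dumplink, foo_calcit);
	return 0;
}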
@@ -250,6 +273,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all);
static LIST_HEAD(link_ops);
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+ const struct rtnl_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind))
+ return ops;
+ }
+ return NULL;
+}
+
/**
* __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
@@ -262,6 +296,9 @@ static LIST_HEAD(link_ops);
*/
int __rtnl_link_register(struct rtnl_link_ops *ops)
{
+ if (rtnl_link_ops_get(ops->kind))
+ return -EEXIST;
+
if (!ops->dellink)
ops->dellink = unregister_netdevice_queue;
@@ -316,27 +353,63 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)
}
EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ for_each_net(net) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
/**
* rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
* @ops: struct rtnl_link_ops * to unregister
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
- rtnl_lock();
+ /* Close the race with cleanup_net() */
+ mutex_lock(&net_mutex);
+ rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
+ mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
{
+ struct net_device *master_dev;
const struct rtnl_link_ops *ops;
- list_for_each_entry(ops, &link_ops, list) {
- if (!strcmp(ops->kind, kind))
- return ops;
- }
- return NULL;
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops || !ops->get_slave_size)
+ return 0;
+ /* IFLA_INFO_SLAVE_DATA + nested data */
+ return nla_total_size(sizeof(struct nlattr)) +
+ ops->get_slave_size(master_dev, dev);
}
static size_t rtnl_link_get_size(const struct net_device *dev)
@@ -359,6 +432,8 @@ static size_t rtnl_link_get_size(const struct net_device *dev)
/* IFLA_INFO_XSTATS */
size += nla_total_size(ops->get_xstats_size(dev));
+ size += rtnl_link_get_slave_info_data_size(dev);
+
return size;
}
@@ -377,34 +452,16 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
}
/**
- * __rtnl_af_register - Register rtnl_af_ops with rtnetlink.
- * @ops: struct rtnl_af_ops * to register
- *
- * The caller must hold the rtnl_mutex.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_af_register(struct rtnl_af_ops *ops)
-{
- list_add_tail(&ops->list, &rtnl_af_ops);
- return 0;
-}
-EXPORT_SYMBOL_GPL(__rtnl_af_register);
-
-/**
* rtnl_af_register - Register rtnl_af_ops with rtnetlink.
* @ops: struct rtnl_af_ops * to register
*
* Returns 0 on success or a negative error code.
*/
-int rtnl_af_register(struct rtnl_af_ops *ops)
+void rtnl_af_register(struct rtnl_af_ops *ops)
{
- int err;
-
rtnl_lock();
- err = __rtnl_af_register(ops);
+ list_add_tail(&ops->list, &rtnl_af_ops);
rtnl_unlock();
- return err;
}
EXPORT_SYMBOL_GPL(rtnl_af_register);
@@ -451,84 +508,107 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev)
return size;
}
-static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+static bool rtnl_have_link_slave_info(const struct net_device *dev)
{
- const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
- struct nlattr *linkinfo, *data;
- int err = -EMSGSIZE;
+ struct net_device *master_dev;
- linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
- if (linkinfo == NULL)
- goto out;
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (master_dev && master_dev->rtnl_link_ops)
+ return true;
+ return false;
+}
+
+static int rtnl_link_slave_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ struct nlattr *slave_data;
+ int err;
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_slave_info) {
+ slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA);
+ if (!slave_data)
+ return -EMSGSIZE;
+ err = ops->fill_slave_info(skb, master_dev, dev);
+ if (err < 0)
+ goto err_cancel_slave_data;
+ nla_nest_end(skb, slave_data);
+ }
+ return 0;
+
+err_cancel_slave_data:
+ nla_nest_cancel(skb, slave_data);
+ return err;
+}
+
+static int rtnl_link_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ struct nlattr *data;
+ int err;
+
+ if (!ops)
+ return 0;
if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
- goto err_cancel_link;
+ return -EMSGSIZE;
if (ops->fill_xstats) {
err = ops->fill_xstats(skb, dev);
if (err < 0)
- goto err_cancel_link;
+ return err;
}
if (ops->fill_info) {
data = nla_nest_start(skb, IFLA_INFO_DATA);
if (data == NULL)
- goto err_cancel_link;
+ return -EMSGSIZE;
err = ops->fill_info(skb, dev);
if (err < 0)
goto err_cancel_data;
nla_nest_end(skb, data);
}
-
- nla_nest_end(skb, linkinfo);
return 0;
err_cancel_data:
nla_nest_cancel(skb, data);
-err_cancel_link:
- nla_nest_cancel(skb, linkinfo);
-out:
return err;
}
-static const int rtm_min[RTM_NR_FAMILIES] =
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
{
- [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
- [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
- [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)),
- [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
- [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)),
- [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
- [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
-};
+ struct nlattr *linkinfo;
+ int err = -EMSGSIZE;
-static const int rta_max[RTM_NR_FAMILIES] =
-{
- [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX,
- [RTM_FAM(RTM_NEWADDR)] = IFA_MAX,
- [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX,
- [RTM_FAM(RTM_NEWRULE)] = FRA_MAX,
- [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
- [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
-};
+ linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+ if (linkinfo == NULL)
+ goto out;
-void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
-{
- struct rtattr *rta;
- int size = RTA_LENGTH(attrlen);
+ err = rtnl_link_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ err = rtnl_link_slave_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ nla_nest_end(skb, linkinfo);
+ return 0;
- rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size));
- rta->rta_type = attrtype;
- rta->rta_len = size;
- memcpy(RTA_DATA(rta), data, attrlen);
- memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
+err_cancel_link:
+ nla_nest_cancel(skb, linkinfo);
+out:
+ return err;
}
-EXPORT_SYMBOL(__rta_fill);
-int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
struct sock *rtnl = net->rtnl;
int err = 0;
@@ -583,7 +663,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
for (i = 0; i < RTAX_MAX; i++) {
if (metrics[i]) {
valid++;
- NLA_PUT_U32(skb, i+1, metrics[i]);
+ if (nla_put_u32(skb, i+1, metrics[i]))
+ goto nla_put_failure;
}
}
@@ -601,21 +682,23 @@ nla_put_failure:
EXPORT_SYMBOL(rtnetlink_put_metrics);
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
- u32 ts, u32 tsage, long expires, u32 error)
+ long expires, u32 error)
{
struct rta_cacheinfo ci = {
- .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
+ .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
.rta_used = dst->__use,
.rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
- .rta_ts = ts,
- .rta_tsage = tsage,
};
- if (expires)
- ci.rta_expires = jiffies_to_clock_t(expires);
+ if (expires) {
+ unsigned long clock;
+ clock = jiffies_to_clock_t(abs(expires));
+ clock = min_t(unsigned long, clock, INT_MAX);
+ ci.rta_expires = (expires > 0) ? clock : -clock;
+ }
return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
@@ -647,6 +730,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
}
}
+static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
+{
+ return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) |
+ (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI));
+}
+
static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
const struct ifinfomsg *ifm)
{
@@ -655,7 +744,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
/* bugwards compatibility: ifi_change == 0 is treated as ~0 */
if (ifm->ifi_change)
flags = (flags & ifm->ifi_change) |
- (dev->flags & ~ifm->ifi_change);
+ (rtnl_dev_get_flags(dev) & ~ifm->ifi_change);
return flags;
}
@@ -698,23 +787,26 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
}
/* All VF info */
-static inline int rtnl_vfinfo_size(const struct net_device *dev)
+static inline int rtnl_vfinfo_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
- if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
-
+ if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
+ (ext_filter_mask & RTEXT_FILTER_VF)) {
int num_vfs = dev_num_vf(dev->dev.parent);
size_t size = nla_total_size(sizeof(struct nlattr));
size += nla_total_size(num_vfs * sizeof(struct nlattr));
size += num_vfs *
(nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
- nla_total_size(sizeof(struct ifla_vf_tx_rate)));
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)));
return size;
} else
return 0;
}
-static size_t rtnl_port_size(const struct net_device *dev)
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
size_t port_size = nla_total_size(4) /* PORT_VF */
+ nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
@@ -730,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev)
size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ port_size;
- if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
return 0;
if (dev_num_vf(dev->dev.parent))
return port_self_size + vf_ports_size +
@@ -739,7 +832,8 @@ static size_t rtnl_port_size(const struct net_device *dev)
return port_self_size;
}
-static noinline size_t if_nlmsg_size(const struct net_device *dev)
+static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
@@ -755,13 +849,20 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev)
+ nla_total_size(4) /* IFLA_MTU */
+ nla_total_size(4) /* IFLA_LINK */
+ nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_CARRIER */
+ + nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
- + nla_total_size(4) /* IFLA_NUM_VF */
- + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
- + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ + nla_total_size(ext_filter_mask
+ & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
- + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
+ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -779,7 +880,8 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
vf_port = nla_nest_start(skb, IFLA_VF_PORT);
if (!vf_port)
goto nla_put_failure;
- NLA_PUT_U32(skb, IFLA_PORT_VF, vf);
+ if (nla_put_u32(skb, IFLA_PORT_VF, vf))
+ goto nla_put_failure;
err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
if (err == -EMSGSIZE)
goto nla_put_failure;
@@ -819,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
-static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
{
int err;
- if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
return 0;
err = rtnl_port_self_fill(skb, dev);
@@ -839,9 +943,27 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_port_id ppid;
+
+ err = dev_get_phys_port_id(dev, &ppid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
- unsigned int flags)
+ unsigned int flags, u32 ext_filter_mask)
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
@@ -849,7 +971,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
const struct rtnl_link_stats64 *stats;
struct nlattr *attr, *af_spec;
struct rtnl_af_ops *af_ops;
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+ ASSERT_RTNL();
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -862,24 +986,30 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
ifm->ifi_flags = dev_get_flags(dev);
ifm->ifi_change = change;
- NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
- NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len);
- NLA_PUT_U8(skb, IFLA_OPERSTATE,
- netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
- NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
- NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
-
- if (dev->ifindex != dev->iflink)
- NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
-
- if (dev->master)
- NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex);
-
- if (dev->qdisc)
- NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id);
-
- if (dev->ifalias)
- NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias);
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u32(skb, IFLA_GROUP, dev->group) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+#ifdef CONFIG_RPS
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+#endif
+ (dev->ifindex != dev->iflink &&
+ nla_put_u32(skb, IFLA_LINK, dev->iflink)) ||
+ (upper_dev &&
+ nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) ||
+ nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
+ (dev->qdisc &&
+ nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
+ (dev->ifalias &&
+ nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_changes)))
+ goto nla_put_failure;
if (1) {
struct rtnl_link_ifmap map = {
@@ -890,14 +1020,19 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
.dma = dev->dma,
.port = dev->if_port,
};
- NLA_PUT(skb, IFLA_MAP, sizeof(map), &map);
+ if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
+ goto nla_put_failure;
}
if (dev->addr_len) {
- NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
- NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+ if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
+ nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
+ goto nla_put_failure;
}
+ if (rtnl_phys_port_id_fill(skb, dev))
+ goto nla_put_failure;
+
attr = nla_reserve(skb, IFLA_STATS,
sizeof(struct rtnl_link_stats));
if (attr == NULL)
@@ -912,10 +1047,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
goto nla_put_failure;
copy_rtnl_link_stats64(nla_data(attr), stats);
- if (dev->dev.parent)
- NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
+ nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
+ goto nla_put_failure;
- if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
+ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
+ && (ext_filter_mask & RTEXT_FILTER_VF)) {
int i;
struct nlattr *vfinfo, *vf;
@@ -928,31 +1065,65 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct ifla_vf_info ivi;
struct ifla_vf_mac vf_mac;
struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_link_state vf_linkstate;
+
+ /*
+ * Not all SR-IOV capable drivers support the
+ * spoofcheck query. Preset to -1 so the user
+ * space tool can detect that the driver didn't
+ * report anything.
+ */
+ ivi.spoofchk = -1;
+ memset(ivi.mac, 0, sizeof(ivi.mac));
+ /* The default value for VF link state is "auto"
+ * IFLA_VF_LINK_STATE_AUTO which equals zero
+ */
+ ivi.linkstate = 0;
if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
break;
- vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf = ivi.vf;
+
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
- vf_tx_rate.rate = ivi.tx_rate;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
vf = nla_nest_start(skb, IFLA_VF_INFO);
if (!vf) {
nla_nest_cancel(skb, vfinfo);
goto nla_put_failure;
}
- NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
- NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
- NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate))
+ goto nla_put_failure;
nla_nest_end(skb, vf);
}
nla_nest_end(skb, vfinfo);
}
- if (rtnl_port_fill(skb, dev))
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
goto nla_put_failure;
- if (dev->rtnl_link_ops) {
+ if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
if (rtnl_link_fill(skb, dev) < 0)
goto nla_put_failure;
}
@@ -994,64 +1165,40 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx = 0, s_idx;
- struct net_device *dev;
- struct hlist_head *head;
- struct hlist_node *node;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, node, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI) <= 0)
- goto out;
-cont:
- idx++;
- }
- }
-out:
- cb->args[1] = idx;
- cb->args[0] = h;
-
- return skb->len;
-}
-
-const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
[IFLA_MTU] = { .type = NLA_U32 },
[IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
+ [IFLA_CARRIER] = { .type = NLA_U8 },
[IFLA_TXQLEN] = { .type = NLA_U32 },
[IFLA_WEIGHT] = { .type = NLA_U32 },
[IFLA_OPERSTATE] = { .type = NLA_U8 },
[IFLA_LINKMODE] = { .type = NLA_U8 },
[IFLA_LINKINFO] = { .type = NLA_NESTED },
[IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_NET_NS_FD] = { .type = NLA_U32 },
[IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
[IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
[IFLA_VF_PORTS] = { .type = NLA_NESTED },
[IFLA_PORT_SELF] = { .type = NLA_NESTED },
[IFLA_AF_SPEC] = { .type = NLA_NESTED },
+ [IFLA_EXT_MASK] = { .type = NLA_U32 },
+ [IFLA_PROMISCUITY] = { .type = NLA_U32 },
+ [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
};
-EXPORT_SYMBOL(ifla_policy);
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
[IFLA_INFO_KIND] = { .type = NLA_STRING },
[IFLA_INFO_DATA] = { .type = NLA_NESTED },
+ [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
};
static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
@@ -1065,6 +1212,12 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
.len = sizeof(struct ifla_vf_vlan) },
[IFLA_VF_TX_RATE] = { .type = NLA_BINARY,
.len = sizeof(struct ifla_vf_tx_rate) },
+ [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_link_state) },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1081,6 +1234,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx = 0, s_idx;
+ struct net_device *dev;
+ struct hlist_head *head;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ int err;
+ int hdrlen;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ rcu_read_lock();
+ cb->seq = net->dev_base_seq;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ NLM_F_MULTI,
+ ext_filter_mask);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err <= 0)
+ goto out;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len)
+{
+ return nla_parse(tb, IFLA_MAX, head, len, ifla_policy);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
struct net *net;
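For context, the header-length heuristic above tells a legacy struct rtgenmsg request apart from a proper struct ifinfomsg one. A userspace dump request carrying IFLA_EXT_MASK would therefore look roughly like this raw-netlink sketch (error handling trimmed; not part of this commit):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

int main(void)
{
	struct {
		struct nlmsghdr  nlh;
		struct ifinfomsg ifm;		/* the "correct header" noted above */
		struct nlattr    ext;
		__u32            mask;
	} req;
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = sizeof(req);
	req.nlh.nlmsg_type  = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.ifm.ifi_family  = AF_UNSPEC;
	req.ext.nla_type    = IFLA_EXT_MASK;
	req.ext.nla_len     = NLA_HDRLEN + sizeof(__u32);
	req.mask            = RTEXT_FILTER_VF;	/* ask for the VF attributes too */

	if (fd < 0 || sendto(fd, &req, sizeof(req), 0,
			     (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
		return 1;
	/* recv() loop over the NLM_F_MULTI RTM_NEWLINK replies omitted */
	close(fd);
	return 0;
}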
@@ -1089,6 +1314,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
*/
if (tb[IFLA_NET_NS_PID])
net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+ else if (tb[IFLA_NET_NS_FD])
+ net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
else
net = get_net(src_net);
return net;
@@ -1121,8 +1348,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return -EOPNOTSUPP;
if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev,
- tb[IFLA_AF_SPEC]);
+ err = af_ops->validate_link_af(dev, af);
if (err < 0)
return err;
}
@@ -1161,11 +1387,47 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
}
case IFLA_VF_TX_RATE: {
struct ifla_vf_tx_rate *ivt;
+ struct ifla_vf_info ivf;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf,
+ &ivf);
+ if (err)
+ break;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ break;
+ }
+ case IFLA_VF_RATE: {
+ struct ifla_vf_rate *ivt;
ivt = nla_data(vf);
err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_tx_rate)
- err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
- ivt->rate);
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
+ break;
+ }
+ case IFLA_VF_SPOOFCHK: {
+ struct ifla_vf_spoofchk *ivs;
+ ivs = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ break;
+ }
+ case IFLA_VF_LINK_STATE: {
+ struct ifla_vf_link_state *ivl;
+ ivl = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_link_state)
+ err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+ ivl->link_state);
break;
}
default:
@@ -1178,19 +1440,58 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
return err;
}
-static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+static int do_set_master(struct net_device *dev, int ifindex)
+{
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops;
+ int err;
+
+ if (upper_dev) {
+ if (upper_dev->ifindex == ifindex)
+ return 0;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_del_slave) {
+ err = ops->ndo_del_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ upper_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!upper_dev)
+ return -EINVAL;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ err = ops->ndo_add_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+static int do_setlink(const struct sk_buff *skb,
+ struct net_device *dev, struct ifinfomsg *ifm,
struct nlattr **tb, char *ifname, int modified)
{
const struct net_device_ops *ops = dev->netdev_ops;
- int send_addr_notify = 0;
int err;
- if (tb[IFLA_NET_NS_PID]) {
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
struct net *net = rtnl_link_get_net(dev_net(dev), tb);
if (IS_ERR(net)) {
err = PTR_ERR(net);
goto errout;
}
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto errout;
+ }
err = dev_change_net_namespace(dev, net, ifname);
put_net(net);
if (err)
@@ -1231,16 +1532,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
struct sockaddr *sa;
int len;
- if (!ops->ndo_set_mac_address) {
- err = -EOPNOTSUPP;
- goto errout;
- }
-
- if (!netif_device_present(dev)) {
- err = -ENODEV;
- goto errout;
- }
-
len = sizeof(sa_family_t) + dev->addr_len;
sa = kmalloc(len, GFP_KERNEL);
if (!sa) {
@@ -1250,11 +1541,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
sa->sa_family = dev->type;
memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
dev->addr_len);
- err = ops->ndo_set_mac_address(dev, sa);
+ err = dev_set_mac_address(dev, sa);
kfree(sa);
if (err)
goto errout;
- send_addr_notify = 1;
modified = 1;
}
@@ -1265,6 +1555,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
modified = 1;
}
+ if (tb[IFLA_GROUP]) {
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ modified = 1;
+ }
+
/*
* Interface selected by interface index but interface
* name provided implies that a name change has been
@@ -1287,7 +1582,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
if (tb[IFLA_BROADCAST]) {
nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
- send_addr_notify = 1;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
}
if (ifm->ifi_flags || ifm->ifi_change) {
@@ -1296,6 +1591,20 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
goto errout;
}
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+ if (err)
+ goto errout;
+ modified = 1;
+ }
+
+ if (tb[IFLA_CARRIER]) {
+ err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ if (err)
+ goto errout;
+ modified = 1;
+ }
+
if (tb[IFLA_TXQLEN])
dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
@@ -1390,18 +1699,14 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
err = 0;
errout:
- if (err < 0 && modified && net_ratelimit())
- printk(KERN_WARNING "A link change request failed with "
- "some changes comitted already. Interface %s may "
- "have been left with an inconsistent configuration, "
- "please check.\n", dev->name);
+ if (err < 0 && modified)
+ net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+ dev->name);
- if (send_addr_notify)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return err;
}
-static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
@@ -1437,12 +1742,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
if (err < 0)
goto errout;
- err = do_setlink(dev, ifm, tb, ifname, 0);
+ err = do_setlink(skb, dev, ifm, tb, ifname, 0);
errout:
return err;
}
-static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
@@ -1451,6 +1756,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
int err;
+ LIST_HEAD(list_kill);
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err < 0)
@@ -1474,7 +1780,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
if (!ops)
return -EOPNOTSUPP;
- ops->dellink(dev, NULL);
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
return 0;
}
@@ -1491,48 +1798,47 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
}
dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
- __dev_notify_flags(dev, old_flags);
+ __dev_notify_flags(dev, old_flags, ~0U);
return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);
-struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
+struct net_device *rtnl_create_link(struct net *net,
char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
int err;
struct net_device *dev;
- unsigned int num_queues = 1;
- unsigned int real_num_queues = 1;
+ unsigned int num_tx_queues = 1;
+ unsigned int num_rx_queues = 1;
+
+ if (tb[IFLA_NUM_TX_QUEUES])
+ num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
+ else if (ops->get_num_tx_queues)
+ num_tx_queues = ops->get_num_tx_queues();
+
+ if (tb[IFLA_NUM_RX_QUEUES])
+ num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
+ else if (ops->get_num_rx_queues)
+ num_rx_queues = ops->get_num_rx_queues();
- if (ops->get_tx_queues) {
- err = ops->get_tx_queues(src_net, tb, &num_queues,
- &real_num_queues);
- if (err)
- goto err;
- }
err = -ENOMEM;
- dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues);
+ dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
+ num_tx_queues, num_rx_queues);
if (!dev)
goto err;
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
- dev->real_num_tx_queues = real_num_queues;
-
- if (strchr(dev->name, '%')) {
- err = dev_alloc_name(dev, dev->name);
- if (err < 0)
- goto err_free;
- }
if (tb[IFLA_MTU])
dev->mtu = nla_get_u32(tb[IFLA_MTU]);
- if (tb[IFLA_ADDRESS])
+ if (tb[IFLA_ADDRESS]) {
memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
nla_len(tb[IFLA_ADDRESS]));
+ dev->addr_assign_type = NET_ADDR_SET;
+ }
if (tb[IFLA_BROADCAST])
memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
nla_len(tb[IFLA_BROADCAST]));
@@ -1542,21 +1848,42 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINKMODE])
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
return dev;
-err_free:
- free_netdev(dev);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL(rtnl_create_link);
-static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, int group,
+ struct ifinfomsg *ifm,
+ struct nlattr **tb)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ err = do_setlink(skb, dev, ifm, tb, NULL, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
+ const struct rtnl_link_ops *m_ops = NULL;
struct net_device *dev;
+ struct net_device *master_dev = NULL;
struct ifinfomsg *ifm;
char kind[MODULE_NAME_LEN];
char ifname[IFNAMSIZ];
@@ -1579,10 +1906,18 @@ replay:
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
- else if (ifname[0])
- dev = __dev_get_by_name(net, ifname);
- else
- dev = NULL;
+ else {
+ if (ifname[0])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ dev = NULL;
+ }
+
+ if (dev) {
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+ }
err = validate_linkmsg(dev, tb);
if (err < 0)
@@ -1605,7 +1940,10 @@ replay:
}
if (1) {
- struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL;
+ struct nlattr *attr[ops ? ops->maxtype + 1 : 0];
+ struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0];
+ struct nlattr **data = NULL;
+ struct nlattr **slave_data = NULL;
struct net *dest_net;
if (ops) {
@@ -1624,6 +1962,24 @@ replay:
}
}
+ if (m_ops) {
+ if (m_ops->slave_maxtype &&
+ linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ err = nla_parse_nested(slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy);
+ if (err < 0)
+ return err;
+ slave_data = slave_attr;
+ }
+ if (m_ops->slave_validate) {
+ err = m_ops->slave_validate(tb, slave_data);
+ if (err < 0)
+ return err;
+ }
+ }
+
if (dev) {
int modified = 0;
@@ -1643,14 +1999,28 @@ replay:
modified = 1;
}
- return do_setlink(dev, ifm, tb, ifname, modified);
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ err = m_ops->slave_changelink(master_dev, dev,
+ tb, slave_data);
+ if (err < 0)
+ return err;
+ modified = 1;
+ }
+
+ return do_setlink(skb, dev, ifm, tb, ifname, modified);
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(skb, net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, tb);
return -ENODEV;
+ }
- if (ifm->ifi_index)
- return -EOPNOTSUPP;
if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
return -EOPNOTSUPP;
@@ -1672,20 +2042,36 @@ replay:
snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
dest_net = rtnl_link_get_net(net, tb);
- dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
- if (IS_ERR(dev))
+ dev = rtnl_create_link(dest_net, ifname, ops, tb);
+ if (IS_ERR(dev)) {
err = PTR_ERR(dev);
- else if (ops->newlink)
- err = ops->newlink(net, dev, tb, data);
- else
- err = register_netdevice(dev);
-
- if (err < 0 && !IS_ERR(dev))
- free_netdev(dev);
- if (err < 0)
goto out;
+ }
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink) {
+ err = ops->newlink(net, dev, tb, data);
+ /* Drivers should call free_netdev() in ->destructor
+ * and unregister it on failure after registration
+ * so that device could be finally freed in rtnl_unlock.
+ */
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(dev);
+ goto out;
+ }
+ } else {
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
+ }
err = rtnl_configure_link(dev, ifm);
if (err < 0)
unregister_netdevice(dev);
@@ -1695,7 +2081,7 @@ out:
}
}
-static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
@@ -1704,6 +2090,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
struct net_device *dev = NULL;
struct sk_buff *nskb;
int err;
+ u32 ext_filter_mask = 0;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err < 0)
@@ -1712,6 +2099,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (tb[IFLA_IFNAME])
nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
@@ -1723,22 +2113,55 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (dev == NULL)
return -ENODEV;
- nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+ nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
if (nskb == NULL)
return -ENOBUFS;
- err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
- nlh->nlmsg_seq, 0, 0);
+ err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
WARN_ON(err == -EMSGSIZE);
kfree_skb(nskb);
} else
- err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
return err;
}
+static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ u16 min_ifinfo_dump_size = 0;
+ int hdrlen;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ if (!ext_filter_mask)
+ return NLMSG_GOODSIZE;
+ /*
+ * traverse the list of net devices and compute the minimum
+ * buffer size based upon the filter mask.
+ */
+ list_for_each_entry(dev, &net->dev_base_head, dev_list) {
+ min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
+ if_nlmsg_size(dev,
+ ext_filter_mask));
+ }
+
+ return min_ifinfo_dump_size;
+}
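
rtnl_calcit() only inflates the dump allocation beyond NLMSG_GOODSIZE when the requester opts in with IFLA_EXT_MASK, so older userspace keeps the old sizing. A hedged fragment in the same style as the RTM_SETLINK sketch earlier, showing the opt-in side of that handshake (the scaffolding and socket setup are assumed from that sketch):

/* Fragment, assuming the same "req"/socket scaffolding as above:
 * request VF details so rtnl_calcit() grows the per-device buffer. */
unsigned int ext_mask = RTEXT_FILTER_VF;    /* from <linux/rtnetlink.h> */
struct rtattr *rta;

req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifi));
req.nh.nlmsg_type = RTM_GETLINK;
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
req.ifi.ifi_family = AF_UNSPEC;

rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
rta->rta_type = IFLA_EXT_MASK;
rta->rta_len = RTA_LENGTH(sizeof(ext_mask));
memcpy(RTA_DATA(rta), &ext_mask, sizeof(ext_mask));
req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;
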
+
static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx;
@@ -1753,8 +2176,11 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (rtnl_msg_handlers[idx] == NULL ||
rtnl_msg_handlers[idx][type].dumpit == NULL)
continue;
- if (idx > s_idx)
+ if (idx > s_idx) {
memset(&cb->args[0], 0, sizeof(cb->args));
+ cb->prev_seq = 0;
+ cb->seq = 0;
+ }
if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
break;
}
@@ -1763,33 +2189,653 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ size_t if_info_size;
- skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+ skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);
if (skb == NULL)
goto errout;
- err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0);
+ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}
+EXPORT_SYMBOL(rtmsg_ifinfo);
+
+static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ u8 *addr, u32 pid, u32 seq,
+ int type, unsigned int flags,
+ int nlflags)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = flags;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dev->ifindex;
+ ndm->ndm_state = NUD_PERMANENT;
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_fdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
+}
+
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/**
+ * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
+ * @ndm: neighbour message carrying the requested state and flags
+ * @tb: parsed NDA_* netlink attributes
+ * @dev: target net_device
+ * @addr: link-layer address to add
+ * @flags: netlink message flags (NLM_F_EXCL, ...)
+ */
+int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 flags)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_add_excl(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_add_excl(dev, addr);
+
+ /* Only return duplicate errors if NLM_F_EXCL is set */
+ if (err == -EEXIST && !(flags & NLM_F_EXCL))
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_add);
+
+static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ u8 *addr;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
+ return -EINVAL;
+ }
-/* Protected by RTNL sempahore. */
-static struct rtattr **rta_buf;
-static int rtattr_max;
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if ((ndm->ndm_flags & NTF_SELF)) {
+ if (dev->netdev_ops->ndo_fdb_add)
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ nlh->nlmsg_flags);
+ else
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+ nlh->nlmsg_flags);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
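
rtnl_fdb_add() is driven by an RTM_NEWNEIGH message carrying an AF_BRIDGE ndmsg, an NDA_LLADDR attribute with the MAC, and NTF_SELF and/or NTF_MASTER in ndm_flags. A sketch of that payload, reusing the socket/sendto scaffolding of the RTM_SETLINK example earlier; the MAC and device name are placeholders:

/* Fragment: add a static FDB entry on the port device itself (NTF_SELF).
 * Needs <linux/neighbour.h> in addition to the earlier includes. */
struct {
        struct nlmsghdr nh;
        struct ndmsg ndm;
        char attrs[32];
} req;
unsigned char mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
struct rtattr *rta;

memset(&req, 0, sizeof(req));
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ndm));
req.nh.nlmsg_type = RTM_NEWNEIGH;
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
req.ndm.ndm_family = AF_BRIDGE;
req.ndm.ndm_ifindex = if_nametoindex("eth1");
req.ndm.ndm_state = NUD_PERMANENT;  /* static entry, see ndo_dflt_fdb_add() */
req.ndm.ndm_flags = NTF_SELF;       /* program the port, not its bridge */

rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
rta->rta_type = NDA_LLADDR;
rta->rta_len = RTA_LENGTH(sizeof(mac));
memcpy(RTA_DATA(rta), mac, sizeof(mac));
req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;
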
+
+/**
+ * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
+ * @ndm: neighbour message carrying the entry's state
+ * @tb: parsed NDA_* netlink attributes
+ * @dev: target net_device
+ * @addr: link-layer address to delete
+ */
+int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr)
+{
+ int err = -EOPNOTSUPP;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (!(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return -EINVAL;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_del(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_del(dev, addr);
+ else
+ err = -EINVAL;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_del);
+
+static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ int err = -EINVAL;
+ __u8 *addr;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr);
+
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if (ndm->ndm_flags & NTF_SELF) {
+ if (dev->netdev_ops->ndo_fdb_del)
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
+
+static int nlmsg_populate_fdb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int *idx,
+ struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+ u32 portid, seq;
+
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (*idx < cb->args[0])
+ goto skip;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
+ portid, seq,
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI);
+ if (err < 0)
+ return err;
+skip:
+ *idx += 1;
+ }
+ return 0;
+}
+
+/**
+ * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
+ * @skb: socket buffer the dump is written into
+ * @cb: netlink callback holding the dump position and sequence numbers
+ * @dev: netdevice whose unicast and multicast address lists are dumped
+ * @idx: number of entries already skipped or dumped
+ *
+ * Default netdevice operation to dump the existing unicast and multicast
+ * address lists. Returns the updated index after the addresses put in @skb.
+ */
+int ndo_dflt_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int idx)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc);
+ if (err)
+ goto out;
+ nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
+out:
+ netif_addr_unlock_bh(dev);
+ return idx;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_dump);
+
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx = 0;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ struct net_device *br_dev;
+ const struct net_device_ops *ops;
+
+ br_dev = netdev_master_upper_dev_get(dev);
+ ops = br_dev->netdev_ops;
+ if (ops->ndo_fdb_dump)
+ idx = ops->ndo_fdb_dump(skb, cb, dev, idx);
+ }
+
+ if (dev->netdev_ops->ndo_fdb_dump)
+ idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx);
+ else
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, idx);
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode)
+{
+ struct nlmsghdr *nlh;
+ struct ifinfomsg *ifm;
+ struct nlattr *br_afspec;
+ u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_BRIDGE;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = 0;
+
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (br_dev &&
+ nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev->iflink &&
+ nla_put_u32(skb, IFLA_LINK, dev->iflink)))
+ goto nla_put_failure;
+
+ br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!br_afspec)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) ||
+ nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, br_afspec);
+
+ return nlmsg_end(skb, nlh);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx = 0;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ u32 seq = cb->nlh->nlmsg_seq;
+ struct nlattr *extfilt;
+ u32 filter_mask = 0;
+
+ extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
+ IFLA_EXT_MASK);
+ if (extfilt)
+ filter_mask = nla_get_u32(extfilt);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0] &&
+ br_dev->netdev_ops->ndo_bridge_getlink(
+ skb, portid, seq, dev, filter_mask) < 0)
+ break;
+ idx++;
+ }
+
+ if (ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0] &&
+ ops->ndo_bridge_getlink(skb, portid, seq, dev,
+ filter_mask) < 0)
+ break;
+ idx++;
+ }
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static inline size_t bridge_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_MTU */
+ + nla_total_size(sizeof(u32)) /* IFLA_LINK */
+ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */
+ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */
+ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */
+ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev, u16 flags)
+{
+ struct net *net = dev_net(dev);
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ struct sk_buff *skb;
+ int err = -EOPNOTSUPP;
+
+ skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) &&
+ br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+ if (err < 0)
+ goto errout;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF) &&
+ dev->netdev_ops->ndo_bridge_getlink) {
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+ if (err < 0)
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return 0;
+errout:
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return err;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 oflags, flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
+ oflags = flags;
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_setlink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+
+ if (!err)
+ flags &= ~BRIDGE_FLAGS_SELF;
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+ /* Generate event to notify upper layer of bridge change */
+ if (!err)
+ err = rtnl_bridge_notify(dev, oflags);
+out:
+ return err;
+}
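
rtnl_bridge_setlink() and rtnl_bridge_dellink() below key off an AF_BRIDGE ifinfomsg plus a nested IFLA_AF_SPEC block whose IFLA_BRIDGE_FLAGS value selects the master and/or self handlers. One more hedged fragment in the style of the earlier sketches, building just that nest:

/* Fragment, reusing the RTM_SETLINK scaffolding: ask the port device
 * itself (BRIDGE_FLAGS_SELF, from <linux/if_bridge.h>) to handle the
 * bridge setlink request. */
unsigned short flags = BRIDGE_FLAGS_SELF;
struct rtattr *afspec, *rta;

req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifi));
req.nh.nlmsg_type = RTM_SETLINK;
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
req.ifi.ifi_family = AF_BRIDGE;
req.ifi.ifi_index = if_nametoindex("eth1");

afspec = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
afspec->rta_type = IFLA_AF_SPEC;

rta = (struct rtattr *)((char *)afspec + RTA_LENGTH(0));
rta->rta_type = IFLA_BRIDGE_FLAGS;
rta->rta_len = RTA_LENGTH(sizeof(flags));
memcpy(RTA_DATA(rta), &flags, sizeof(flags));

afspec->rta_len = RTA_LENGTH(0) + RTA_ALIGN(rta->rta_len);
req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + afspec->rta_len;
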
+
+static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 oflags, flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ pr_info("PF_BRIDGE: RTM_DELLINK with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
+ oflags = flags;
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_dellink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
+
+ if (!err)
+ flags &= ~BRIDGE_FLAGS_SELF;
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+ /* Generate event to notify upper layer of bridge change */
+ if (!err)
+ err = rtnl_bridge_notify(dev, oflags);
+out:
+ return err;
+}
/* Process one rtnetlink message. */
@@ -1798,7 +2844,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net *net = sock_net(skb->sk);
rtnl_doit_func doit;
int sz_idx, kind;
- int min_len;
int family;
int type;
int err;
@@ -1810,57 +2855,47 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
type -= RTM_BASE;
/* All the messages must have at least 1 byte length */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
+ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
return 0;
- family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family;
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
sz_idx = type>>2;
kind = type&3;
- if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
+ rtnl_calcit_func calcit;
+ u16 min_dump_alloc = 0;
dumpit = rtnl_get_dumpit(family, type);
if (dumpit == NULL)
return -EOPNOTSUPP;
+ calcit = rtnl_get_calcit(family, type);
+ if (calcit)
+ min_dump_alloc = calcit(skb, nlh);
__rtnl_unlock();
rtnl = net->rtnl;
- err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
+ {
+ struct netlink_dump_control c = {
+ .dump = dumpit,
+ .min_dump_alloc = min_dump_alloc,
+ };
+ err = netlink_dump_start(rtnl, skb, nlh, &c);
+ }
rtnl_lock();
return err;
}
- memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
-
- min_len = rtm_min[sz_idx];
- if (nlh->nlmsg_len < min_len)
- return -EINVAL;
-
- if (nlh->nlmsg_len > min_len) {
- int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
- struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len);
-
- while (RTA_OK(attr, attrlen)) {
- unsigned flavor = attr->rta_type;
- if (flavor) {
- if (flavor > rta_max[sz_idx])
- return -EINVAL;
- rta_buf[flavor-1] = attr;
- }
- attr = RTA_NEXT(attr, attrlen);
- }
- }
-
doit = rtnl_get_doit(family, type);
if (doit == NULL)
return -EOPNOTSUPP;
- return doit(skb, nlh, (void *)&rta_buf[0]);
+ return doit(skb, nlh);
}
static void rtnetlink_rcv(struct sk_buff *skb)
@@ -1872,7 +2907,7 @@ static void rtnetlink_rcv(struct sk_buff *skb)
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
switch (event) {
case NETDEV_UP:
@@ -1884,10 +2919,12 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
case NETDEV_PRE_TYPE_CHANGE:
case NETDEV_GOING_DOWN:
case NETDEV_UNREGISTER:
- case NETDEV_UNREGISTER_BATCH:
+ case NETDEV_UNREGISTER_FINAL:
+ case NETDEV_RELEASE:
+ case NETDEV_JOIN:
break;
default:
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
break;
}
return NOTIFY_DONE;
@@ -1901,8 +2938,14 @@ static struct notifier_block rtnetlink_dev_notifier = {
static int __net_init rtnetlink_net_init(struct net *net)
{
struct sock *sk;
- sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
- rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
+ struct netlink_kernel_cfg cfg = {
+ .groups = RTNLGRP_MAX,
+ .input = rtnetlink_rcv,
+ .cb_mutex = &rtnl_mutex,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
if (!sk)
return -ENOMEM;
net->rtnl = sk;
@@ -1922,28 +2965,26 @@ static struct pernet_operations rtnetlink_net_ops = {
void __init rtnetlink_init(void)
{
- int i;
-
- rtattr_max = 0;
- for (i = 0; i < ARRAY_SIZE(rta_max); i++)
- if (rta_max[i] > rtattr_max)
- rtattr_max = rta_max[i];
- rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL);
- if (!rta_buf)
- panic("rtnetlink_init: cannot allocate rta_buf\n");
-
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
- netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo);
- rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL);
- rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
+ rtnl_dump_ifinfo, rtnl_calcit);
+ rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
+
+ rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all);
- rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all);
+ rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
}
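
The rtnl_register() calls above now take a fifth calcit argument alongside doit and dumpit; passing NULL keeps the old fixed dump sizing. A hypothetical out-of-tree sketch, with placeholder handler names and message type, only to show the new signature:

/* Hypothetical module init, sketch only: my_doit/my_dumpit/my_calcit are
 * placeholders, not real kernel symbols, and the msgtype is illustrative.
 * A NULL calcit keeps the dump allocation at NLMSG_GOODSIZE. */
static int __init my_rtnl_handlers_init(void)
{
        rtnl_register(PF_UNSPEC, RTM_GETLINK /* placeholder msgtype */,
                      my_doit, my_dumpit, my_calcit);
        return 0;
}
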
diff --git a/net/core/scm.c b/net/core/scm.c
index bbe45445080..b442e7e25e6 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -24,11 +24,11 @@
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/security.h>
+#include <linux/pid_namespace.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/protocol.h>
@@ -36,6 +36,7 @@
#include <net/sock.h>
#include <net/compat.h>
#include <net/scm.h>
+#include <net/cls_cgroup.h>
/*
@@ -46,12 +47,18 @@
static __inline__ int scm_check_creds(struct ucred *creds)
{
const struct cred *cred = current_cred();
+ kuid_t uid = make_kuid(cred->user_ns, creds->uid);
+ kgid_t gid = make_kgid(cred->user_ns, creds->gid);
- if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
- ((creds->uid == cred->uid || creds->uid == cred->euid ||
- creds->uid == cred->suid) || capable(CAP_SETUID)) &&
- ((creds->gid == cred->gid || creds->gid == cred->egid ||
- creds->gid == cred->sgid) || capable(CAP_SETGID))) {
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
+
+ if ((creds->pid == task_tgid_vnr(current) ||
+ ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
+ ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
+ uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
+ ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+ gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
return 0;
}
return -EPERM;
@@ -95,7 +102,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
int fd = fdp[i];
struct file *file;
- if (fd < 0 || !(file = fget(fd)))
+ if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
*fpp++ = file;
fpl->count++;
@@ -110,25 +117,9 @@ void __scm_destroy(struct scm_cookie *scm)
if (fpl) {
scm->fp = NULL;
- if (current->scm_work_list) {
- list_add_tail(&fpl->list, current->scm_work_list);
- } else {
- LIST_HEAD(work_list);
-
- current->scm_work_list = &work_list;
-
- list_add(&fpl->list, &work_list);
- while (!list_empty(&work_list)) {
- fpl = list_first_entry(&work_list, struct scm_fp_list, list);
-
- list_del(&fpl->list);
- for (i=fpl->count-1; i>=0; i--)
- fput(fpl->fp[i]);
- kfree(fpl);
- }
-
- current->scm_work_list = NULL;
- }
+ for (i=fpl->count-1; i>=0; i--)
+ fput(fpl->fp[i]);
+ kfree(fpl);
}
}
EXPORT_SYMBOL(__scm_destroy);
@@ -166,37 +157,38 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
goto error;
break;
case SCM_CREDENTIALS:
+ {
+ struct ucred creds;
+ kuid_t uid;
+ kgid_t gid;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
goto error;
- memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
- err = scm_check_creds(&p->creds);
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&creds);
if (err)
goto error;
- if (pid_vnr(p->pid) != p->creds.pid) {
+ p->creds.pid = creds.pid;
+ if (!p->pid || pid_vnr(p->pid) != creds.pid) {
struct pid *pid;
err = -ESRCH;
- pid = find_get_pid(p->creds.pid);
+ pid = find_get_pid(creds.pid);
if (!pid)
goto error;
put_pid(p->pid);
p->pid = pid;
}
- if ((p->cred->euid != p->creds.uid) ||
- (p->cred->egid != p->creds.gid)) {
- struct cred *cred;
- err = -ENOMEM;
- cred = prepare_creds();
- if (!cred)
- goto error;
+ err = -EINVAL;
+ uid = make_kuid(current_user_ns(), creds.uid);
+ gid = make_kgid(current_user_ns(), creds.gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ goto error;
- cred->uid = cred->euid = p->creds.uid;
- cred->gid = cred->egid = p->creds.uid;
- put_cred(p->cred);
- p->cred = cred;
- }
+ p->creds.uid = uid;
+ p->creds.gid = gid;
break;
+ }
default:
goto error;
}
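
The reworked SCM_CREDENTIALS handling above maps the caller-supplied ids into kuid_t/kgid_t in the sender's user namespace before anything is stored. The sending side is unchanged by this; a minimal sketch, assuming sk is an already-connected AF_UNIX socket and that the process passes its own credentials (so no extra capabilities are needed):

/* Sketch: pass the caller's own credentials with sendmsg().
 * "sk" is an already-connected AF_UNIX socket, not created here. */
#define _GNU_SOURCE             /* for struct ucred */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_with_creds(int sk, const char *buf, size_t len)
{
        struct ucred creds = {
                .pid = getpid(),
                .uid = getuid(),
                .gid = getgid(),
        };
        union {
                struct cmsghdr align;
                char buf[CMSG_SPACE(sizeof(creds))];
        } u;
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = u.buf,
                .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_CREDENTIALS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(creds));
        memcpy(CMSG_DATA(cmsg), &creds, sizeof(creds));

        return sendmsg(sk, &msg, 0);
}

The receiver still needs SO_PASSCRED enabled to see the resulting SCM_CREDENTIALS message.
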
@@ -280,6 +272,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
i++, cmfptr++)
{
+ struct socket *sock;
int new_fd;
err = security_file_receive(fp[i]);
if (err)
@@ -295,8 +288,12 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
break;
}
/* Bump the usage count and install the file. */
- get_file(fp[i]);
- fd_install(new_fd, fp[i]);
+ sock = sock_from_file(fp[i], &err);
+ if (sock) {
+ sock_update_netprioidx(sock->sk);
+ sock_update_classid(sock->sk);
+ }
+ fd_install(new_fd, get_file(fp[i]));
}
if (i > 0)
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
new file mode 100644
index 00000000000..ba71212f025
--- /dev/null
+++ b/net/core/secure_seq.c
@@ -0,0 +1,173 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cryptohash.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/random.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+#include <linux/net.h>
+
+#include <net/secure_seq.h>
+
+#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
+
+static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+
+static __always_inline void net_secret_init(void)
+{
+ net_get_random_once(net_secret, sizeof(net_secret));
+}
+#endif
+
+#ifdef CONFIG_INET
+static u32 seq_scale(u32 seq)
+{
+ /*
+ * As close as possible to RFC 793, which
+ * suggests using a 250 kHz clock.
+ * Further reading shows this assumes 2 Mb/s networks.
+ * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
+ * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
+ * we also need to limit the resolution so that the u32 seq
+ * overlaps less than one time per MSL (2 minutes).
+ * Choosing a clock of 64 ns period is OK. (period of 274 s)
+ */
+ return seq + (ktime_to_ns(ktime_get_real()) >> 6);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + (__force u32)daddr[i];
+ secret[4] = net_secret[4] +
+ (((__force u16)sport << 16) + (__force u16)dport);
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ return seq_scale(hash[0]);
+}
+EXPORT_SYMBOL(secure_tcpv6_sequence_number);
+
+u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+ __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + (__force u32) daddr[i];
+ secret[4] = net_secret[4] + (__force u32)dport;
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ return hash[0];
+}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
+#endif
+
+#ifdef CONFIG_INET
+
+__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ return seq_scale(hash[0]);
+}
+
+u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = (__force u32)dport ^ net_secret[14];
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ return hash[0];
+}
+EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
+#endif
+
+#if IS_ENABLED(CONFIG_IP_DCCP)
+u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+ u64 seq;
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ seq = hash[0] | (((u64)hash[1]) << 32);
+ seq += ktime_to_ns(ktime_get_real());
+ seq &= (1ull << 48) - 1;
+
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccp_sequence_number);
+
+#if IS_ENABLED(CONFIG_IPV6)
+u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u64 seq;
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + daddr[i];
+ secret[4] = net_secret[4] +
+ (((__force u16)sport << 16) + (__force u16)dport);
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ seq = hash[0] | (((u64)hash[1]) << 32);
+ seq += ktime_to_ns(ktime_get_real());
+ seq &= (1ull << 48) - 1;
+
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccpv6_sequence_number);
+#endif
+#endif
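
The seq_scale() comment above quotes a wrap period of roughly 274 s for its 64 ns clock; the arithmetic is easy to check with a tiny standalone program (plain userspace C, nothing kernel-specific):

/* Check the figure quoted in seq_scale(): a 32-bit sequence bumped
 * once every 64 ns wraps after 2^32 * 64 ns ~= 274.9 seconds. */
#include <stdio.h>

int main(void)
{
        unsigned long long period_ns = (1ULL << 32) * 64;

        printf("wrap period: %llu ns = %.1f s\n",
               period_ns, period_ns / 1e9);
        return 0;
}
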
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8814a9a52f4..c1a33033cbe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -36,6 +36,8 @@
* The functions in this file will not compile correctly with gcc 2.4.x
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -45,6 +47,8 @@
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
@@ -57,93 +61,89 @@
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
+#include <linux/prefetch.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <trace/events/skb.h>
+#include <linux/highmem.h>
-#include "kmap_skb.h"
-
-static struct kmem_cache *skbuff_head_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
-static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+/**
+ * skb_panic - private function for out-of-line support
+ * @skb: buffer
+ * @sz: size
+ * @addr: address
+ * @msg: skb_over_panic or skb_under_panic
+ *
+ * Out-of-line support for skb_put() and skb_push().
+ * Called via the wrapper skb_over_panic() or skb_under_panic().
+ * Keep out of line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always reliable.
+ */
+static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
+ const char msg[])
{
- put_page(buf->page);
+ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ msg, addr, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
+ BUG();
}
-static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
- get_page(buf->page);
+ skb_panic(skb, sz, addr, __func__);
}
-static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
- return 1;
+ skb_panic(skb, sz, addr, __func__);
}
-
-/* Pipe buffer operations for a socket. */
-static const struct pipe_buf_operations sock_pipe_buf_ops = {
- .can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
- .confirm = generic_pipe_buf_confirm,
- .release = sock_pipe_buf_release,
- .steal = sock_pipe_buf_steal,
- .get = sock_pipe_buf_get,
-};
-
/*
- * Keep out-of-line to prevent kernel bloat.
- * __builtin_return_address is not used because it is not always
- * reliable.
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
*/
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
-/**
- * skb_over_panic - private function
- * @skb: buffer
- * @sz: size
- * @here: address
- *
- * Out of line support code for skb_put(). Not user callable.
- */
-static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
+static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
+ unsigned long ip, bool *pfmemalloc)
{
- printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%#lx end:%#lx dev:%s\n",
- here, skb->len, sz, skb->head, skb->data,
- (unsigned long)skb->tail, (unsigned long)skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
- BUG();
-}
+ void *obj;
+ bool ret_pfmemalloc = false;
-/**
- * skb_under_panic - private function
- * @skb: buffer
- * @sz: size
- * @here: address
- *
- * Out of line support code for skb_push(). Not user callable.
- */
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
-static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
-{
- printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%#lx end:%#lx dev:%s\n",
- here, skb->len, sz, skb->head, skb->data,
- (unsigned long)skb->tail, (unsigned long)skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
- BUG();
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
}
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
@@ -152,30 +152,62 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
*
*/
+struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
+{
+ struct sk_buff *skb;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc_node(skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA, node);
+ if (!skb)
+ goto out;
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->head = NULL;
+ skb->truesize = sizeof(struct sk_buff);
+ atomic_set(&skb->users, 1);
+
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+out:
+ return skb;
+}
+
/**
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- * and allocate a cloned (child) skb
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of size bytes. The object has a reference count of one.
- * The return is the buffer. On a failure the return is %NULL.
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int flags, int node)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
+ bool pfmemalloc;
+
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
- cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -183,11 +215,21 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
goto out;
prefetchw(skb);
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
size = SKB_DATA_ALIGN(size);
- data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask, node);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
if (!data)
goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
prefetchw(data + size);
/*
@@ -196,22 +238,24 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->truesize = size + sizeof(struct sk_buff);
+ /* Account for allocated memory : skb + skb->head */
+ skb->truesize = SKB_TRUESIZE(size);
+ skb->pfmemalloc = pfmemalloc;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
skb->end = skb->tail + size;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->mac_header = ~0U;
-#endif
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
- if (fclone) {
+ if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -221,6 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(fclone_ref, 1);
child->fclone = SKB_FCLONE_UNAVAILABLE;
+ child->pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -232,6 +277,124 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
+ * build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc() only if
+ * @frag_size is 0, otherwise data should come from the page allocator.
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes:
+ * Before I/O, the driver allocates only the data buffer where the NIC puts
+ * the incoming frame. The driver should add room at the head (NET_SKB_PAD)
+ * and MUST add room at the tail (SKB_DATA_ALIGN(skb_shared_info)).
+ * After I/O, the driver calls build_skb() to allocate the sk_buff and
+ * populate it before giving the packet to the stack.
+ * RX rings only contain data buffers, not full skbs.
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ unsigned int size = frag_size ? : ksize(data);
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+ skb->head_frag = frag_size != 0;
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+ return skb;
+}
+EXPORT_SYMBOL(build_skb);
+
+struct netdev_alloc_cache {
+ struct page_frag frag;
+ /* we maintain a pagecount bias, so that we dont dirty cache line
+ * containing page->_count every time we allocate a fragment.
+ */
+ unsigned int pagecnt_bias;
+};
+static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct netdev_alloc_cache *nc;
+ void *data = NULL;
+ int order;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ nc = &__get_cpu_var(netdev_alloc_cache);
+ if (unlikely(!nc->frag.page)) {
+refill:
+ for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
+ gfp_t gfp = gfp_mask;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN;
+ nc->frag.page = alloc_pages(gfp, order);
+ if (likely(nc->frag.page))
+ break;
+ if (--order < 0)
+ goto end;
+ }
+ nc->frag.size = PAGE_SIZE << order;
+recycle:
+ atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
+ nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
+ nc->frag.offset = 0;
+ }
+
+ if (nc->frag.offset + fragsz > nc->frag.size) {
+ /* avoid unnecessary locked operations if possible */
+ if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
+ atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
+ goto recycle;
+ goto refill;
+ }
+
+ data = page_address(nc->frag.page) + nc->frag.offset;
+ nc->frag.offset += fragsz;
+ nc->pagecnt_bias--;
+end:
+ local_irq_restore(flags);
+ return data;
+}
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+ return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(netdev_alloc_frag);
+
+/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
* @length: length to allocate
@@ -245,11 +408,29 @@ EXPORT_SYMBOL(__alloc_skb);
* %NULL is returned if there is no free memory.
*/
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
- unsigned int length, gfp_t gfp_mask)
+ unsigned int length, gfp_t gfp_mask)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+ if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ void *data;
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = __netdev_alloc_frag(fragsz, gfp_mask);
+
+ if (likely(data)) {
+ skb = build_skb(data, fragsz);
+ if (unlikely(!skb))
+ put_page(virt_to_head_page(data));
+ }
+ } else {
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
+ }
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
@@ -259,48 +440,31 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
EXPORT_SYMBOL(__netdev_alloc_skb);
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
- int size)
+ int size, unsigned int truesize)
{
skb_fill_page_desc(skb, i, page, off, size);
skb->len += size;
skb->data_len += size;
- skb->truesize += size;
+ skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);
-/**
- * dev_alloc_skb - allocate an skbuff for receiving
- * @length: length to allocate
- *
- * Allocate a new &sk_buff and assign it a usage count of one. The
- * buffer has unspecified headroom built in. Users should allocate
- * the headroom they think they need without accounting for the
- * built in space. The built in space is used for optimisations.
- *
- * %NULL is returned if there is no free memory. Although this function
- * allocates memory it can be called from an interrupt.
- */
-struct sk_buff *dev_alloc_skb(unsigned int length)
+void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
+ unsigned int truesize)
{
- /*
- * There is more code here than it seems:
- * __dev_alloc_skb is an inline
- */
- return __dev_alloc_skb(length, GFP_ATOMIC);
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ skb_frag_size_add(frag, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
}
-EXPORT_SYMBOL(dev_alloc_skb);
+EXPORT_SYMBOL(skb_coalesce_rx_frag);
static void skb_drop_list(struct sk_buff **listp)
{
- struct sk_buff *list = *listp;
-
+ kfree_skb_list(*listp);
*listp = NULL;
-
- do {
- struct sk_buff *this = list;
- list = list->next;
- kfree_skb(this);
- } while (list);
}
static inline void skb_drop_fraglist(struct sk_buff *skb)
@@ -316,6 +480,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
@@ -324,13 +496,25 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_shinfo(skb)->nr_frags) {
int i;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
+ }
+
+ /*
+ * If skb buf is from userspace, we need to notify the caller
+	 * that the lower device DMA has completed.
+ */
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
+
+ uarg = skb_shinfo(skb)->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, true);
}
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
- kfree(skb->head);
+ skb_free_head(skb);
}
}
@@ -378,9 +562,8 @@ static void skb_release_head_state(struct sk_buff *skb)
WARN_ON(in_irq());
skb->destructor(skb);
}
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
- nf_conntrack_put_reasm(skb->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(skb->nf_bridge);
@@ -398,7 +581,8 @@ static void skb_release_head_state(struct sk_buff *skb)
static void skb_release_all(struct sk_buff *skb)
{
skb_release_head_state(skb);
- skb_release_data(skb);
+ if (likely(skb->head))
+ skb_release_data(skb);
}
/**
@@ -437,6 +621,37 @@ void kfree_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(kfree_skb);
+void kfree_skb_list(struct sk_buff *segs)
+{
+ while (segs) {
+ struct sk_buff *next = segs->next;
+
+ kfree_skb(segs);
+ segs = next;
+ }
+}
+EXPORT_SYMBOL(kfree_skb_list);
+
+/**
+ * skb_tx_error - report an sk_buff xmit error
+ * @skb: buffer that triggered an error
+ *
+ * Report xmit error if a device callback is tracking this skb.
+ * skb must be freed afterwards.
+ */
+void skb_tx_error(struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
+
+ uarg = skb_shinfo(skb)->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, false);
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ }
+}
+EXPORT_SYMBOL(skb_tx_error);
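
A hedged sketch of the intended caller: a driver transmit error path reports the zerocopy completion first and then frees the skb, as the comment above requires. The fragment limit used as the error condition is an assumption of the example.

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* e.g. the hardware cannot handle this many fragments */
	if (skb_shinfo(skb)->nr_frags > 8 && skb_linearize(skb)) {
		skb_tx_error(skb);	/* complete any zerocopy ubuf callback */
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}
	/* ... map buffers and post descriptors here ... */
	return NETDEV_TX_OK;
}
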
+
/**
* consume_skb - free an skbuff
* @skb: buffer to free
@@ -458,49 +673,6 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
-/**
- * skb_recycle_check - check if skb can be reused for receive
- * @skb: buffer
- * @skb_size: minimum receive buffer size
- *
- * Checks that the skb passed in is not shared or cloned, and
- * that it is linear and its head portion at least as large as
- * skb_size so that it can be recycled as a receive buffer.
- * If these conditions are met, this function does any necessary
- * reference count dropping and cleans up the skbuff as if it
- * just came from __alloc_skb().
- */
-bool skb_recycle_check(struct sk_buff *skb, int skb_size)
-{
- struct skb_shared_info *shinfo;
-
- if (irqs_disabled())
- return false;
-
- if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
- return false;
-
- skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
- if (skb_end_pointer(skb) - skb->head < skb_size)
- return false;
-
- if (skb_shared(skb) || skb_cloned(skb))
- return false;
-
- skb_release_head_state(skb);
-
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
-
- memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->data = skb->head + NET_SKB_PAD;
- skb_reset_tail_pointer(skb);
-
- return true;
-}
-EXPORT_SYMBOL(skb_recycle_check);
-
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
new->tstamp = old->tstamp;
@@ -508,39 +680,50 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->transport_header = old->transport_header;
new->network_header = old->network_header;
new->mac_header = old->mac_header;
+ new->inner_protocol = old->inner_protocol;
+ new->inner_transport_header = old->inner_transport_header;
+ new->inner_network_header = old->inner_network_header;
+ new->inner_mac_header = old->inner_mac_header;
skb_dst_copy(new, old);
- new->rxhash = old->rxhash;
+ skb_copy_hash(new, old);
+ new->ooo_okay = old->ooo_okay;
+ new->no_fcs = old->no_fcs;
+ new->encapsulation = old->encapsulation;
+ new->encap_hdr_csum = old->encap_hdr_csum;
+ new->csum_valid = old->csum_valid;
+ new->csum_complete_sw = old->csum_complete_sw;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
memcpy(new->cb, old->cb, sizeof(old->cb));
new->csum = old->csum;
- new->local_df = old->local_df;
+ new->ignore_df = old->ignore_df;
new->pkt_type = old->pkt_type;
new->ip_summed = old->ip_summed;
skb_copy_queue_mapping(new, old);
new->priority = old->priority;
- new->deliver_no_wcard = old->deliver_no_wcard;
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
new->ipvs_property = old->ipvs_property;
#endif
+ new->pfmemalloc = old->pfmemalloc;
new->protocol = old->protocol;
new->mark = old->mark;
new->skb_iif = old->skb_iif;
__nf_copy(new, old);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
- new->nf_trace = old->nf_trace;
-#endif
#ifdef CONFIG_NET_SCHED
new->tc_index = old->tc_index;
#ifdef CONFIG_NET_CLS_ACT
new->tc_verd = old->tc_verd;
#endif
#endif
+ new->vlan_proto = old->vlan_proto;
new->vlan_tci = old->vlan_tci;
skb_copy_secmark(new, old);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ new->napi_id = old->napi_id;
+#endif
}
/*
@@ -565,6 +748,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
C(tail);
C(end);
C(head);
+ C(head_frag);
C(data);
C(truesize);
atomic_set(&n->users, 1);
@@ -594,6 +778,67 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
EXPORT_SYMBOL_GPL(skb_morph);
/**
+ * skb_copy_ubufs - copy userspace skb frags buffers to kernel
+ * @skb: the skb to modify
+ * @gfp_mask: allocation priority
+ *
+ * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * It will copy all frags into kernel and drop the reference
+ * to userspace pages.
+ *
+ * If this function is called from an interrupt gfp_mask() must be
+ * %GFP_ATOMIC.
+ *
+ * Returns 0 on success or a negative error code on failure
+ * to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int i;
+ int num_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page, *head = NULL;
+ struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+
+ for (i = 0; i < num_frags; i++) {
+ u8 *vaddr;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+ page = alloc_page(gfp_mask);
+ if (!page) {
+ while (head) {
+ struct page *next = (struct page *)page_private(head);
+ put_page(head);
+ head = next;
+ }
+ return -ENOMEM;
+ }
+ vaddr = kmap_atomic(skb_frag_page(f));
+ memcpy(page_address(page),
+ vaddr + f->page_offset, skb_frag_size(f));
+ kunmap_atomic(vaddr);
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ /* skb frags release userspace buffers */
+ for (i = 0; i < num_frags; i++)
+ skb_frag_unref(skb, i);
+
+ uarg->callback(uarg, false);
+
+ /* skb frags point to kernel buffers */
+ for (i = num_frags - 1; i >= 0; i--) {
+ __skb_fill_page_desc(skb, i, head, 0,
+ skb_shinfo(skb)->frags[i].size);
+ head = (struct page *)page_private(head);
+ }
+
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_copy_ubufs);
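
A hedged sketch of when a caller reaches for this: before writing into paged data that may still point at userspace (zerocopy) pages, the frags must be detached. The patch itself relies on the skb_orphan_frags() wrapper for the same purpose in skb_clone() and pskb_expand_head().

#include <linux/skbuff.h>

/* Sketch: make an skb's paged data safe to modify in place. */
static int example_detach_userspace_frags(struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)
		return skb_copy_ubufs(skb, GFP_ATOMIC);
	return 0;
}
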
+
+/**
* skb_clone - duplicate an sk_buff
* @skb: buffer to clone
* @gfp_mask: allocation priority
@@ -611,6 +856,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *n;
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
+
n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
n->fclone == SKB_FCLONE_UNAVAILABLE) {
@@ -618,6 +866,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->fclone = SKB_FCLONE_CLONE;
atomic_inc(fclone_ref);
} else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
@@ -631,29 +882,37 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
}
EXPORT_SYMBOL(skb_clone);
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+static void skb_headers_offset_update(struct sk_buff *skb, int off)
{
-#ifndef NET_SKBUFF_DATA_USES_OFFSET
- /*
- * Shift between the two data areas in bytes
- */
- unsigned long offset = new->data - old->data;
-#endif
+ /* Only adjust this if it actually is csum_start rather than csum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += off;
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->transport_header += off;
+ skb->network_header += off;
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += off;
+ skb->inner_transport_header += off;
+ skb->inner_network_header += off;
+ skb->inner_mac_header += off;
+}
+static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
__copy_skb_header(new, old);
-#ifndef NET_SKBUFF_DATA_USES_OFFSET
- /* {transport,network,mac}_header are relative to skb->head */
- new->transport_header += offset;
- new->network_header += offset;
- if (skb_mac_header_was_set(new))
- new->mac_header += offset;
-#endif
skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
+
/**
* skb_copy - create private copy of an sk_buff
* @skb: buffer to copy
@@ -674,8 +933,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb_headroom(skb);
- unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ unsigned int size = skb_end_offset(skb) + skb->data_len;
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -694,9 +954,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
EXPORT_SYMBOL(skb_copy);
/**
- * pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
* @skb: buffer to copy
+ * @headroom: headroom of new skb
* @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
*
* Make a copy of both an &sk_buff and part of its data, located
* in header. Fragmented data remain shared. This is used when
@@ -706,16 +970,18 @@ EXPORT_SYMBOL(skb_copy);
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
{
- unsigned int size = skb_end_pointer(skb) - skb->head;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ unsigned int size = skb_headlen(skb) + headroom;
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb_headroom(skb));
+ skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
@@ -728,9 +994,14 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shinfo(skb)->nr_frags) {
int i;
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
+ }
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
- get_page(skb_shinfo(n)->frags[i].page);
+ skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
}
@@ -744,7 +1015,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
out:
return n;
}
-EXPORT_SYMBOL(pskb_copy);
+EXPORT_SYMBOL(__pskb_copy_fclone);
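
A hedged usage sketch of the new @fclone hint: a caller that expects the copy to be cloned right away (a retransmit-style path, say) can ask for the fclone cache instead of the head cache.

static struct sk_buff *example_copy_for_clone(struct sk_buff *skb)
{
	return __pskb_copy_fclone(skb, skb_headroom(skb), GFP_ATOMIC, true);
}
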
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -753,8 +1024,8 @@ EXPORT_SYMBOL(pskb_copy);
* @ntail: room to add at tail
* @gfp_mask: allocation priority
*
- * Expands (or creates identical copy, if &nhead and &ntail are zero)
- * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
+ * Expands (or creates identical copy, if @nhead and @ntail are zero)
+ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
* reference count of 1. Returns zero in the case of success or error,
* if expansion failed. In the last case, &sk_buff is not changed.
*
@@ -767,9 +1038,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
{
int i;
u8 *data;
- int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
+ int size = nhead + skb_end_offset(skb) + ntail;
long off;
- bool fastpath;
BUG_ON(nhead < 0);
@@ -778,31 +1048,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
size = SKB_DATA_ALIGN(size);
- /* Check if we can avoid taking references on fragments if we own
- * the last reference on skb->head. (see skb_release_data())
- */
- if (!skb->cloned)
- fastpath = true;
- else {
- int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
-
- fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
- }
-
- if (fastpath &&
- size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
- memmove(skb->head + size, skb_shinfo(skb),
- offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
- memmove(skb->head + nhead, skb->head,
- skb_tail_pointer(skb) - skb->head);
- off = nhead;
- goto adjust_others;
- }
-
- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
+ size = SKB_WITH_OVERHEAD(ksize(data));
/* Copy only real data... and, alas, header. This should be
* optimized for the cases when header is void.
@@ -813,21 +1065,29 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb_shinfo(skb),
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
- if (fastpath) {
- kfree(skb->head);
- } else {
+ /*
+ * if shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be since all we did is relocate the values
+ */
+ if (skb_cloned(skb)) {
+ /* copy this zero copy skb frags */
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- get_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
skb_release_data(skb);
+ } else {
+ skb_free_head(skb);
}
off = (data + nhead) - skb->head;
skb->head = data;
-adjust_others:
+ skb->head_frag = 0;
skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
skb->end = size;
@@ -835,21 +1095,16 @@ adjust_others:
#else
skb->end = skb->head + size;
#endif
- /* {transport,network,mac}_header and tail are relative to skb->head */
skb->tail += off;
- skb->transport_header += off;
- skb->network_header += off;
- if (skb_mac_header_was_set(skb))
- skb->mac_header += off;
- /* Only adjust this if it actually is csum_start rather than csum */
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- skb->csum_start += nhead;
+ skb_headers_offset_update(skb, nhead);
skb->cloned = 0;
skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
return 0;
+nofrags:
+ kfree(data);
nodata:
return -ENOMEM;
}
@@ -901,11 +1156,11 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask);
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
- int off;
if (!n)
return NULL;
@@ -929,15 +1184,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
copy_skb_header(n, skb);
- off = newheadroom - oldheadroom;
- if (n->ip_summed == CHECKSUM_PARTIAL)
- n->csum_start += off;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- n->transport_header += off;
- n->network_header += off;
- if (skb_mac_header_was_set(skb))
- n->mac_header += off;
-#endif
+ skb_headers_offset_update(n, newheadroom - oldheadroom);
return n;
}
@@ -990,6 +1237,29 @@ free_skb:
EXPORT_SYMBOL(skb_pad);
/**
+ * pskb_put - add data to the tail of a potentially fragmented buffer
+ * @skb: start of the buffer to use
+ * @tail: tail fragment of the buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the potentially
+ * fragmented buffer. @tail must be the last fragment of @skb -- or
+ * @skb itself. If this would exceed the total buffer size the kernel
+ * will panic. A pointer to the first byte of the extra data is
+ * returned.
+ */
+
+unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+{
+ if (tail != skb) {
+ skb->data_len += len;
+ skb->len += len;
+ }
+ return skb_put(tail, len);
+}
+EXPORT_SYMBOL_GPL(pskb_put);
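
A hedged sketch of a caller: appending a trailer to the last fragment of a chained buffer while keeping the head skb's length accounting consistent. The example assumes the caller has already ensured enough tailroom on @tail.

#include <linux/string.h>

static void example_append_trailer(struct sk_buff *skb, struct sk_buff *tail,
				   const void *trailer, int len)
{
	memcpy(pskb_put(skb, tail, len), trailer, len);
}
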
+
+/**
* skb_put - add data to a buffer
* @skb: buffer to use
* @len: amount of data to add
@@ -1082,20 +1352,20 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len)
goto drop_pages;
for (; i < nfrags; i++) {
- int end = offset + skb_shinfo(skb)->frags[i].size;
+ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (end < len) {
offset = end;
continue;
}
- skb_shinfo(skb)->frags[i++].size = len - offset;
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
drop_pages:
skb_shinfo(skb)->nr_frags = i;
for (; i < nfrags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
@@ -1114,7 +1384,7 @@ drop_pages:
return -ENOMEM;
nfrag->next = frag->next;
- kfree_skb(frag);
+ consume_skb(frag);
frag = nfrag;
*fragp = frag;
}
@@ -1198,9 +1468,11 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Estimate size of pulled pages. */
eat = delta;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size >= eat)
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size >= eat)
goto pull_pages;
- eat -= skb_shinfo(skb)->frags[i].size;
+ eat -= size;
}
/* If we need update frag list, we are in troubles.
@@ -1263,14 +1535,16 @@ pull_pages:
eat = delta;
k = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
} else {
skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
if (eat) {
skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
+ skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
eat = 0;
}
k++;
@@ -1285,8 +1559,21 @@ pull_pages:
}
EXPORT_SYMBOL(__pskb_pull_tail);
-/* Copy some data bits from skb to kernel buffer. */
-
+/**
+ * skb_copy_bits - copy bits from skb to kernel buffer
+ * @skb: source skb
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source skb to the
+ * destination buffer.
+ *
+ * CAUTION ! :
+ * If its prototype is ever changed,
+ * check arch/{*}/net/{*}.S files,
+ * since it is called from BPF assembly code.
+ */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
int start = skb_headlen(skb);
@@ -1309,21 +1596,22 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ vaddr = kmap_atomic(skb_frag_page(f));
memcpy(to,
- vaddr + skb_shinfo(skb)->frags[i].page_offset+
- offset - start, copy);
- kunmap_skb_frag(vaddr);
+ vaddr + f->page_offset + offset - start,
+ copy);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
@@ -1351,6 +1639,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
}
start = end;
}
+
if (!len)
return 0;
@@ -1368,140 +1657,122 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
put_page(spd->pages[i]);
}
-static inline struct page *linear_to_page(struct page *page, unsigned int *len,
- unsigned int *offset,
- struct sk_buff *skb, struct sock *sk)
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sock *sk)
{
- struct page *p = sk->sk_sndmsg_page;
- unsigned int off;
-
- if (!p) {
-new_page:
- p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
- if (!p)
- return NULL;
+ struct page_frag *pfrag = sk_page_frag(sk);
- off = sk->sk_sndmsg_off = 0;
- /* hold one ref to this page until it's full */
- } else {
- unsigned int mlen;
+ if (!sk_page_frag_refill(sk, pfrag))
+ return NULL;
- off = sk->sk_sndmsg_off;
- mlen = PAGE_SIZE - off;
- if (mlen < 64 && mlen < *len) {
- put_page(p);
- goto new_page;
- }
+ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
- *len = min_t(unsigned int, *len, mlen);
- }
+ memcpy(page_address(pfrag->page) + pfrag->offset,
+ page_address(page) + *offset, *len);
+ *offset = pfrag->offset;
+ pfrag->offset += *len;
- memcpy(page_address(p) + off, page_address(page) + *offset, *len);
- sk->sk_sndmsg_off += *len;
- *offset = off;
- get_page(p);
+ return pfrag->page;
+}
- return p;
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
}
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static inline int spd_fill_page(struct splice_pipe_desc *spd,
- struct pipe_inode_info *pipe, struct page *page,
- unsigned int *len, unsigned int offset,
- struct sk_buff *skb, int linear,
- struct sock *sk)
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
+ unsigned int *len, unsigned int offset,
+ bool linear,
+ struct sock *sk)
{
- if (unlikely(spd->nr_pages == pipe->buffers))
- return 1;
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
if (linear) {
- page = linear_to_page(page, len, &offset, skb, sk);
+ page = linear_to_page(page, len, &offset, sk);
if (!page)
- return 1;
- } else
- get_page(page);
-
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
spd->pages[spd->nr_pages] = page;
spd->partial[spd->nr_pages].len = *len;
spd->partial[spd->nr_pages].offset = offset;
spd->nr_pages++;
- return 0;
+ return false;
}
-static inline void __segment_seek(struct page **page, unsigned int *poff,
- unsigned int *plen, unsigned int off)
-{
- unsigned long n;
-
- *poff += off;
- n = *poff / PAGE_SIZE;
- if (n)
- *page = nth_page(*page, n);
-
- *poff = *poff % PAGE_SIZE;
- *plen -= off;
-}
-
-static inline int __splice_segment(struct page *page, unsigned int poff,
- unsigned int plen, unsigned int *off,
- unsigned int *len, struct sk_buff *skb,
- struct splice_pipe_desc *spd, int linear,
- struct sock *sk,
- struct pipe_inode_info *pipe)
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
{
if (!*len)
- return 1;
+ return true;
/* skip this segment if already processed */
if (*off >= plen) {
*off -= plen;
- return 0;
+ return false;
}
/* ignore any bits we already processed */
- if (*off) {
- __segment_seek(&page, &poff, &plen, *off);
- *off = 0;
- }
+ poff += *off;
+ plen -= *off;
+ *off = 0;
do {
unsigned int flen = min(*len, plen);
- /* the linear region may spread across several pages */
- flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
-
- if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
- return 1;
-
- __segment_seek(&page, &poff, &plen, flen);
+ if (spd_fill_page(spd, pipe, page, &flen, poff,
+ linear, sk))
+ return true;
+ poff += flen;
+ plen -= flen;
*len -= flen;
-
} while (*len && plen);
- return 0;
+ return false;
}
/*
- * Map linear and fragment data from the skb to spd. It reports failure if the
+ * Map linear and fragment data from the skb to spd. It reports true if the
* pipe is full or if we already spliced the requested length.
*/
-static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
- unsigned int *offset, unsigned int *len,
- struct splice_pipe_desc *spd, struct sock *sk)
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
- /*
- * map the linear part
+ /* map the linear part :
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
*/
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
- offset, len, skb, spd, 1, sk, pipe))
- return 1;
+ offset, len, spd,
+ skb_head_is_locked(skb),
+ sk, pipe))
+ return true;
/*
* then map the fragments
@@ -1509,12 +1780,13 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
- if (__splice_segment(f->page, f->page_offset, f->size,
- offset, len, skb, spd, 0, sk, pipe))
- return 1;
+ if (__splice_segment(skb_frag_page(f),
+ f->page_offset, skb_frag_size(f),
+ offset, len, spd, false, sk, pipe))
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -1527,22 +1799,20 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
unsigned int flags)
{
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
+ .nr_pages_max = MAX_SKB_FRAGS,
.flags = flags,
- .ops = &sock_pipe_buf_ops,
+ .ops = &nosteal_pipe_buf_ops,
.spd_release = sock_spd_release,
};
struct sk_buff *frag_iter;
struct sock *sk = skb->sk;
int ret = 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
/*
* __skb_splice_bits() only fails if the output has no room left,
* so no point in going over the frag_list for the error case.
@@ -1578,7 +1848,6 @@ done:
lock_sock(sk);
}
- splice_shrink_spd(pipe, &spd);
return ret;
}
@@ -1619,17 +1888,17 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
WARN_ON(start > offset + len);
- end = start + frag->size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
memcpy(vaddr + frag->page_offset + offset - start,
from, copy);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
@@ -1667,9 +1936,8 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
+__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
+ __wsum csum, const struct skb_checksum_ops *ops)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -1680,7 +1948,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = csum_partial(skb->data + offset, copy, csum);
+ csum = ops->update(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -1689,22 +1957,22 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
__wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
- csum2 = csum_partial(vaddr + frag->page_offset +
- offset - start, copy, 0);
- kunmap_skb_frag(vaddr);
- csum = csum_block_add(csum, csum2, pos);
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ csum2 = ops->update(vaddr + frag->page_offset +
+ offset - start, copy, 0);
+ kunmap_atomic(vaddr);
+ csum = ops->combine(csum, csum2, pos, copy);
if (!(len -= copy))
return csum;
offset += copy;
@@ -1723,9 +1991,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = skb_checksum(frag_iter, offset - start,
- copy, 0);
- csum = csum_block_add(csum, csum2, pos);
+ csum2 = __skb_checksum(frag_iter, offset - start,
+ copy, 0, ops);
+ csum = ops->combine(csum, csum2, pos, copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -1737,6 +2005,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
return csum;
}
+EXPORT_SYMBOL(__skb_checksum);
+
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+ int len, __wsum csum)
+{
+ const struct skb_checksum_ops ops = {
+ .update = csum_partial_ext,
+ .combine = csum_block_add_ext,
+ };
+
+ return __skb_checksum(skb, offset, len, csum, &ops);
+}
EXPORT_SYMBOL(skb_checksum);
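
To make the new ops indirection concrete, here is a hedged sketch of a caller supplying its own update/combine primitives to __skb_checksum(); the two helpers below simply reimplement the default behaviour and are illustrative only.

#include <linux/skbuff.h>
#include <net/checksum.h>

static __wsum example_update(const void *buff, int len, __wsum sum)
{
	return csum_partial(buff, len, sum);
}

static __wsum example_combine(__wsum csum, __wsum csum2, int offset, int len)
{
	return csum_block_add(csum, csum2, offset);
}

static const struct skb_checksum_ops example_ops = {
	.update  = example_update,
	.combine = example_combine,
};

static __wsum example_checksum(const struct sk_buff *skb)
{
	return __skb_checksum(skb, 0, skb->len, 0, &example_ops);
}
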
/* Both of above in one bottle. */
@@ -1767,7 +2047,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
__wsum csum2;
u8 *vaddr;
@@ -1775,12 +2055,12 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
csum2 = csum_partial_copy_nocheck(vaddr +
frag->page_offset +
offset - start, to,
copy, 0);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
if (!(len -= copy))
return csum;
@@ -1818,13 +2098,111 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+ /**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(from);
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @from: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to @len bytes from @from to @to by creating references
+ * to the frags in the source buffer.
+ *
+ * @hlen, as calculated by skb_zerocopy_headlen(), specifies the
+ * headroom in the @to buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
+ */
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ int ret;
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+	/* don't bother with small payloads */
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
+
+ if (hlen) {
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
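
A hedged sketch of the combined use of skb_zerocopy_headlen() and skb_zerocopy(), loosely modelled on netlink-style packet duplication; the 64 bytes of extra headroom are an arbitrary assumption of the example.

static struct sk_buff *example_zerocopy_dup(struct sk_buff *from)
{
	unsigned int hlen = skb_zerocopy_headlen(from);
	struct sk_buff *to;

	to = alloc_skb(hlen + 64, GFP_ATOMIC);
	if (!to)
		return NULL;
	skb_reserve(to, 64);		/* leaves exactly hlen of tailroom */

	if (skb_zerocopy(to, from, from->len, hlen) < 0) {
		kfree_skb(to);
		return NULL;
	}
	return to;
}
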
+
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
__wsum csum;
long csstart;
if (skb->ip_summed == CHECKSUM_PARTIAL)
- csstart = skb->csum_start - skb_headroom(skb);
+ csstart = skb_checksum_start_offset(skb);
else
csstart = skb_headlen(skb);
@@ -2040,7 +2418,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
skb->data_len = len - pos;
for (i = 0; i < nfrags; i++) {
- int size = skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + size > len) {
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -2054,10 +2432,10 @@ static inline void skb_split_no_header(struct sk_buff *skb,
* where splitting is expensive.
 * 2. Split accurately. This is what we do.
*/
- get_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_ref(skb, i);
skb_shinfo(skb1)->frags[0].page_offset += len - pos;
- skb_shinfo(skb1)->frags[0].size -= len - pos;
- skb_shinfo(skb)->frags[i].size = len - pos;
+ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
}
k++;
@@ -2078,6 +2456,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
+ skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
@@ -2101,7 +2480,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)
* @shiftlen: shift up to this many bytes
*
* Attempts to shift up to shiftlen worth of bytes, which may be less than
- * the length of the skb, from tgt to skb. Returns number bytes shifted.
+ * the length of the skb, from skb to tgt. Returns the number of bytes shifted.
* It's up to caller to free skb if everything was shifted.
*
* If @tgt runs out of frags, the whole operation is aborted.
@@ -2129,12 +2508,13 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
* commit all, so that we don't have to undo partial changes
*/
if (!to ||
- !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ fragfrom->page_offset)) {
merge = -1;
} else {
merge = to - 1;
- todo -= fragfrom->size;
+ todo -= skb_frag_size(fragfrom);
if (todo < 0) {
if (skb_prepare_for_shift(skb) ||
skb_prepare_for_shift(tgt))
@@ -2144,8 +2524,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragfrom = &skb_shinfo(skb)->frags[from];
fragto = &skb_shinfo(tgt)->frags[merge];
- fragto->size += shiftlen;
- fragfrom->size -= shiftlen;
+ skb_frag_size_add(fragto, shiftlen);
+ skb_frag_size_sub(fragfrom, shiftlen);
fragfrom->page_offset += shiftlen;
goto onlymerged;
@@ -2169,20 +2549,20 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragfrom = &skb_shinfo(skb)->frags[from];
fragto = &skb_shinfo(tgt)->frags[to];
- if (todo >= fragfrom->size) {
+ if (todo >= skb_frag_size(fragfrom)) {
*fragto = *fragfrom;
- todo -= fragfrom->size;
+ todo -= skb_frag_size(fragfrom);
from++;
to++;
} else {
- get_page(fragfrom->page);
+ __skb_frag_ref(fragfrom);
fragto->page = fragfrom->page;
fragto->page_offset = fragfrom->page_offset;
- fragto->size = todo;
+ skb_frag_size_set(fragto, todo);
fragfrom->page_offset += todo;
- fragfrom->size -= todo;
+ skb_frag_size_sub(fragfrom, todo);
todo = 0;
to++;
@@ -2197,8 +2577,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragfrom = &skb_shinfo(skb)->frags[0];
fragto = &skb_shinfo(tgt)->frags[merge];
- fragto->size += fragfrom->size;
- put_page(fragfrom->page);
+ skb_frag_size_add(fragto, skb_frag_size(fragfrom));
+ __skb_frag_unref(fragfrom);
}
/* Reposition in the original skb */
@@ -2254,18 +2634,18 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
* @data: destination pointer for data to be returned
* @st: state variable
*
- * Reads a block of skb data at &consumed relative to the
+ * Reads a block of skb data at @consumed relative to the
* lower offset specified to skb_prepare_seq_read(). Assigns
- * the head of the data block to &data and returns the length
+ * the head of the data block to @data and returns the length
* of the block or 0 if the end of the skb data or the upper
* offset has been reached.
*
* The caller is not required to consume all of the data
- * returned, i.e. &consumed is typically set to the number
+ * returned, i.e. @consumed is typically set to the number
* of bytes already consumed and the next call to
* skb_seq_read() will return the remaining part of the block.
*
- * Note 1: The size of each block of data returned can be arbitary,
+ * Note 1: The size of each block of data returned can be arbitrary,
 * this limitation is the cost for zerocopy sequential
 * reads of potentially non-linear data.
*
@@ -2279,8 +2659,13 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
unsigned int block_limit, abs_offset = consumed + st->lower_offset;
skb_frag_t *frag;
- if (unlikely(abs_offset >= st->upper_offset))
+ if (unlikely(abs_offset >= st->upper_offset)) {
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
return 0;
+ }
next_skb:
block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
@@ -2295,11 +2680,11 @@ next_skb:
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = frag->size + st->stepped_offset;
+ block_limit = skb_frag_size(frag) + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_skb_frag(frag);
+ st->frag_data = kmap_atomic(skb_frag_page(frag));
*data = (u8 *) st->frag_data + frag->page_offset +
(abs_offset - st->stepped_offset);
@@ -2308,16 +2693,16 @@ next_skb:
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
st->frag_idx++;
- st->stepped_offset += frag->size;
+ st->stepped_offset += skb_frag_size(frag);
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
@@ -2345,7 +2730,7 @@ EXPORT_SYMBOL(skb_seq_read);
void skb_abort_seq_read(struct skb_seq_state *st)
{
if (st->frag_data)
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);
@@ -2393,7 +2778,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
EXPORT_SYMBOL(skb_find_text);
/**
- * skb_append_datato_frags: - append the user data to a skb
+ * skb_append_datato_frags - append the user data to a skb
* @sk: sock structure
 * @skb: skb structure to be appended with user data.
* @getfrag: call back function to be used for getting the user data
@@ -2408,52 +2793,37 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
int len, int odd, struct sk_buff *skb),
void *from, int length)
{
- int frg_cnt = 0;
- skb_frag_t *frag = NULL;
- struct page *page = NULL;
- int copy, left;
+ int frg_cnt = skb_shinfo(skb)->nr_frags;
+ int copy;
int offset = 0;
int ret;
+ struct page_frag *pfrag = &current->task_frag;
do {
/* Return error if we don't have space for new frag */
- frg_cnt = skb_shinfo(skb)->nr_frags;
if (frg_cnt >= MAX_SKB_FRAGS)
- return -EFAULT;
+ return -EMSGSIZE;
- /* allocate a new page for next frag */
- page = alloc_pages(sk->sk_allocation, 0);
-
- /* If alloc_page fails just return failure and caller will
- * free previous allocated pages by doing kfree_skb()
- */
- if (page == NULL)
+ if (!sk_page_frag_refill(sk, pfrag))
return -ENOMEM;
- /* initialize the next frag */
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
- skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
- skb->truesize += PAGE_SIZE;
- atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
-
- /* get the new initialized frag */
- frg_cnt = skb_shinfo(skb)->nr_frags;
- frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
-
/* copy the user data to page */
- left = PAGE_SIZE - frag->page_offset;
- copy = (length > left)? left : length;
+ copy = min_t(int, length, pfrag->size - pfrag->offset);
- ret = getfrag(from, (page_address(frag->page) +
- frag->page_offset + frag->size),
- offset, copy, 0, skb);
+ ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
+ offset, copy, 0, skb);
if (ret < 0)
return -EFAULT;
/* copy was successful so update the size parameters */
- sk->sk_sndmsg_off += copy;
- frag->size += copy;
+ skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
+ copy);
+ frg_cnt++;
+ pfrag->offset += copy;
+ get_page(pfrag->page);
+
+ skb->truesize += copy;
+ atomic_add(copy, &sk->sk_wmem_alloc);
skb->len += copy;
skb->data_len += copy;
offset += copy;
@@ -2488,72 +2858,109 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
/**
* skb_segment - Perform protocol segmentation on skb.
- * @skb: buffer to segment
+ * @head_skb: buffer to segment
* @features: features for the output path (see dev->features)
*
* This function performs segmentation on the given skb. It returns
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+ netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
- struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
- unsigned int mss = skb_shinfo(skb)->gso_size;
- unsigned int doffset = skb->data - skb_mac_header(skb);
+ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+ skb_frag_t *frag = skb_shinfo(head_skb)->frags;
+ unsigned int mss = skb_shinfo(head_skb)->gso_size;
+ unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
+ struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
+ unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int headroom;
unsigned int len;
- int sg = features & NETIF_F_SG;
- int nfrags = skb_shinfo(skb)->nr_frags;
+ __be16 proto;
+ bool csum;
+ int sg = !!(features & NETIF_F_SG);
+ int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
+ int dummy;
+
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, &dummy);
+ if (unlikely(!proto))
+ return ERR_PTR(-EINVAL);
+
+ csum = !head_skb->encap_hdr_csum &&
+ !!can_checksum_protocol(features, proto);
- __skb_push(skb, doffset);
- headroom = skb_headroom(skb);
- pos = skb_headlen(skb);
+ headroom = skb_headroom(head_skb);
+ pos = skb_headlen(head_skb);
do {
struct sk_buff *nskb;
- skb_frag_t *frag;
+ skb_frag_t *nskb_frag;
int hsize;
int size;
- len = skb->len - offset;
+ len = head_skb->len - offset;
if (len > mss)
len = mss;
- hsize = skb_headlen(skb) - offset;
+ hsize = skb_headlen(head_skb) - offset;
if (hsize < 0)
hsize = 0;
if (hsize > len || !sg)
hsize = len;
- if (!hsize && i >= nfrags) {
- BUG_ON(fskb->len != len);
+ if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ (skb_headlen(list_skb) == len || sg)) {
+ BUG_ON(skb_headlen(list_skb) > len);
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ pos += skb_headlen(list_skb);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ size = skb_frag_size(frag);
+ if (pos + size > offset + len)
+ break;
- pos += len;
- nskb = skb_clone(fskb, GFP_ATOMIC);
- fskb = fskb->next;
+ i++;
+ pos += size;
+ frag++;
+ }
+
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ list_skb = list_skb->next;
if (unlikely(!nskb))
goto err;
- hsize = skb_end_pointer(nskb) - nskb->head;
+ if (unlikely(pskb_trim(nskb, len))) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ hsize = skb_end_offset(nskb);
if (skb_cow_head(nskb, doffset + headroom)) {
kfree_skb(nskb);
goto err;
}
- nskb->truesize += skb_end_pointer(nskb) - nskb->head -
- hsize;
+ nskb->truesize += skb_end_offset(nskb) - hsize;
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
- nskb = alloc_skb(hsize + doffset + headroom,
- GFP_ATOMIC);
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
+ NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
@@ -2568,121 +2975,133 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
segs = nskb;
tail = nskb;
- __copy_skb_header(nskb, skb);
- nskb->mac_len = skb->mac_len;
+ __copy_skb_header(nskb, head_skb);
+ nskb->mac_len = head_skb->mac_len;
- /* nskb and skb might have different headroom */
- if (nskb->ip_summed == CHECKSUM_PARTIAL)
- nskb->csum_start += skb_headroom(nskb) - headroom;
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
- skb_reset_mac_header(nskb);
- skb_set_network_header(nskb, skb->mac_len);
- nskb->transport_header = (nskb->network_header +
- skb_network_header_len(skb));
- skb_copy_from_linear_data(skb, nskb->data, doffset);
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
- if (fskb != skb_shinfo(skb)->frag_list)
- continue;
+ if (nskb->len == len + doffset)
+ goto perform_csum_check;
if (!sg) {
nskb->ip_summed = CHECKSUM_NONE;
- nskb->csum = skb_copy_and_csum_bits(skb, offset,
+ nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb, len),
len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
continue;
}
- frag = skb_shinfo(nskb)->frags;
+ nskb_frag = skb_shinfo(nskb)->frags;
- skb_copy_from_linear_data_offset(skb, offset,
+ skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- while (pos < offset + len && i < nfrags) {
- *frag = skb_shinfo(skb)->frags[i];
- get_page(frag->page);
- size = frag->size;
+ skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
+
+ while (pos < offset + len) {
+ if (i >= nfrags) {
+ BUG_ON(skb_headlen(list_skb));
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+
+ BUG_ON(!nfrags);
+
+ list_skb = list_skb->next;
+ }
+
+ if (unlikely(skb_shinfo(nskb)->nr_frags >=
+ MAX_SKB_FRAGS)) {
+ net_warn_ratelimited(
+ "skb_segment: too many frags: %u %u\n",
+ pos, mss);
+ goto err;
+ }
+
+ if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
+ goto err;
+
+ *nskb_frag = *frag;
+ __skb_frag_ref(nskb_frag);
+ size = skb_frag_size(nskb_frag);
if (pos < offset) {
- frag->page_offset += offset - pos;
- frag->size -= offset - pos;
+ nskb_frag->page_offset += offset - pos;
+ skb_frag_size_sub(nskb_frag, offset - pos);
}
skb_shinfo(nskb)->nr_frags++;
if (pos + size <= offset + len) {
i++;
+ frag++;
pos += size;
} else {
- frag->size -= pos + size - (offset + len);
+ skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
goto skip_fraglist;
}
- frag++;
- }
-
- if (pos < offset + len) {
- struct sk_buff *fskb2 = fskb;
-
- BUG_ON(pos + fskb->len != offset + len);
-
- pos += fskb->len;
- fskb = fskb->next;
-
- if (fskb2->next) {
- fskb2 = skb_clone(fskb2, GFP_ATOMIC);
- if (!fskb2)
- goto err;
- } else
- skb_get(fskb2);
-
- SKB_FRAG_ASSERT(nskb);
- skb_shinfo(nskb)->frag_list = fskb2;
+ nskb_frag++;
}
skip_fraglist:
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
- } while ((offset += len) < skb->len);
+
+perform_csum_check:
+ if (!csum) {
+ nskb->csum = skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ }
+ } while ((offset += len) < head_skb->len);
return segs;
err:
- while ((skb = segs)) {
- segs = skb->next;
- kfree_skb(skb);
- }
+ kfree_skb_list(segs);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);
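
A hedged sketch of the usual caller shape: a protocol's GSO callback passes the oversized skb in and receives a ->next-linked list of segments, or an ERR_PTR() on failure.

#include <linux/err.h>

static struct sk_buff *example_gso_segment(struct sk_buff *skb,
					   netdev_features_t features)
{
	struct sk_buff *segs = skb_segment(skb, features);

	if (IS_ERR(segs))
		return segs;	/* caller keeps ownership of the original skb */

	/* per-segment header fixups (lengths, ids, checksums) go here */
	return segs;
}
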
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
- struct sk_buff *p = *head;
- struct sk_buff *nskb;
- struct skb_shared_info *skbinfo = skb_shinfo(skb);
- struct skb_shared_info *pinfo = skb_shinfo(p);
- unsigned int headroom;
- unsigned int len = skb_gro_len(skb);
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
+ struct sk_buff *nskb, *lp, *p = *head;
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ unsigned int headroom;
- if (p->len + len >= 65536)
+ if (unlikely(p->len + len >= 65536))
return -E2BIG;
- if (pinfo->frag_list)
- goto merge;
- else if (headlen <= offset) {
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
skb_frag_t *frag;
skb_frag_t *frag2;
int i = skbinfo->nr_frags;
int nr_frags = pinfo->nr_frags + i;
- offset -= headlen;
-
if (nr_frags > MAX_SKB_FRAGS)
- return -E2BIG;
+ goto merge;
+ offset -= headlen;
pinfo->nr_frags = nr_frags;
skbinfo->nr_frags = 0;
@@ -2693,15 +3112,48 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
} while (--i);
frag->page_offset += offset;
- frag->size -= offset;
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ delta_truesize = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
skb->data_len = 0;
- NAPI_GRO_CB(skb)->free = 1;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ frag->page.p = page;
+ frag->page_offset = first_offset;
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+		/* We don't need to clear skbinfo->nr_frags here */
+
+ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
- } else if (skb_gro_len(p) != pinfo->gso_size)
+ }
+ if (pinfo->frag_list)
+ goto merge;
+ if (skb_gro_len(p) != pinfo->gso_size)
return -E2BIG;
headroom = skb_headroom(p);
@@ -2723,15 +3175,14 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
memcpy(skb_mac_header(nskb), skb_mac_header(p),
p->data - skb_mac_header(p));
- *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
skb_shinfo(nskb)->frag_list = p;
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
pinfo->gso_size = 0;
skb_header_release(p);
- nskb->prev = p;
+ NAPI_GRO_CB(nskb)->last = p;
nskb->data_len += p->len;
- nskb->truesize += p->len;
+ nskb->truesize += p->truesize;
nskb->len += p->len;
*head = nskb;
@@ -2741,24 +3192,37 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
p = nskb;
merge:
+ delta_truesize = skb->truesize;
if (offset > headlen) {
- skbinfo->frags[0].page_offset += offset - headlen;
- skbinfo->frags[0].size -= offset - headlen;
+ unsigned int eat = offset - headlen;
+
+ skbinfo->frags[0].page_offset += eat;
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
offset = headlen;
}
__skb_pull(skb, offset);
- p->prev->next = skb;
- p->prev = skb;
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
skb_header_release(skb);
+ lp = p;
done:
NAPI_GRO_CB(p)->count++;
p->data_len += len;
- p->truesize += len;
+ p->truesize += delta_truesize;
p->len += len;
-
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
@@ -2812,13 +3276,13 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- sg_set_page(&sg[elt], frag->page, copy,
+ sg_set_page(&sg[elt], skb_frag_page(frag), copy,
frag->page_offset+offset-start);
elt++;
if (!(len -= copy))
@@ -2849,6 +3313,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
return elt;
}
+/* Compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
+ * given sglist, without marking the sg entry that holds the last skb data as
+ * the end. So the caller can manipulate the sg list at will when padding new
+ * data after the first call, without calling sg_unmark_end to expand the list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
+ * is preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
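
Following the scenario in the comment above, a hedged sketch of a caller mapping two skb ranges into one scatterlist without intermediate end marks; the offsets and table size are assumptions of the example.

#include <linux/scatterlist.h>
#include <linux/skbuff.h>

static int example_map_two_ranges(struct sk_buff *skb, struct scatterlist *sg,
				  unsigned int nents,
				  int off1, int len1, int off2, int len2)
{
	int n1, n2;

	sg_init_table(sg, nents);

	n1 = skb_to_sgvec_nomark(skb, sg, off1, len1);		/* no end mark */
	n2 = skb_to_sgvec_nomark(skb, sg + n1, off2, len2);	/* still none */

	return n1 + n2;		/* caller marks the end when it is done */
}
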
+
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
int nsg = __skb_to_sgvec(skb, sg, offset, len);
@@ -2982,7 +3472,7 @@ static void sock_rmem_free(struct sk_buff *skb)
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf)
+ (unsigned int)sk->sk_rcvbuf)
return -ENOMEM;
skb_orphan(skb);
@@ -2990,9 +3480,12 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb->destructor = sock_rmem_free;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ /* before exiting rcu section, make sure dst is refcounted */
+ skb_dst_force(skb);
+
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -3008,12 +3501,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
- skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!skb)
- return;
-
if (hwtstamps) {
- *skb_hwtstamps(skb) =
+ *skb_hwtstamps(orig_skb) =
*hwtstamps;
} else {
/*
@@ -3021,9 +3510,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
* so keep the shared tx_flags and only
* store software time stamp
*/
- skb->tstamp = ktime_get_real();
+ orig_skb->tstamp = ktime_get_real();
}
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
@@ -3036,6 +3529,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err;
+
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ err = sock_queue_err_skb(sk, skb);
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
@@ -3053,23 +3566,396 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
if (unlikely(start > skb_headlen(skb)) ||
unlikely((int)start + off > skb_headlen(skb) - 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "bad partial csum: csum=%u/%u len=%u\n",
- start, off, skb_headlen(skb));
+ net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
+ start, off, skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) + start;
skb->csum_offset = off;
+ skb_set_transport_header(skb, start);
return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);
+static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
+ unsigned int max)
+{
+ if (skb_headlen(skb) >= len)
+ return 0;
+
+ /* If we need to pullup then pullup to the max, so we
+ * won't need to do it again.
+ */
+ if (max > skb->len)
+ max = skb->len;
+
+ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ if (skb_headlen(skb) < len)
+ return -EPROTO;
+
+ return 0;
+}
+
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ switch (proto) {
+ int err;
+
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * maximally sized IP and TCP or UDP headers.
+ */
+#define MAX_IP_HDR_LEN 128
+
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
+{
+ unsigned int off;
+ bool fragment;
+ __sum16 *csum;
+ int err;
+
+ fragment = false;
+
+ err = skb_maybe_pull_tail(skb,
+ sizeof(struct iphdr),
+ MAX_IP_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
+ fragment = true;
+
+ off = ip_hdrlen(skb);
+
+ err = -EPROTO;
+
+ if (fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * an IPv6 header, all options, and a maximal TCP or UDP header.
+ */
+#define MAX_IPV6_HDR_LEN 256
+
+#define OPT_HDR(type, skb, off) \
+ (type *)(skb_network_header(skb) + (off))
+
+static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+ u8 nexthdr;
+ unsigned int off;
+ unsigned int len;
+ bool fragment;
+ bool done;
+ __sum16 *csum;
+
+ fragment = false;
+ done = false;
+
+ off = sizeof(struct ipv6hdr);
+
+ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ while (off <= len && !done) {
+ switch (nexthdr) {
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING: {
+ struct ipv6_opt_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ipv6_opt_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_optlen(hp);
+ break;
+ }
+ case IPPROTO_AH: {
+ struct ip_auth_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ip_auth_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ip_auth_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_authlen(hp);
+ break;
+ }
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct frag_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct frag_hdr, skb, off);
+
+ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
+ fragment = true;
+
+ nexthdr = hp->nexthdr;
+ off += sizeof(struct frag_hdr);
+ break;
+ }
+ default:
+ done = true;
+ break;
+ }
+ }
+
+ err = -EPROTO;
+
+ if (!done || fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/**
+ * skb_checksum_setup - set up partial checksum offset
+ * @skb: the skb to set up
+ * @recalculate: if true the pseudo-header checksum will be recalculated
+ */
+int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ err = skb_checksum_setup_ipv4(skb, recalculate);
+ break;
+
+ case htons(ETH_P_IPV6):
+ err = skb_checksum_setup_ipv6(skb, recalculate);
+ break;
+
+ default:
+ err = -EPROTO;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
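A hedged sketch of the intended caller: a virtual-NIC receive path asking skb_checksum_setup() to re-establish partial-checksum metadata on a guest-supplied frame. The helper name and the drop-on-error policy are assumptions, not part of this patch.

#include <linux/skbuff.h>

/* Illustrative receive fixup: 'recalculate' regenerates the pseudo-header
 * checksum when the peer did not provide one.
 */
static int demo_rx_checksum_fixup(struct sk_buff *skb, bool recalculate)
{
	int err = skb_checksum_setup(skb, recalculate);

	if (err) {
		kfree_skb(skb);	/* malformed or unsupported protocol */
		return err;
	}
	return 0;
}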
+
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
- if (net_ratelimit())
- pr_warning("%s: received packets cannot be forwarded"
- " while LRO is enabled\n", skb->dev->name);
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen) {
+ skb_release_head_state(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+ } else {
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(kfree_skb_partial);
+
+/**
+ * skb_try_coalesce - try to merge skb to prior one
+ * @to: prior buffer
+ * @from: buffer to add
+ * @fragstolen: pointer to boolean
+ * @delta_truesize: how much more was allocated than was requested
+ */
+bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
+ bool *fragstolen, int *delta_truesize)
+{
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (skb_cloned(to))
+ return false;
+
+ if (len <= skb_tailroom(to)) {
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ *delta_truesize = 0;
+ return true;
+ }
+
+ if (skb_has_frag_list(to) || skb_has_frag_list(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+ skb_shinfo(from)->frags,
+ skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+ if (!skb_cloned(from))
+ skb_shinfo(from)->nr_frags = 0;
+
+ /* if the skb is not cloned this does nothing
+ * since we set nr_frags to 0.
+ */
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+ skb_frag_ref(from, i);
+
+ to->truesize += delta;
+ to->len += len;
+ to->data_len += len;
+
+ *delta_truesize = delta;
+ return true;
+}
+EXPORT_SYMBOL(skb_try_coalesce);
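A hedged sketch of a typical consumer, modelled on the TCP receive path: try to coalesce the new skb into the queue tail, account only the truesize growth, and release the donor with kfree_skb_partial(). The helper name and the exact accounting calls are illustrative.

#include <net/sock.h>
#include <linux/skbuff.h>

static void demo_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
	bool fragstolen;
	int delta;

	if (tail && skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		/* data now lives in 'tail'; charge only the growth */
		atomic_add(delta, &sk->sk_rmem_alloc);
		sk_mem_charge(sk, delta);
		kfree_skb_partial(skb, fragstolen);
		return;
	}

	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
}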
+
+/**
+ * skb_scrub_packet - scrub an skb
+ *
+ * @skb: buffer to clean
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
+ * into/from a tunnel. Some information has to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean an skb before injecting it into
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
+ */
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ if (xnet)
+ skb_orphan(skb);
+ skb->tstamp.tv64 = 0;
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ skb->mark = 0;
+ secpath_reset(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+}
+EXPORT_SYMBOL_GPL(skb_scrub_packet);
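A hedged sketch of the tunnel use case mentioned above: after decapsulation, scrub the packet, setting @xnet when the inner packet is about to cross into another network namespace. The device and function names are illustrative.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>

static void demo_decap_finish(struct sk_buff *skb,
			      struct net_device *tunnel_dev,
			      struct net_device *target_dev)
{
	bool xnet = !net_eq(dev_net(tunnel_dev), dev_net(target_dev));

	skb_scrub_packet(skb, xnet);
	skb->dev = target_dev;
	netif_rx(skb);
}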
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ return tcp_hdrlen(skb) + shinfo->gso_size;
+
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return shinfo->gso_size;
+}
+EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
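For illustration only: a shaper or MTU check that wants the per-segment size at the network layer can add the L3-to-L4 header gap on top of skb_gso_transport_seglen(); the helper below is an assumption, not part of the patch.

#include <linux/skbuff.h>

static unsigned int demo_gso_network_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) -
			       skb_network_header(skb);

	return hdr_len + skb_gso_transport_seglen(skb);
}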
diff --git a/net/core/sock.c b/net/core/sock.c
index fb608011146..026e01f7027 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -89,8 +89,11 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/errno.h>
+#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
@@ -111,9 +114,11 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
+#include <linux/static_key.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
@@ -125,13 +130,107 @@
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
#include <linux/filter.h>
+#include <trace/events/sock.h>
+
#ifdef CONFIG_INET
#include <net/tcp.h>
#endif
+#include <net/busy_poll.h>
+
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap in the
+ * user namespace @user_ns when the socket was created, and whether the
+ * current process has it as well.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap in all
+ * user namespaces when the socket was created, and whether the current
+ * process has it as well.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had the capability @cap over the
+ * network namespace the socket is a member of when the socket was created,
+ * and whether the current process has it as well.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
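A hedged sketch of how these helpers are meant to be used: gating a privileged piece of a diagnostics reply on the socket opener's capability rather than only the current task's. The helper name echoes the may_report_filterinfo flag consumed by sock_diag_put_filterinfo() further down, but is itself an assumption.

#include <net/sock.h>
#include <linux/capability.h>

static bool demo_may_report_filterinfo(const struct sock *requester)
{
	return sk_net_capable(requester, CAP_NET_ADMIN);
}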
+
+
+#ifdef CONFIG_MEMCG_KMEM
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ struct proto *proto;
+ int ret = 0;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry(proto, &proto_list, node) {
+ if (proto->init_cgroup) {
+ ret = proto->init_cgroup(memcg, ss);
+ if (ret)
+ goto out;
+ }
+ }
+
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+out:
+ list_for_each_entry_continue_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
+{
+ struct proto *proto;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+}
+#endif
+
/*
* Each address family might have different locking rules, so we have
* one slock key per address family:
@@ -139,6 +238,11 @@
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+#if defined(CONFIG_MEMCG_KMEM)
+struct static_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+#endif
+
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
@@ -157,8 +261,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
"sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
- "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" ,
- "sk_lock-AF_MAX"
+ "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
@@ -173,8 +277,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-27" , "slock-28" , "slock-AF_CAN" ,
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
"slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
- "slock-AF_IEEE802154", "slock-AF_CAIF" ,
- "slock-AF_MAX"
+ "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
@@ -189,8 +293,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-27" , "clock-28" , "clock-AF_CAN" ,
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
"clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
- "clock-AF_IEEE802154", "clock-AF_CAIF" ,
- "clock-AF_MAX"
+ "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
};
/*
@@ -205,24 +309,76 @@ static struct lock_class_key af_callback_keys[AF_MAX];
* not depend upon such differences.
*/
#define _SK_MEM_PACKETS 256
-#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
+#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-/* Maximal space eaten by iovec or ancilliary data plus some space */
+/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
-#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
-int net_cls_subsys_id = -1;
-EXPORT_SYMBOL_GPL(net_cls_subsys_id);
-#endif
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_key_slow_inc(&memalloc_socks);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_key_slow_dec(&memalloc_socks);
+
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. However, if SOCK_MEMALLOC is cleared while
+ * it has rmem allocations there is a risk that the user of the
+ * socket cannot make forward progress due to exceeding the rmem
+ * limits. By rights, sk_clear_memalloc() should only be called
+ * on sockets being torn down but warn and reset the accounting if
+ * that assumption breaks.
+ */
+ if (WARN_ON(sk->sk_forward_alloc))
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
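A hedged sketch of the expected pairing: a kernel socket that carries swap I/O (for example an NBD or NFS transport) is flagged with sk_set_memalloc() when it is set up and must clear the flag before teardown. The hook names are illustrative.

#include <net/sock.h>

static void demo_swap_transport_setup(struct sock *sk)
{
	/* allow this socket to dip into emergency memory reserves */
	sk_set_memalloc(sk);
}

static void demo_swap_transport_teardown(struct sock *sk)
{
	/* must be undone before the socket is destroyed */
	sk_clear_memalloc(sk);
}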
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned long pflags = current->flags;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ current->flags |= PF_MEMALLOC;
+ ret = sk->sk_backlog_rcv(sk, skb);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
@@ -241,9 +397,8 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
*timeo_p = 0;
if (warned < 10 && net_ratelimit()) {
warned++;
- printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
- "tries to set negative timeout\n",
- current->comm, task_pid_nr(current));
+ pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
+ __func__, current->comm, task_pid_nr(current));
}
return 0;
}
@@ -261,20 +416,20 @@ static void sock_warn_obsolete_bsdism(const char *name)
static char warncomm[TASK_COMM_LEN];
if (strcmp(warncomm, current->comm) && warned < 5) {
strcpy(warncomm, current->comm);
- printk(KERN_WARNING "process `%s' is using obsolete "
- "%s SO_BSDCOMPAT\n", warncomm, name);
+ pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
+ warncomm, name);
warned++;
}
}
-static void sock_disable_timestamp(struct sock *sk, int flag)
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
- if (sock_flag(sk, flag)) {
- sock_reset_flag(sk, flag);
- if (!sock_flag(sk, SOCK_TIMESTAMP) &&
- !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
net_disable_timestamp();
- }
}
}
@@ -286,12 +441,9 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
+ trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
@@ -299,7 +451,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (err)
return err;
- if (!sk_rmem_schedule(sk, skb->truesize)) {
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
}
@@ -325,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
@@ -339,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, skb)) {
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
}
@@ -356,7 +508,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
rc = sk_backlog_rcv(sk, skb);
mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else if (sk_add_backlog(sk, skb)) {
+ } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
@@ -372,19 +524,13 @@ discard_and_relse:
}
EXPORT_SYMBOL(sk_receive_skb);
-void sk_reset_txq(struct sock *sk)
-{
- sk_tx_queue_clear(sk);
-}
-EXPORT_SYMBOL(sk_reset_txq);
-
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- rcu_assign_pointer(sk->sk_dst_cache, NULL);
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
}
@@ -407,7 +553,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+ int optlen)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -417,7 +564,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
/* Sorry... */
ret = -EPERM;
- if (!capable(CAP_NET_RAW))
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
@@ -464,6 +611,46 @@ out:
return ret;
}
+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+ int __user *optlen, int len)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+
+ if (sk->sk_bound_dev_if == 0) {
+ len = 0;
+ goto zero;
+ }
+
+ ret = -EINVAL;
+ if (len < IFNAMSIZ)
+ goto out;
+
+ ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ if (ret)
+ goto out;
+
+ len = strlen(devname) + 1;
+
+ ret = -EFAULT;
+ if (copy_to_user(optval, devname, len))
+ goto out;
+
+zero:
+ ret = -EFAULT;
+ if (put_user(len, optlen))
+ goto out;
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
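From userspace, the new sock_getbindtodevice() path is what services a getsockopt(SO_BINDTODEVICE) query; a hedged sketch in ordinary C, not part of the patch, with the helper name as an assumption.

#include <stdio.h>
#include <sys/socket.h>
#include <net/if.h>

static int demo_print_bound_dev(int fd)
{
	char name[IFNAMSIZ];
	socklen_t len = sizeof(name);

	if (getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len) < 0)
		return -1;

	if (len == 0)
		printf("socket is not bound to a device\n");
	else
		printf("bound to %s\n", name);
	return 0;
}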
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
if (valbool)
@@ -491,7 +678,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
*/
if (optname == SO_BINDTODEVICE)
- return sock_bindtodevice(sk, optval, optlen);
+ return sock_setbindtodevice(sk, optval, optlen);
if (optlen < sizeof(int))
return -EINVAL;
@@ -511,7 +698,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_valbool_flag(sk, SOCK_DBG, valbool);
break;
case SO_REUSEADDR:
- sk->sk_reuse = valbool;
+ sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ break;
+ case SO_REUSEPORT:
+ sk->sk_reuseport = valbool;
break;
case SO_TYPE:
case SO_PROTOCOL:
@@ -527,23 +717,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
break;
case SO_SNDBUF:
/* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_wmem_max)
- val = sysctl_wmem_max;
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- if ((val * 2) < SOCK_MIN_SNDBUF)
- sk->sk_sndbuf = SOCK_MIN_SNDBUF;
- else
- sk->sk_sndbuf = val * 2;
-
- /*
- * Wake up sending tasks if we
- * upped the value.
- */
+ sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ /* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -556,12 +738,11 @@ set_sndbuf:
case SO_RCVBUF:
/* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_rmem_max)
- val = sysctl_rmem_max;
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
/*
@@ -579,10 +760,7 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- if ((val * 2) < SOCK_MIN_RCVBUF)
- sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
- else
- sk->sk_rcvbuf = val * 2;
+ sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
break;
case SO_RCVBUFFORCE:
@@ -594,7 +772,8 @@ set_rcvbuf:
case SO_KEEPALIVE:
#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP)
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
tcp_set_keepalive(sk, valbool);
#endif
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
@@ -605,11 +784,12 @@ set_rcvbuf:
break;
case SO_NO_CHECK:
- sk->sk_no_check = valbool;
+ sk->sk_no_check_tx = valbool;
break;
case SO_PRIORITY:
- if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+ if ((val >= 0 && val <= 6) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -679,7 +859,7 @@ set_rcvbuf:
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
val & SOF_TIMESTAMPING_SOFTWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
@@ -719,6 +899,13 @@ set_rcvbuf:
ret = sk_detach_filter(sk);
break;
+ case SO_LOCK_FILTER:
+ if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+ ret = -EPERM;
+ else
+ sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+ break;
+
case SO_PASSSEC:
if (valbool)
set_bit(SOCK_PASSSEC, &sock->flags);
@@ -726,7 +913,7 @@ set_rcvbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
ret = -EPERM;
else
sk->sk_mark = val;
@@ -735,11 +922,48 @@ set_rcvbuf:
/* We implement the SO_SNDLOWAT etc to
not be settable (1003.1g 5.3) */
case SO_RXQ_OVFL:
- if (valbool)
- sock_set_flag(sk, SOCK_RXQ_OVFL);
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
+ break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
+ case SO_PEEK_OFF:
+ if (sock->ops->set_peek_off)
+ ret = sock->ops->set_peek_off(sk, val);
else
- sock_reset_flag(sk, SOCK_RXQ_OVFL);
+ ret = -EOPNOTSUPP;
break;
+
+ case SO_NOFCS:
+ sock_valbool_flag(sk, SOCK_NOFCS, valbool);
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ /* allow unprivileged users to decrease the value */
+ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else {
+ if (val < 0)
+ ret = -EINVAL;
+ else
+ sk->sk_ll_usec = val;
+ }
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -750,19 +974,18 @@ set_rcvbuf:
EXPORT_SYMBOL(sock_setsockopt);
-void cred_to_ucred(struct pid *pid, const struct cred *cred,
- struct ucred *ucred)
+static void cred_to_ucred(struct pid *pid, const struct cred *cred,
+ struct ucred *ucred)
{
ucred->pid = pid_vnr(pid);
ucred->uid = ucred->gid = -1;
if (cred) {
struct user_namespace *current_ns = current_user_ns();
- ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
- ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
}
}
-EXPORT_SYMBOL_GPL(cred_to_ucred);
int sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
@@ -795,7 +1018,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BROADCAST:
- v.val = !!sock_flag(sk, SOCK_BROADCAST);
+ v.val = sock_flag(sk, SOCK_BROADCAST);
break;
case SO_SNDBUF:
@@ -810,8 +1033,12 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reuse;
break;
+ case SO_REUSEPORT:
+ v.val = sk->sk_reuseport;
+ break;
+
case SO_KEEPALIVE:
- v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
+ v.val = sock_flag(sk, SOCK_KEEPOPEN);
break;
case SO_TYPE:
@@ -833,11 +1060,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_OOBINLINE:
- v.val = !!sock_flag(sk, SOCK_URGINLINE);
+ v.val = sock_flag(sk, SOCK_URGINLINE);
break;
case SO_NO_CHECK:
- v.val = sk->sk_no_check;
+ v.val = sk->sk_no_check_tx;
break;
case SO_PRIORITY:
@@ -846,7 +1073,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_LINGER:
lv = sizeof(v.ling);
- v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
+ v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
v.ling.l_linger = sk->sk_lingertime / HZ;
break;
@@ -912,7 +1139,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PASSCRED:
- v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
+ v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
break;
case SO_PEERCRED:
@@ -947,7 +1174,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PASSSEC:
- v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
+ v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_PEERSEC:
@@ -958,7 +1185,53 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_RXQ_OVFL:
- v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
+ v.val = sock_flag(sk, SOCK_RXQ_OVFL);
+ break;
+
+ case SO_WIFI_STATUS:
+ v.val = sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
+ case SO_PEEK_OFF:
+ if (!sock->ops->set_peek_off)
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_peek_off;
+ break;
+ case SO_NOFCS:
+ v.val = sock_flag(sk, SOCK_NOFCS);
+ break;
+
+ case SO_BINDTODEVICE:
+ return sock_getbindtodevice(sk, optval, optlen, len);
+
+ case SO_GET_FILTER:
+ len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+
+ case SO_LOCK_FILTER:
+ v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+ break;
+
+ case SO_BPF_EXTENSIONS:
+ v.val = bpf_tell_extensions();
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ v.val = sk->sk_ll_usec;
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ v.val = sk->sk_max_pacing_rate;
break;
default:
@@ -992,23 +1265,42 @@ static inline void sock_lock_init(struct sock *sk)
/*
* Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
* even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
- BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
- sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
- sizeof(osk->sk_tx_queue_mapping));
- memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+ memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
security_sk_clone(osk, nsk);
#endif
}
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+ unsigned long nulls1, nulls2;
+
+ nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+ nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+ if (nulls1 > nulls2)
+ swap(nulls1, nulls2);
+
+ if (nulls1 != 0)
+ memset((char *)sk, 0, nulls1);
+ memset((char *)sk + nulls1 + sizeof(void *), 0,
+ nulls2 - nulls1 - sizeof(void *));
+ memset((char *)sk + nulls2 + sizeof(void *), 0,
+ size - nulls2 - sizeof(void *));
+}
+EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
@@ -1021,19 +1313,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!sk)
return sk;
if (priority & __GFP_ZERO) {
- /*
- * caches using SLAB_DESTROY_BY_RCU should let
- * sk_node.next un-modified. Special care is taken
- * when initializing object to zero.
- */
- if (offsetof(struct sock, sk_node.next) != 0)
- memset(sk, 0, offsetof(struct sock, sk_node.next));
- memset(&sk->sk_node.pprev, 0,
- prot->obj_size - offsetof(struct sock,
- sk_node.pprev));
+ if (prot->clear_sk)
+ prot->clear_sk(sk, prot->obj_size);
+ else
+ sk_prot_clear_nulls(sk, prot->obj_size);
}
- }
- else
+ } else
sk = kmalloc(prot->obj_size, priority);
if (sk != NULL) {
@@ -1075,18 +1360,15 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
module_put(owner);
}
-#ifdef CONFIG_CGROUPS
-void sock_update_classid(struct sock *sk)
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+void sock_update_netprioidx(struct sock *sk)
{
- u32 classid;
+ if (in_interrupt())
+ return;
- rcu_read_lock(); /* doing current task, which cannot vanish. */
- classid = task_cls_classid(current);
- rcu_read_unlock();
- if (classid && classid != sk->sk_classid)
- sk->sk_classid = classid;
+ sk->sk_cgrp_prioidx = task_netprioidx(current);
}
-EXPORT_SYMBOL(sock_update_classid);
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
/**
@@ -1114,6 +1396,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
+ sock_update_netprioidx(sk);
}
return sk;
@@ -1131,15 +1414,14 @@ static void __sk_free(struct sock *sk)
atomic_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
- rcu_assign_pointer(sk->sk_filter, NULL);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk, SOCK_TIMESTAMP);
- sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
if (atomic_read(&sk->sk_omem_alloc))
- printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
- __func__, atomic_read(&sk->sk_omem_alloc));
+ pr_debug("%s: optmem leakage (%d bytes) detected\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
@@ -1151,7 +1433,7 @@ static void __sk_free(struct sock *sk)
void sk_free(struct sock *sk)
{
/*
- * We substract one from sk_wmem_alloc and can know if
+ * We subtract one from sk_wmem_alloc and can know if
* some packets are still in some tx queue.
* If not null, sock_wfree() will call __sk_free(sk) later
*/
@@ -1161,10 +1443,10 @@ void sk_free(struct sock *sk)
EXPORT_SYMBOL(sk_free);
/*
- * Last sock_put should drop referrence to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking referrence to stopping namespace
+ * Last sock_put should drop reference to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking reference to stopping namespace
* is not an option.
- * Take referrence to a socket to remove it from hash _alive_ and after that
+ * Take reference to a socket to remove it from hash _alive_ and after that
* destroy it in the context of init_net.
*/
void sk_release_kernel(struct sock *sk)
@@ -1180,7 +1462,20 @@ void sk_release_kernel(struct sock *sk)
}
EXPORT_SYMBOL(sk_release_kernel);
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+static void sk_update_clone(const struct sock *sk, struct sock *newsk)
+{
+ if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+ sock_update_memcg(newsk);
+}
+
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
@@ -1233,6 +1528,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
sk_free(newsk);
newsk = NULL;
goto out;
@@ -1262,17 +1558,18 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
+ sk_update_clone(sk, newsk);
+
if (newsk->sk_prot->sockets_allocated)
- percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+ sk_sockets_allocated_inc(newsk);
- if (sock_flag(newsk, SOCK_TIMESTAMP) ||
- sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
}
out:
return newsk;
}
-EXPORT_SYMBOL_GPL(sk_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
@@ -1287,24 +1584,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
+ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
}
}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
-void __init sk_init(void)
-{
- if (totalram_pages <= 4096) {
- sysctl_wmem_max = 32767;
- sysctl_rmem_max = 32767;
- sysctl_wmem_default = 32767;
- sysctl_rmem_default = 32767;
- } else if (totalram_pages >= 131072) {
- sysctl_wmem_max = 131071;
- sysctl_rmem_max = 131071;
- }
-}
-
/*
* Simple resource managers for sockets.
*/
@@ -1336,6 +1621,25 @@ void sock_wfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_wfree);
+void skb_orphan_partial(struct sk_buff *skb)
+{
+ /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
+ * so we do not completely orphan skb, but transfer all
+ * accounted bytes but one, to avoid unexpected reorders.
+ */
+ if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+ || skb->destructor == tcp_wfree
+#endif
+ ) {
+ atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
+ skb->truesize = 1;
+ } else {
+ skb_orphan(skb);
+ }
+}
+EXPORT_SYMBOL(skb_orphan_partial);
+
/*
* Read buffer destructor automatically called from kfree_skb.
*/
@@ -1349,13 +1653,25 @@ void sock_rfree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_rfree);
+void sock_edemux(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+#ifdef CONFIG_INET
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put(inet_twsk(sk));
+ else
+#endif
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
-int sock_i_uid(struct sock *sk)
+kuid_t sock_i_uid(struct sock *sk)
{
- int uid;
+ kuid_t uid;
read_lock_bh(&sk->sk_callback_lock);
- uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
+ uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
read_unlock_bh(&sk->sk_callback_lock);
return uid;
}
@@ -1390,27 +1706,11 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
EXPORT_SYMBOL(sock_wmalloc);
/*
- * Allocate a skb from the socket's receive buffer.
- */
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
- gfp_t priority)
-{
- if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
- struct sk_buff *skb = alloc_skb(size, priority);
- if (skb) {
- skb_set_owner_r(skb, sk);
- return skb;
- }
- }
- return NULL;
-}
-
-/*
* Allocate a memory block from the socket's option memory buffer.
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned)size <= sysctl_optmem_max &&
+ if ((unsigned int)size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
@@ -1470,19 +1770,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
- int *errcode)
+ int *errcode, int max_page_order)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ unsigned long chunk;
gfp_t gfp_mask;
long timeo;
int err;
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ struct page *page;
+ int i;
- gfp_mask = sk->sk_allocation;
- if (gfp_mask & __GFP_WAIT)
- gfp_mask |= __GFP_REPEAT;
+ err = -EMSGSIZE;
+ if (npages > MAX_SKB_FRAGS)
+ goto failure;
timeo = sock_sndtimeo(sk, noblock);
- while (1) {
+ while (!skb) {
err = sock_error(sk);
if (err != 0)
goto failure;
@@ -1491,54 +1795,54 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- skb = alloc_skb(header_len, gfp_mask);
- if (skb) {
- int npages;
- int i;
-
- /* No pages, we're done... */
- if (!data_len)
- break;
-
- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- skb->truesize += data_len;
- skb_shinfo(skb)->nr_frags = npages;
- for (i = 0; i < npages; i++) {
- struct page *page;
- skb_frag_t *frag;
-
- page = alloc_pages(sk->sk_allocation, 0);
- if (!page) {
- err = -ENOBUFS;
- skb_shinfo(skb)->nr_frags = i;
- kfree_skb(skb);
- goto failure;
- }
-
- frag = &skb_shinfo(skb)->frags[i];
- frag->page = page;
- frag->page_offset = 0;
- frag->size = (data_len >= PAGE_SIZE ?
- PAGE_SIZE :
- data_len);
- data_len -= PAGE_SIZE;
- }
+ if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
+ goto failure;
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
+ continue;
+ }
- /* Full success... */
- break;
- }
- err = -ENOBUFS;
+ err = -ENOBUFS;
+ gfp_mask = sk->sk_allocation;
+ if (gfp_mask & __GFP_WAIT)
+ gfp_mask |= __GFP_REPEAT;
+
+ skb = alloc_skb(header_len, gfp_mask);
+ if (!skb)
goto failure;
+
+ skb->truesize += data_len;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages(sk->sk_allocation |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ }
+ order--;
+ }
+ page = alloc_page(sk->sk_allocation);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
}
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = -EAGAIN;
- if (!timeo)
- goto failure;
- if (signal_pending(current))
- goto interrupted;
- timeo = sock_wait_for_wmem(sk, timeo);
}
skb_set_owner_w(skb, sk);
@@ -1547,6 +1851,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
interrupted:
err = sock_intr_errno(timeo);
failure:
+ kfree_skb(skb);
*errcode = err;
return NULL;
}
@@ -1555,10 +1860,66 @@ EXPORT_SYMBOL(sock_alloc_send_pskb);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
int noblock, int *errcode)
{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+
+/**
+ * skb_page_frag_refill - check that a page_frag contains enough room
+ * @sz: minimum size of the fragment we want to get
+ * @pfrag: pointer to page_frag
+ * @prio: priority for memory allocation
+ *
+ * Note: While this allocator tries to use high order pages, there is
+ * no guarantee that allocations succeed. Therefore, @sz MUST be
+ * less than or equal to PAGE_SIZE.
+ */
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+{
+ int order;
+
+ if (pfrag->page) {
+ if (atomic_read(&pfrag->page->_count) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ order = SKB_FRAG_PAGE_ORDER;
+ do {
+ gfp_t gfp = prio;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+ pfrag->page = alloc_pages(gfp, order);
+ if (likely(pfrag->page)) {
+ pfrag->offset = 0;
+ pfrag->size = PAGE_SIZE << order;
+ return true;
+ }
+ } while (--order >= 0);
+
+ return false;
+}
+EXPORT_SYMBOL(skb_page_frag_refill);
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+ return true;
+
+ sk_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
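A hedged sketch of the sendmsg-side consumer, loosely modelled on TCP: refill the per-socket page fragment, copy user data into it, and attach the fragment to the skb under construction. The helper name is an assumption and merge/accounting corner cases are elided.

#include <net/sock.h>
#include <linux/skbuff.h>
#include <linux/mm.h>

static int demo_append_user_data(struct sock *sk, struct sk_buff *skb,
				 char __user *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	int err;

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;

	err = skb_copy_to_page_nocache(sk, from, skb, pfrag->page,
				       pfrag->offset, copy);
	if (err)
		return err;

	/* attach the fragment and advance the cached offset */
	get_page(pfrag->page);
	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
			   pfrag->page, pfrag->offset, copy);
	pfrag->offset += copy;
	return 0;
}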
+
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
@@ -1590,6 +1951,7 @@ static void __release_sock(struct sock *sk)
do {
struct sk_buff *next = skb->next;
+ prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
@@ -1654,30 +2016,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
struct proto *prot = sk->sk_prot;
int amt = sk_mem_pages(size);
long allocated;
+ int parent_status = UNDER_LIMIT;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+ allocated = sk_memory_allocated_add(sk, amt, &parent_status);
/* Under limit. */
- if (allocated <= prot->sysctl_mem[0]) {
- if (prot->memory_pressure && *prot->memory_pressure)
- *prot->memory_pressure = 0;
+ if (parent_status == UNDER_LIMIT &&
+ allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
return 1;
}
- /* Under pressure. */
- if (allocated > prot->sysctl_mem[1])
- if (prot->enter_memory_pressure)
- prot->enter_memory_pressure(sk);
+ /* Under pressure. (we or our parents) */
+ if ((parent_status > SOFT_LIMIT) ||
+ allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
- /* Over hard limit. */
- if (allocated > prot->sysctl_mem[2])
+ /* Over hard limit (we or our parents) */
+ if ((parent_status == OVER_LIMIT) ||
+ (allocated > sk_prot_mem_limits(sk, 2)))
goto suppress_allocation;
/* guarantee minimum buffer size under pressure */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
return 1;
+
} else { /* SK_MEM_SEND */
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1687,13 +2053,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
return 1;
}
- if (prot->memory_pressure) {
+ if (sk_has_memory_pressure(sk)) {
int alloc;
- if (!*prot->memory_pressure)
+ if (!sk_under_memory_pressure(sk))
return 1;
- alloc = percpu_counter_read_positive(prot->sockets_allocated);
- if (prot->sysctl_mem[2] > alloc *
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
atomic_read(&sk->sk_rmem_alloc) +
sk->sk_forward_alloc))
@@ -1712,9 +2078,13 @@ suppress_allocation:
return 1;
}
+ trace_sock_exceed_buf_limit(sk, prot, allocated);
+
/* Alas. Undo changes. */
sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
- atomic_long_sub(amt, prot->memory_allocated);
+
+ sk_memory_allocated_sub(sk, amt);
+
return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1725,15 +2095,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reclaim(struct sock *sk)
{
- struct proto *prot = sk->sk_prot;
-
- atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
- prot->memory_allocated);
+ sk_memory_allocated_sub(sk,
+ sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
- if (prot->memory_pressure && *prot->memory_pressure &&
- (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
- *prot->memory_pressure = 0;
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -1877,14 +2245,14 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
- wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
@@ -1936,7 +2304,7 @@ EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (timer_pending(timer) && del_timer(timer))
+ if (del_timer(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
@@ -1981,8 +2349,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ sk->sk_peek_off = -1;
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
@@ -1993,6 +2362,13 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_stamp = ktime_set(-1L, 0);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ sk->sk_napi_id = 0;
+ sk->sk_ll_usec = sysctl_net_busy_read;
+#endif
+
+ sk->sk_max_pacing_rate = ~0U;
+ sk->sk_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -2029,7 +2405,14 @@ void release_sock(struct sock *sk)
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
- sk->sk_lock.owned = 0;
+
+ /* Warning : release_cb() might need to release sk ownership,
+ * ie call sock_release_ownership(sk) before us.
+ */
+ if (sk->sk_prot->release_cb)
+ sk->sk_prot->release_cb(sk);
+
+ sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
wake_up(&sk->sk_lock.wq);
spin_unlock_bh(&sk->sk_lock.slock);
@@ -2104,20 +2487,65 @@ EXPORT_SYMBOL(sock_get_timestampns);
void sock_enable_timestamp(struct sock *sk, int flag)
{
if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
sock_set_flag(sk, flag);
/*
* we just set one of the two flags which require net
* time stamping, but time stamping might have been on
* already because of the other one
*/
- if (!sock_flag(sk,
- flag == SOCK_TIMESTAMP ?
- SOCK_TIMESTAMPING_RX_SOFTWARE :
- SOCK_TIMESTAMP))
+ if (!(previous_flags & SK_FLAGS_TIMESTAMP))
net_enable_timestamp();
}
}
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+ int level, int type)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb, *skb2;
+ int copied, err;
+
+ err = -EAGAIN;
+ skb = skb_dequeue(&sk->sk_error_queue);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+ put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+ /* Reset and regenerate socket error */
+ spin_lock_bh(&sk->sk_error_queue.lock);
+ sk->sk_err = 0;
+ if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+ sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+ spin_unlock_bh(&sk->sk_error_queue.lock);
+ sk->sk_error_report(sk);
+ } else
+ spin_unlock_bh(&sk->sk_error_queue.lock);
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
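A hedged sketch of a recvmsg() branch that drains the error queue with this helper; AF_PACKET uses the same pattern for TX timestamps, though the level/type constants below are only one possible choice and the surrounding function is illustrative.

#include <net/sock.h>
#include <linux/if_packet.h>

static int demo_recv_errqueue_branch(struct sock *sk, struct msghdr *msg,
				     size_t len, int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len,
					  SOL_PACKET, PACKET_TX_TIMESTAMP);

	return -EAGAIN;	/* normal data path elided in this sketch */
}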
+
/*
* Get a socket option on an socket.
*
@@ -2221,13 +2649,16 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR 64 /* should be enough for the first time */
struct prot_inuse {
@@ -2307,7 +2738,7 @@ static void assign_proto_idx(struct proto *prot)
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
- printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+ pr_err("PROTO_INUSE_NR exhausted\n");
return;
}
@@ -2337,8 +2768,8 @@ int proto_register(struct proto *prot, int alloc_slab)
NULL);
if (prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
- prot->name);
+ pr_crit("%s: Can't create sock SLAB cache!\n",
+ prot->name);
goto out;
}
@@ -2352,8 +2783,8 @@ int proto_register(struct proto *prot, int alloc_slab)
SLAB_HWCACHE_ALIGN, NULL);
if (prot->rsk_prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
- prot->name);
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
goto out_free_request_sock_slab_name;
}
}
@@ -2376,10 +2807,10 @@ int proto_register(struct proto *prot, int alloc_slab)
}
}
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list);
assign_proto_idx(prot);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
return 0;
out_free_timewait_sock_slab_name:
@@ -2402,10 +2833,10 @@ EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
release_proto_idx(prot);
list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
@@ -2428,9 +2859,9 @@ EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(proto_list_lock)
+ __acquires(proto_list_mutex)
{
- read_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
return seq_list_start_head(&proto_list, *pos);
}
@@ -2440,25 +2871,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void proto_seq_stop(struct seq_file *seq, void *v)
- __releases(proto_list_lock)
+ __releases(proto_list_mutex)
{
- read_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
return method == NULL ? 'n' : 'y';
}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
+
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto),
- proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
- proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
proto->max_header,
proto->slab == NULL ? "no" : "yes",
module_name(proto->owner),
@@ -2524,7 +2966,7 @@ static const struct file_operations proto_seq_fops = {
static __net_init int proto_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
+ if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
return -ENOMEM;
return 0;
@@ -2532,7 +2974,7 @@ static __net_init int proto_init_net(struct net *net)
static __net_exit void proto_exit_net(struct net *net)
{
- proc_net_remove(net, "protocols");
+ remove_proc_entry("protocols", net->proc_net);
}
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 00000000000..a4216a4c957
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,231 @@
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <net/sock.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
+static DEFINE_MUTEX(sock_diag_table_mutex);
+
+int sock_diag_check_cookie(void *sk, __u32 *cookie)
+{
+ if ((cookie[0] != INET_DIAG_NOCOOKIE ||
+ cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
+ return -ESTALE;
+ else
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(void *sk, __u32 *cookie)
+{
+ cookie[0] = (u32)(unsigned long)sk;
+ cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ u32 mem[SK_MEMINFO_VARS];
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+
+ return nla_put(skb, attrtype, sizeof(mem), &mem);
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
+ struct sk_buff *skb, int attrtype)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ struct nlattr *attr;
+ unsigned int flen;
+ int err = 0;
+
+ if (!may_report_filterinfo) {
+ nla_reserve(skb, attrtype, 0);
+ return 0;
+ }
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (!filter)
+ goto out;
+
+ fprog = filter->orig_prog;
+ flen = sk_filter_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
+ if (attr == NULL) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(nla_data(attr), fprog->filter, flen);
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(sock_diag_put_filterinfo);
+
+void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = fn;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(const struct sock_diag_handler *hndl)
+{
+ int err = 0;
+
+ if (hndl->family >= AF_MAX)
+ return -EINVAL;
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (sock_diag_handlers[hndl->family])
+ err = -EBUSY;
+ else
+ sock_diag_handlers[hndl->family] = hndl;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(const struct sock_diag_handler *hnld)
+{
+ int family = hnld->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ mutex_lock(&sock_diag_table_mutex);
+ BUG_ON(sock_diag_handlers[family] != hnld);
+ sock_diag_handlers[family] = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = nlmsg_data(nlh);
+ const struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ if (req->sdiag_family >= AF_MAX)
+ return -EINVAL;
+
+ if (sock_diag_handlers[req->sdiag_family] == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, req->sdiag_family);
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[req->sdiag_family];
+ if (hndl == NULL)
+ err = -ENOENT;
+ else
+ err = hndl->dump(skb, nlh);
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ case DCCPDIAG_GETSOCK:
+ if (inet_rcv_compat == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (inet_rcv_compat != NULL)
+ ret = inet_rcv_compat(skb, nlh);
+ else
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ return __sock_diag_rcv_msg(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&sock_diag_mutex);
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+ mutex_unlock(&sock_diag_mutex);
+}
+
+static int __net_init diag_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = sock_diag_rcv,
+ };
+
+ net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
+ return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->diag_nlsk);
+ net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+ .init = diag_net_init,
+ .exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+ return register_pernet_subsys(&diag_net_ops);
+}
+
+static void __exit sock_diag_exit(void)
+{
+ unregister_pernet_subsys(&diag_net_ops);
+}
+
+module_init(sock_diag_init);
+module_exit(sock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
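For reference, a protocol family plugs into this core by filling a sock_diag_handler and registering it. The skeleton below is a hedged sketch; the names, the AF_INET choice and the empty dump callback are placeholders, not part of this file:

/* hypothetical family module built on the sock_diag core above */
#include <linux/module.h>
#include <linux/sock_diag.h>
#include <net/sock.h>

static int example_diag_dump(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* a real handler walks its protocol's socket tables here */
	return -EOPNOTSUPP;
}

static const struct sock_diag_handler example_diag_handler = {
	.family	= AF_INET,		/* placeholder family */
	.dump	= example_diag_dump,
};

static int __init example_diag_init(void)
{
	return sock_diag_register(&example_diag_handler);
}

static void __exit example_diag_exit(void)
{
	sock_diag_unregister(&example_diag_handler);
}

module_init(example_diag_init);
module_exit(example_diag_exit);
MODULE_LICENSE("GPL");
/* matches the request_module() pattern used by __sock_diag_rcv_msg() */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);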
diff --git a/net/core/stream.c b/net/core/stream.c
index f5df85dcd20..301c05f2606 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk)
struct socket *sock = sk->sk_socket;
struct socket_wq *wq;
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
+ if (sk_stream_is_writeable(sk) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock();
@@ -122,7 +122,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
DEFINE_WAIT(wait);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
while (1) {
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 385b6095fdc..cf9cd13509a 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -14,17 +14,25 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/kmemleak.h>
#include <net/ip.h>
#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/pkt_sched.h>
+
+static int zero = 0;
+static int one = 1;
+static int ushort_max = USHRT_MAX;
#ifdef CONFIG_RPS
-static int rps_sock_flow_sysctl(ctl_table *table, int write,
+static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
int ret, i;
- ctl_table tmp = {
+ struct ctl_table tmp = {
.data = &size,
.maxlen = sizeof(size),
.mode = table->mode
@@ -67,8 +75,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- synchronize_rcu();
- vfree(orig_sock_table);
+ if (sock_table)
+ static_key_slow_inc(&rps_needed);
+ if (orig_sock_table) {
+ static_key_slow_dec(&rps_needed);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
}
}
@@ -78,6 +91,130 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
}
#endif /* CONFIG_RPS */
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sd_flow_limit *cur;
+ struct softnet_data *sd;
+ cpumask_var_t mask;
+ int i, len, ret = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (write) {
+ ret = cpumask_parse_user(buffer, *lenp, mask);
+ if (ret)
+ goto done;
+
+ mutex_lock(&flow_limit_update_mutex);
+ len = sizeof(*cur) + netdev_flow_limit_table_len;
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ cur = rcu_dereference_protected(sd->flow_limit,
+ lockdep_is_held(&flow_limit_update_mutex));
+ if (cur && !cpumask_test_cpu(i, mask)) {
+ RCU_INIT_POINTER(sd->flow_limit, NULL);
+ synchronize_rcu();
+ kfree(cur);
+ } else if (!cur && cpumask_test_cpu(i, mask)) {
+ cur = kzalloc_node(len, GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cur) {
+ /* not unwinding previous changes */
+ ret = -ENOMEM;
+ goto write_unlock;
+ }
+ cur->num_buckets = netdev_flow_limit_table_len;
+ rcu_assign_pointer(sd->flow_limit, cur);
+ }
+ }
+write_unlock:
+ mutex_unlock(&flow_limit_update_mutex);
+ } else {
+ char kbuf[128];
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ goto done;
+ }
+
+ cpumask_clear(mask);
+ rcu_read_lock();
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ if (rcu_dereference(sd->flow_limit))
+ cpumask_set_cpu(i, mask);
+ }
+ rcu_read_unlock();
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = cpumask_scnprintf(kbuf, len, mask);
+ if (!len) {
+ *lenp = 0;
+ goto done;
+ }
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ if (copy_to_user(buffer, kbuf, len)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+
+done:
+ free_cpumask_var(mask);
+ return ret;
+}
+
+static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old, *ptr;
+ int ret;
+
+ mutex_lock(&flow_limit_update_mutex);
+
+ ptr = table->data;
+ old = *ptr;
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write && !is_power_of_2(*ptr)) {
+ *ptr = old;
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&flow_limit_update_mutex);
+ return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char id[IFNAMSIZ];
+ struct ctl_table tbl = {
+ .data = id,
+ .maxlen = IFNAMSIZ,
+ };
+ int ret;
+
+ qdisc_get_default(id, IFNAMSIZ);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = qdisc_set_default(id);
+ return ret;
+}
+#endif
+
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
@@ -85,28 +222,32 @@ static struct ctl_table net_core_table[] = {
.data = &sysctl_wmem_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "rmem_max",
.data = &sysctl_rmem_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "wmem_default",
.data = &sysctl_wmem_default,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "rmem_default",
.data = &sysctl_rmem_default,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "dev_weight",
@@ -122,6 +263,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_BPF_JIT
+ {
+ .procname = "bpf_jit_enable",
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
{
.procname = "netdev_tstamp_prequeue",
.data = &netdev_tstamp_prequeue,
@@ -158,6 +308,44 @@ static struct ctl_table net_core_table[] = {
.proc_handler = rps_sock_flow_sysctl
},
#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+ {
+ .procname = "flow_limit_cpu_bitmap",
+ .mode = 0644,
+ .proc_handler = flow_limit_cpu_sysctl
+ },
+ {
+ .procname = "flow_limit_table_len",
+ .data = &netdev_flow_limit_table_len,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = flow_limit_table_len_sysctl
+ },
+#endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ {
+ .procname = "busy_poll",
+ .data = &sysctl_net_busy_poll,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "busy_read",
+ .data = &sysctl_net_busy_read,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+#ifdef CONFIG_NET_SCHED
+ {
+ .procname = "default_qdisc",
+ .mode = 0644,
+ .maxlen = IFNAMSIZ,
+ .proc_handler = set_default_qdisc
+ },
+#endif
#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
@@ -182,17 +370,13 @@ static struct ctl_table netns_core_table[] = {
.data = &init_net.core.sysctl_somaxconn,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .extra1 = &zero,
+ .extra2 = &ushort_max,
+ .proc_handler = proc_dointvec_minmax
},
{ }
};
-__net_initdata struct ctl_path net_core_path[] = {
- { .procname = "net", },
- { .procname = "core", },
- { },
-};
-
static __net_init int sysctl_core_net_init(struct net *net)
{
struct ctl_table *tbl;
@@ -206,10 +390,14 @@ static __net_init int sysctl_core_net_init(struct net *net)
goto err_dup;
tbl[0].data = &net->core.sysctl_somaxconn;
+
+ /* Don't export any sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ tbl[0].procname = NULL;
+ }
}
- net->core.sysctl_hdr = register_net_sysctl_table(net,
- net_core_path, tbl);
+ net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
if (net->core.sysctl_hdr == NULL)
goto err_reg;
@@ -239,10 +427,7 @@ static __net_initdata struct pernet_operations sysctl_core_ops = {
static __init int sysctl_core_init(void)
{
- static struct ctl_table empty[1];
-
- register_sysctl_paths(net_core_path, empty);
- register_net_sysctl_rotable(net_core_path, net_core_table);
+ register_net_sysctl(&init_net, "net/core", net_core_table);
return register_pernet_subsys(&sysctl_core_ops);
}
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index b124d28ff1c..6521dfd8b7c 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -21,17 +21,13 @@
#include <linux/phy.h>
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
-
-static struct sock_filter ptp_filter[] = {
- PTP_FILTER
-};
+#include <linux/export.h>
static unsigned int classify(const struct sk_buff *skb)
{
- if (likely(skb->dev &&
- skb->dev->phydev &&
+ if (likely(skb->dev && skb->dev->phydev &&
skb->dev->phydev->drv))
- return sk_run_filter(skb, ptp_filter);
+ return ptp_classify_raw(skb);
else
return PTP_CLASS_NONE;
}
@@ -57,9 +53,15 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
case PTP_CLASS_V2_VLAN:
phydev = skb->dev->phydev;
if (likely(phydev->drv->txtstamp)) {
+ if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ return;
+
clone = skb_clone(skb, GFP_ATOMIC);
- if (!clone)
+ if (!clone) {
+ sock_put(sk);
return;
+ }
+
clone->sk = sk;
phydev->drv->txtstamp(phydev, clone, type);
}
@@ -68,6 +70,7 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
break;
}
}
+EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
void skb_complete_tx_timestamp(struct sk_buff *skb,
struct skb_shared_hwtstamps *hwtstamps)
@@ -76,16 +79,23 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
struct sock_exterr_skb *serr;
int err;
- if (!hwtstamps)
+ if (!hwtstamps) {
+ sock_put(sk);
+ kfree_skb(skb);
return;
+ }
*skb_hwtstamps(skb) = *hwtstamps;
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
skb->sk = NULL;
+
err = sock_queue_err_skb(sk, skb);
+
+ sock_put(sk);
if (err)
kfree_skb(skb);
}
@@ -96,11 +106,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
struct phy_device *phydev;
unsigned int type;
- skb_push(skb, ETH_HLEN);
+ if (skb_headroom(skb) < ETH_HLEN)
+ return false;
+ __skb_push(skb, ETH_HLEN);
type = classify(skb);
- skb_pull(skb, ETH_HLEN);
+ __skb_pull(skb, ETH_HLEN);
switch (type) {
case PTP_CLASS_V1_IPV4:
@@ -119,8 +131,4 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
return false;
}
-
-void __init skb_timestamping_init(void)
-{
- BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));
-}
+EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 00000000000..8c3203c585b
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,77 @@
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/tso.h>
+
+/* Calculate expected number of TX descriptors */
+int tso_count_descs(struct sk_buff *skb)
+{
+ /* The Marvell Way */
+ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+EXPORT_SYMBOL(tso_count_descs);
+
+void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ iph = (struct iphdr *)(hdr + mac_hdr_len);
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
+ tcph->seq = htonl(tso->tcp_seq);
+ tso->ip_id++;
+
+ if (!is_last) {
+ /* Clear all special flags for all but the last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size;
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+void tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->next_frag_idx = 0;
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_start);
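The intended calling sequence for these helpers is a driver-side segmentation loop: tso_start() primes the state from the skb, tso_build_hdr() stamps a per-segment copy of the headers, and tso_build_data() walks through the linear data and frags. A hedged sketch of that loop follows; the descriptor/DMA handling a real driver needs, and the hdr_buf management, are assumed:

/* sketch only: assumes <net/tso.h> and <net/tcp.h>; hardware queueing elided */
static void example_tso_xmit(struct sk_buff *skb, char *hdr_buf)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int remaining = skb->len - hdr_len;
	struct tso_t tso;

	tso_start(skb, &tso);

	while (remaining > 0) {
		int seg_len = min_t(int, skb_shinfo(skb)->gso_size, remaining);
		bool last = (seg_len == remaining);

		remaining -= seg_len;

		/* one descriptor for the rebuilt per-segment headers ... */
		tso_build_hdr(skb, hdr_buf, &tso, seg_len, last);
		/* queue hdr_buf (hdr_len bytes) to the hardware here */

		/* ... plus one or more for the payload pieces */
		while (seg_len > 0) {
			int piece = min_t(int, seg_len, tso.size);

			/* queue tso.data (piece bytes) to the hardware here */
			tso_build_data(skb, &tso, piece);
			seg_len -= piece;
		}
	}
}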
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 25d717ebc92..1b5fefdb819 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -27,6 +27,7 @@
#include <linux/dmaengine.h>
#include <linux/socket.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <net/netdma.h>
@@ -71,14 +72,14 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
copy = end - offset;
if (copy > 0) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
diff --git a/net/core/utils.c b/net/core/utils.c
index 5fea0ab2190..eed34338736 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/mm.h>
#include <linux/net.h>
@@ -27,9 +28,9 @@
#include <linux/ratelimit.h>
#include <net/sock.h>
+#include <net/net_ratelimit.h>
#include <asm/byteorder.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
int net_msg_warn __read_mostly = 1;
@@ -58,14 +59,11 @@ __be32 in_aton(const char *str)
int i;
l = 0;
- for (i = 0; i < 4; i++)
- {
+ for (i = 0; i < 4; i++) {
l <<= 8;
- if (*str != '\0')
- {
+ if (*str != '\0') {
val = 0;
- while (*str != '\0' && *str != '.' && *str != '\n')
- {
+ while (*str != '\0' && *str != '.' && *str != '\n') {
val *= 10;
val += *str - '0';
str++;
@@ -110,6 +108,18 @@ static inline int xdigit2bin(char c, int delim)
return IN6PTON_UNKNOWN;
}
+/**
+ * in4_pton - convert an IPv4 address from literal to binary representation
+ * @src: the start of the IPv4 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[4] array) representation of the IPv4 address
+ * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return 1 on success, or 0 when any error occurs;
+ * @end will point to the end of the parsed string.
+ *
+ */
int in4_pton(const char *src, int srclen,
u8 *dst,
int delim, const char **end)
@@ -164,6 +174,18 @@ out:
}
EXPORT_SYMBOL(in4_pton);
+/**
+ * in6_pton - convert an IPv6 address from literal to binary representation
+ * @src: the start of the IPv6 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[16] array) representation of the IPv6 address
+ * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return 1 on success, or 0 when any error occurs;
+ * @end will point to the end of the parsed string.
+ *
+ */
int in6_pton(const char *src, int srclen,
u8 *dst,
int delim, const char **end)
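A short illustration of the contract documented above (literal parsing, optional delimiter, 1/0 return). This is a sketch; the addresses are arbitrary and, if I read the parser right, @end is left pointing at the delimiter on success:

static void example_pton_usage(void)
{
	u8 ip4[4], ip6[16];
	const char *end;

	/* stop at the ':' delimiter; returns 1 with end pointing at it */
	if (in4_pton("192.0.2.1:8080", -1, ip4, ':', &end))
		pr_debug("IPv4 parsed, next char '%c'\n", *end);

	/* delim == -1: the whole string must be a valid IPv6 literal */
	if (in6_pton("2001:db8::1", -1, ip6, -1, &end))
		pr_debug("IPv6 parsed\n");
}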
@@ -296,3 +318,72 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
csum_unfold(*sum)));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
+
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ int pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial(diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
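The 16-byte variant exists for IPv6 address rewrites: it folds the old/new address pair into an existing transport checksum instead of recomputing it from scratch. A hedged sketch of a typical call, in the style of a NAT-like translator (header parsing and validation assumed, names illustrative):

/* sketch only: rewrite the destination address of a TCP-over-IPv6 packet */
static void example_rewrite_daddr(struct sk_buff *skb, struct ipv6hdr *ip6h,
				  struct tcphdr *th,
				  const struct in6_addr *new_daddr)
{
	/* pseudohdr = 1: TCP checksums cover the IPv6 pseudo-header addresses */
	inet_proto_csum_replace16(&th->check, skb,
				  ip6h->daddr.s6_addr32,
				  new_daddr->s6_addr32, 1);
	ip6h->daddr = *new_daddr;
}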
+
+struct __net_random_once_work {
+ struct work_struct work;
+ struct static_key *key;
+};
+
+static void __net_random_once_deferred(struct work_struct *w)
+{
+ struct __net_random_once_work *work =
+ container_of(w, struct __net_random_once_work, work);
+ BUG_ON(!static_key_enabled(work->key));
+ static_key_slow_dec(work->key);
+ kfree(work);
+}
+
+static void __net_random_once_disable_jump(struct static_key *key)
+{
+ struct __net_random_once_work *w;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return;
+
+ INIT_WORK(&w->work, __net_random_once_deferred);
+ w->key = key;
+ schedule_work(&w->work);
+}
+
+bool __net_get_random_once(void *buf, int nbytes, bool *done,
+ struct static_key *once_key)
+{
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&lock, flags);
+ if (*done) {
+ spin_unlock_irqrestore(&lock, flags);
+ return false;
+ }
+
+ get_random_bytes(buf, nbytes);
+ *done = true;
+ spin_unlock_irqrestore(&lock, flags);
+
+ __net_random_once_disable_jump(once_key);
+
+ return true;
+}
+EXPORT_SYMBOL(__net_get_random_once);
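Callers are not meant to use __net_get_random_once() directly; it backs a once-per-boot lazy-seeding pattern in which the static key patches the check out of the fast path after the first call. A sketch of that pattern, assuming the net_get_random_once() wrapper macro from <linux/net.h> (the hash function and secret below are illustrative, not from this patch):

#include <linux/jhash.h>
#include <linux/net.h>

static u32 example_hash_secret __read_mostly;

static u32 example_hash(u32 key)
{
	/* fills the secret exactly once; later calls skip the slow path */
	net_get_random_once(&example_hash_secret, sizeof(example_hash_secret));

	return jhash_1word(key, example_hash_secret);
}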