aboutsummaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile20
-rw-r--r--net/core/datagram.c592
-rw-r--r--net/core/dev.c7164
-rw-r--r--net/core/dev_addr_lists.c851
-rw-r--r--net/core/dev_ioctl.c567
-rw-r--r--net/core/dev_mcast.c298
-rw-r--r--net/core/drop_monitor.c439
-rw-r--r--net/core/dst.c395
-rw-r--r--net/core/dv.c549
-rw-r--r--net/core/ethtool.c1705
-rw-r--r--net/core/fib_rules.c805
-rw-r--r--net/core/filter.c1842
-rw-r--r--net/core/flow.c574
-rw-r--r--net/core/flow_dissector.c405
-rw-r--r--net/core/gen_estimator.c219
-rw-r--r--net/core/gen_stats.c66
-rw-r--r--net/core/iovec.c141
-rw-r--r--net/core/link_watch.c271
-rw-r--r--net/core/neighbour.c2589
-rw-r--r--net/core/net-procfs.c423
-rw-r--r--net/core/net-sysfs.c1405
-rw-r--r--net/core/net-sysfs.h11
-rw-r--r--net/core/net-traces.c37
-rw-r--r--net/core/net_namespace.c676
-rw-r--r--net/core/netclassid_cgroup.c111
-rw-r--r--net/core/netevent.c70
-rw-r--r--net/core/netpoll.c1079
-rw-r--r--net/core/netprio_cgroup.c288
-rw-r--r--net/core/pktgen.c4685
-rw-r--r--net/core/ptp_classifier.c141
-rw-r--r--net/core/request_sock.c174
-rw-r--r--net/core/rtnetlink.c3065
-rw-r--r--net/core/scm.c114
-rw-r--r--net/core/secure_seq.c173
-rw-r--r--net/core/skbuff.c3304
-rw-r--r--net/core/sock.c2761
-rw-r--r--net/core/sock_diag.c231
-rw-r--r--net/core/stream.c137
-rw-r--r--net/core/sysctl_net_core.c398
-rw-r--r--net/core/timestamping.c134
-rw-r--r--net/core/tso.c77
-rw-r--r--net/core/user_dma.c131
-rw-r--r--net/core/utils.c459
-rw-r--r--net/core/wireless.c1542
44 files changed, 29880 insertions, 11238 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 630da0f0579..71093d94ad2 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,16 +3,24 @@
#
obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += dev.o ethtool.o dev_mcast.o dst.o \
- neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o dev_ioctl.o tso.o
obj-$(CONFIG_XFRM) += flow.o
-obj-$(CONFIG_SYSFS) += net-sysfs.o
-obj-$(CONFIG_NET_DIVERT) += dv.o
+obj-y += net-sysfs.o
+obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
-obj-$(CONFIG_NET_RADIO) += wireless.o
obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
+obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
+obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
+obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f8d322e1ea9..488dd1a825c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -9,7 +9,7 @@
* identical recvmsg() code. So we share it here. The poll was
* shared before but buried in udp.c so I moved it.
*
- * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
* udp.c code)
*
* Fixes:
@@ -37,7 +37,6 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
@@ -48,6 +47,8 @@
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -55,6 +56,8 @@
#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
/*
* Is a socket 'connection oriented' ?
@@ -64,22 +67,35 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
+static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ unsigned long bits = (unsigned long)key;
+
+ /*
+ * Avoid a wakeup if event not interesting for us
+ */
+ if (bits && !(bits & (POLLIN | POLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
/*
- * Wait for a packet..
+ * Wait for the last received packet to be different from skb
*/
-static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
+static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+ const struct sk_buff *skb)
{
int error;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, receiver_wake_function);
- prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Socket errors? */
error = sock_error(sk);
if (error)
goto out_err;
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (sk->sk_receive_queue.prev != skb)
goto out;
/* Socket shut down? */
@@ -101,7 +117,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
error = 0;
*timeo_p = schedule_timeout(*timeo_p);
out:
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return error;
interrupted:
error = sock_intr_errno(*timeo_p);
@@ -115,10 +131,12 @@ out_noerr:
}
/**
- * skb_recv_datagram - Receive a datagram skbuff
+ * __skb_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
- * @noblock: blocking operation?
+ * @peeked: returns non-zero if this packet has been seen before
+ * @off: an offset in bytes to peek skb from. Returns an offset
+ * within an skb where data actually starts
* @err: error code returned
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
@@ -143,10 +161,10 @@ out_noerr:
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
- int noblock, int *err)
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
@@ -156,37 +174,51 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
if (error)
goto no_packet;
- timeo = sock_rcvtimeo(sk, noblock);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
*
* Look at current nfs client by the way...
- * However, this function was corrent in any case. 8)
+ * However, this function was correct in any case. 8)
*/
- if (flags & MSG_PEEK) {
- unsigned long cpu_flags;
-
- spin_lock_irqsave(&sk->sk_receive_queue.lock,
- cpu_flags);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb)
+ unsigned long cpu_flags;
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
+ int _off = *off;
+
+ last = (struct sk_buff *)queue;
+ spin_lock_irqsave(&queue->lock, cpu_flags);
+ skb_queue_walk(queue, skb) {
+ last = skb;
+ *peeked = skb->peeked;
+ if (flags & MSG_PEEK) {
+ if (_off >= skb->len && (skb->len || _off ||
+ skb->peeked)) {
+ _off -= skb->len;
+ continue;
+ }
+ skb->peeked = 1;
atomic_inc(&skb->users);
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
- cpu_flags);
- } else
- skb = skb_dequeue(&sk->sk_receive_queue);
+ } else
+ __skb_unlink(skb, queue);
- if (skb)
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ *off = _off;
return skb;
+ }
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+
+ if (sk_can_busy_loop(sk) &&
+ sk_busy_loop(sk, flags & MSG_DONTWAIT))
+ continue;
/* User doesn't want to wait */
error = -EAGAIN;
if (!timeo)
goto no_packet;
- } while (!wait_for_packet(sk, err, &timeo));
+ } while (!wait_for_more_packets(sk, err, &timeo, last));
return NULL;
@@ -194,11 +226,43 @@ no_packet:
*err = error;
return NULL;
}
+EXPORT_SYMBOL(__skb_recv_datagram);
+
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int noblock, int *err)
+{
+ int peeked, off = 0;
+
+ return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ &peeked, &off, err);
+}
+EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
- kfree_skb(skb);
+ consume_skb(skb);
+ sk_mem_reclaim_partial(sk);
}
+EXPORT_SYMBOL(skb_free_datagram);
+
+void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+{
+ bool slow;
+
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+
+ slow = lock_sock_fast(sk);
+ skb_orphan(skb);
+ sk_mem_reclaim_partial(sk);
+ unlock_sock_fast(sk, slow);
+
+ /* skb is now orphaned, can be freed outside of locked section */
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(skb_free_datagram_locked);
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
@@ -217,22 +281,31 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
* This function currently only disables BH when acquiring the
* sk_receive_queue lock. Therefore it must not be used in a
* context where that lock is acquired in an IRQ context.
+ *
+ * It returns 0 if the packet was removed by us.
*/
-void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
+ int err = 0;
+
if (flags & MSG_PEEK) {
+ err = -ENOENT;
spin_lock_bh(&sk->sk_receive_queue.lock);
if (skb == skb_peek(&sk->sk_receive_queue)) {
__skb_unlink(skb, &sk->sk_receive_queue);
atomic_dec(&skb->users);
+ err = 0;
}
spin_unlock_bh(&sk->sk_receive_queue.lock);
}
kfree_skb(skb);
-}
+ atomic_inc(&sk->sk_drops);
+ sk_mem_reclaim_partial(sk);
+ return err;
+}
EXPORT_SYMBOL(skb_kill_datagram);
/**
@@ -247,60 +320,339 @@ EXPORT_SYMBOL(skb_kill_datagram);
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
struct iovec *to, int len)
{
- int i, err, fraglen, end = 0;
- struct sk_buff *next = skb_shinfo(skb)->frag_list;
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ trace_skb_copy_datagram_iovec(skb, len);
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_toiovec(to, skb->data + offset, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ int err;
+ u8 *vaddr;
+ struct page *page = skb_frag_page(frag);
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ err = memcpy_toiovec(to, vaddr + frag->page_offset +
+ offset - start, copy);
+ kunmap(page);
+ if (err)
+ goto fault;
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_iovec(frag_iter,
+ offset - start,
+ to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
if (!len)
return 0;
-next_skb:
- fraglen = skb_headlen(skb);
- i = -1;
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iovec);
- while (1) {
- int start = end;
+/**
+ * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @to_offset: offset in the io vector to start copying to
+ * @len: amount of data to copy from buffer to iovec
+ *
+ * Returns 0 or -EFAULT.
+ * Note: the iovec is not modified during the copy.
+ */
+int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
+ const struct iovec *to, int to_offset,
+ int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
- if ((end += fraglen) > offset) {
- int copy = end - offset, o = offset - start;
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ int err;
+ u8 *vaddr;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
- if (i == -1)
- err = memcpy_toiovec(to, skb->data + o, copy);
- else {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
- void *p = kmap(page) + frag->page_offset + o;
- err = memcpy_toiovec(to, p, copy);
- kunmap(page);
- }
+ vaddr = kmap(page);
+ err = memcpy_toiovecend(to, vaddr + frag->page_offset +
+ offset - start, to_offset, copy);
+ kunmap(page);
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
+ to_offset += copy;
}
- if (++i >= skb_shinfo(skb)->nr_frags)
- break;
- fraglen = skb_shinfo(skb)->frags[i].size;
+ start = end;
}
- if (next) {
- skb = next;
- BUG_ON(skb_shinfo(skb)->frag_list);
- next = skb->next;
- goto next_skb;
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_const_iovec(frag_iter,
+ offset - start,
+ to, to_offset,
+ copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+ start = end;
}
+ if (!len)
+ return 0;
+
fault:
return -EFAULT;
}
+EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
+
+/**
+ * skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying to
+ * @from: io vector to copy to
+ * @from_offset: offset in the io vector to start copying from
+ * @len: amount of data to copy to buffer from iovec
+ *
+ * Returns 0 or -EFAULT.
+ * Note: the iovec is not modified during the copy.
+ */
+int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
+ const struct iovec *from, int from_offset,
+ int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
+ copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from_offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ int err;
+ u8 *vaddr;
+ struct page *page = skb_frag_page(frag);
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ err = memcpy_fromiovecend(vaddr + frag->page_offset +
+ offset - start,
+ from, from_offset, copy);
+ kunmap(page);
+ if (err)
+ goto fault;
+
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ from_offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_from_iovec(frag_iter,
+ offset - start,
+ from,
+ from_offset,
+ copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from_offset += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+
+/**
+ * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
+ * @skb: buffer to copy
+ * @from: io vector to copy from
+ * @offset: offset in the io vector to start copying from
+ * @count: amount of vectors to copy to buffer from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ * Note: the iovec is not modified during the copy
+ */
+int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+ int offset, size_t count)
+{
+ int len = iov_length(from, count) - offset;
+ int copy = min_t(int, skb_headlen(skb), len);
+ int size;
+ int i = 0;
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
+ return -EFAULT;
+
+ if (len == copy)
+ return 0;
+
+ offset += copy;
+ while (count--) {
+ struct page *page[MAX_SKB_FRAGS];
+ int num_pages;
+ unsigned long base;
+ unsigned long truesize;
+
+ /* Skip over from offset and copied */
+ if (offset >= from->iov_len) {
+ offset -= from->iov_len;
+ ++from;
+ continue;
+ }
+ len = from->iov_len - offset;
+ base = (unsigned long)from->iov_base + offset;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (i + size > MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+ num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+ if (num_pages != size) {
+ release_pages(&page[i], num_pages, 0);
+ return -EFAULT;
+ }
+ truesize = size * PAGE_SIZE;
+ skb->data_len += len;
+ skb->len += len;
+ skb->truesize += truesize;
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+ while (len) {
+ int off = base & ~PAGE_MASK;
+ int size = min_t(int, len, PAGE_SIZE - off);
+ skb_fill_page_desc(skb, i, page[i], off, size);
+ base += size;
+ len -= size;
+ i++;
+ }
+ offset = 0;
+ ++from;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iovec);
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
u8 __user *to, int len,
- unsigned int *csump)
+ __wsum *csump)
{
int start = skb_headlen(skb);
- int pos = 0;
int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
/* Copy header. */
if (copy > 0) {
@@ -320,16 +672,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- unsigned int csum2;
+ __wsum csum2;
int err = 0;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -351,33 +703,29 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list=list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- unsigned int csum2 = 0;
- if (copy > len)
- copy = len;
- if (skb_copy_and_csum_datagram(list,
- offset - start,
- to, copy,
- &csum2))
- goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to += copy;
- pos += copy;
- }
- start = end;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2 = 0;
+ if (copy > len)
+ copy = len;
+ if (skb_copy_and_csum_datagram(frag_iter,
+ offset - start,
+ to, copy,
+ &csum2))
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ pos += copy;
}
+ start = end;
}
if (!len)
return 0;
@@ -386,16 +734,42 @@ fault:
return -EFAULT;
}
-unsigned int __skb_checksum_complete(struct sk_buff *skb)
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
- unsigned int sum;
+ __sum16 sum;
- sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
+ skb->csum_valid = !sum;
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
@@ -405,7 +779,7 @@ EXPORT_SYMBOL(__skb_checksum_complete);
* @skb: skbuff
* @hlen: hardware length
* @iov: io vector
- *
+ *
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
@@ -416,9 +790,12 @@ EXPORT_SYMBOL(__skb_checksum_complete);
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
int hlen, struct iovec *iov)
{
- unsigned int csum;
+ __wsum csum;
int chunk = skb->len - hlen;
+ if (!chunk)
+ return 0;
+
/* Skip filled elements.
* Pretty silly, look at memcpy_toiovec, though 8)
*/
@@ -435,9 +812,9 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
chunk, &csum))
goto fault;
- if ((unsigned short)csum_fold(csum))
+ if (csum_fold(csum))
goto csum_error;
- if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
netdev_rx_csum_fault(skb->dev);
iov->iov_len -= chunk;
iov->iov_base += chunk;
@@ -448,6 +825,7 @@ csum_error:
fault:
return -EFAULT;
}
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
/**
* datagram_poll - generic datagram poll
@@ -469,18 +847,21 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
unsigned int mask;
- poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sk_sleep(sk), wait);
mask = 0;
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
/* Connection-based need to check for termination and startup */
@@ -500,9 +881,4 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
return mask;
}
-
EXPORT_SYMBOL(datagram_poll);
-EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
-EXPORT_SYMBOL(skb_copy_datagram_iovec);
-EXPORT_SYMBOL(skb_free_datagram);
-EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index fd070a098f2..367a586d0c8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -73,14 +73,15 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
-#include <linux/config.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -90,72 +91,75 @@
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
#include <linux/stat.h>
-#include <linux/if_bridge.h>
-#include <linux/divert.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
+#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
-#include <linux/kmod.h>
#include <linux/module.h>
-#include <linux/kallsyms.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
-#ifdef CONFIG_NET_RADIO
-#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
#include <net/iw_handler.h>
-#endif /* CONFIG_NET_RADIO */
#include <asm/current.h>
-
-/*
- * The list of packet types we will receive (as opposed to discard)
- * and the routines to invoke.
- *
- * Why 16. Because with 16 the only overlap we get on a hash of the
- * low nibble of the protocol value is RARP/SNAP/X.25.
- *
- * NOTE: That is no longer true with the addition of VLAN tags. Not
- * sure which should go first, but I bet it won't make much
- * difference if we are running VLANs. The good news is that
- * this protocol won't be in the list unless compiled in, so
- * the average user (w/out VLANs) will not be adversly affected.
- * --BLG
- *
- * 0800 IP
- * 8100 802.1Q VLAN
- * 0001 802.3
- * 0002 AX.25
- * 0004 802.2
- * 8035 RARP
- * 0005 SNAP
- * 0805 X.25
- * 0806 ARP
- * 8137 IPX
- * 0009 Localtalk
- * 86DD IPv6
- */
+#include <linux/audit.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
+#include <linux/pci.h>
+#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
+#include <linux/static_key.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/if_macvlan.h>
+
+#include "net-sysfs.h"
+
+/* Instead of increasing this, you should create a hash table. */
+#define MAX_GRO_SKBS 8
+
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(ptype_lock);
-static struct list_head ptype_base[16]; /* 16 way hashed list */
-static struct list_head ptype_all; /* Taps */
+static DEFINE_SPINLOCK(offload_lock);
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+struct list_head ptype_all __read_mostly; /* Taps */
+static struct list_head offload_base __read_mostly;
+
+static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info);
/*
- * The @dev_base list is protected by @dev_base_lock and the rtln
+ * The @dev_base_head list is protected by @dev_base_lock and the rtnl
* semaphore.
*
- * Pure readers hold dev_base_lock for reading.
+ * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
*
* Writers must hold the rtnl semaphore while they loop through the
- * dev_base list, and hold dev_base_lock for writing when they do the
+ * dev_base_head list, and hold dev_base_lock for writing when they do the
* actual updates. This allows pure readers to access the list even
* while a writer is preparing to update it.
*
@@ -167,51 +171,178 @@ static struct list_head ptype_all; /* Taps */
* unregister_netdevice(), which must be called with the rtnl
* semaphore held.
*/
-struct net_device *dev_base;
-static struct net_device **dev_tail = &dev_base;
DEFINE_RWLOCK(dev_base_lock);
-
-EXPORT_SYMBOL(dev_base);
EXPORT_SYMBOL(dev_base_lock);
-#define NETDEV_HASHBITS 8
-static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
-static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
-static inline struct hlist_head *dev_name_hash(const char *name)
+static unsigned int napi_gen_id;
+static DEFINE_HASHTABLE(napi_hash, 8);
+
+static seqcount_t devnet_rename_seq;
+
+static inline void dev_base_seq_inc(struct net *net)
{
- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
- return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
+ while (++net->dev_base_seq == 0);
}
-static inline struct hlist_head *dev_index_hash(int ifindex)
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
- return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
+ unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+
+ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+}
+
+static inline void rps_lock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ spin_lock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+static inline void rps_unlock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ spin_unlock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+/* Device list insertion */
+static void list_netdevice(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ ASSERT_RTNL();
+
+ write_lock_bh(&dev_base_lock);
+ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
+ hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_index_hash(net, dev->ifindex));
+ write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(net);
+}
+
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
+static void unlist_netdevice(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ /* Unlink dev from the device chain */
+ write_lock_bh(&dev_base_lock);
+ list_del_rcu(&dev->dev_list);
+ hlist_del_rcu(&dev->name_hlist);
+ hlist_del_rcu(&dev->index_hlist);
+ write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(dev_net(dev));
}
/*
* Our notifier list
*/
-static struct notifier_block *netdev_chain;
+static RAW_NOTIFIER_HEAD(netdev_chain);
/*
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
-#ifdef CONFIG_SYSFS
-extern int netdev_sysfs_init(void);
-extern int netdev_register_sysfs(struct net_device *);
-extern void netdev_unregister_sysfs(struct net_device *);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+EXPORT_PER_CPU_SYMBOL(softnet_data);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] =
+ {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] =
+ {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+ if (netdev_lock_type[i] == dev_type)
+ return i;
+ /* the last key is used by default */
+ return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+ int i;
+
+ i = netdev_lock_pos(dev_type);
+ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+ netdev_lock_name[i]);
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+ int i;
+
+ i = netdev_lock_pos(dev->type);
+ lockdep_set_class_and_name(&dev->addr_list_lock,
+ &netdev_addr_lock_key[i],
+ netdev_lock_name[i]);
+}
#else
-#define netdev_sysfs_init() (0)
-#define netdev_register_sysfs(dev) (0)
-#define netdev_unregister_sysfs(dev) do { } while(0)
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
#endif
-
/*******************************************************************************
Protocol management and registration routines
@@ -219,12 +350,6 @@ extern void netdev_unregister_sysfs(struct net_device *);
*******************************************************************************/
/*
- * For efficiency
- */
-
-int netdev_nit;
-
-/*
* Add a protocol ID to the list. Now that the input handler is
* smarter we can dispense with all the messy stuff that used to be
* here.
@@ -240,6 +365,14 @@ int netdev_nit;
* --ANK (980803)
*/
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+ if (pt->type == htons(ETH_P_ALL))
+ return &ptype_all;
+ else
+ return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
/**
* dev_add_pack - add packet handler
* @pt: packet type declaration
@@ -248,25 +381,20 @@ int netdev_nit;
* is linked into kernel lists and may not be freed until it has been
* removed from the kernel lists.
*
- * This call does not sleep therefore it can not
+ * This call does not sleep therefore it can not
* guarantee all CPU's that are in middle of receiving packets
* will see the new packet type (until the next received packet).
*/
void dev_add_pack(struct packet_type *pt)
{
- int hash;
+ struct list_head *head = ptype_head(pt);
- spin_lock_bh(&ptype_lock);
- if (pt->type == htons(ETH_P_ALL)) {
- netdev_nit++;
- list_add_rcu(&pt->list, &ptype_all);
- } else {
- hash = ntohs(pt->type) & 15;
- list_add_rcu(&pt->list, &ptype_base[hash]);
- }
- spin_unlock_bh(&ptype_lock);
+ spin_lock(&ptype_lock);
+ list_add_rcu(&pt->list, head);
+ spin_unlock(&ptype_lock);
}
+EXPORT_SYMBOL(dev_add_pack);
/**
* __dev_remove_pack - remove packet handler
@@ -275,7 +403,7 @@ void dev_add_pack(struct packet_type *pt)
* Remove a protocol handler that was previously added to the kernel
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
* from the kernel lists and can be freed or reused once this function
- * returns.
+ * returns.
*
* The packet type might still be in use by receivers
* and must not be freed until after all the CPU's have gone
@@ -283,16 +411,10 @@ void dev_add_pack(struct packet_type *pt)
*/
void __dev_remove_pack(struct packet_type *pt)
{
- struct list_head *head;
+ struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
- spin_lock_bh(&ptype_lock);
-
- if (pt->type == htons(ETH_P_ALL)) {
- netdev_nit--;
- head = &ptype_all;
- } else
- head = &ptype_base[ntohs(pt->type) & 15];
+ spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
if (pt == pt1) {
@@ -301,10 +423,12 @@ void __dev_remove_pack(struct packet_type *pt)
}
}
- printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
+ pr_warn("dev_remove_pack: %p not found\n", pt);
out:
- spin_unlock_bh(&ptype_lock);
+ spin_unlock(&ptype_lock);
}
+EXPORT_SYMBOL(__dev_remove_pack);
+
/**
* dev_remove_pack - remove packet handler
* @pt: packet type declaration
@@ -320,9 +444,85 @@ out:
void dev_remove_pack(struct packet_type *pt)
{
__dev_remove_pack(pt);
-
+
synchronize_net();
}
+EXPORT_SYMBOL(dev_remove_pack);
+
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+
+ spin_lock(&offload_lock);
+ list_add_rcu(&po->list, head);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************
@@ -351,7 +551,7 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map)
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
memset(s[i].name, 0, sizeof(s[i].name));
- strcpy(s[i].name, name);
+ strlcpy(s[i].name, name, IFNAMSIZ);
memcpy(&s[i].map, map, sizeof(s[i].map));
break;
}
@@ -376,7 +576,7 @@ int netdev_boot_setup_check(struct net_device *dev)
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
- !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
+ !strcmp(dev->name, s[i].name)) {
dev->irq = s[i].map.irq;
dev->base_addr = s[i].map.base_addr;
dev->mem_start = s[i].map.mem_start;
@@ -386,6 +586,7 @@ int netdev_boot_setup_check(struct net_device *dev)
}
return 0;
}
+EXPORT_SYMBOL(netdev_boot_setup_check);
/**
@@ -410,7 +611,7 @@ unsigned long netdev_boot_base(const char *prefix, int unit)
* If device already registered then return base of 1
* to indicate not to probe for this interface
*/
- if (__dev_get_by_name(name))
+ if (__dev_get_by_name(&init_net, name))
return 1;
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
@@ -456,6 +657,7 @@ __setup("netdev=", netdev_boot_setup);
/**
* __dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name. Must be called under RTNL semaphore
@@ -465,21 +667,47 @@ __setup("netdev=", netdev_boot_setup);
* careful with locks.
*/
-struct net_device *__dev_get_by_name(const char *name)
+struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
- struct hlist_node *p;
+ struct net_device *dev;
+ struct hlist_head *head = dev_name_hash(net, name);
- hlist_for_each(p, dev_name_hash(name)) {
- struct net_device *dev
- = hlist_entry(p, struct net_device, name_hlist);
+ hlist_for_each_entry(dev, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
- }
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_name);
+
+/**
+ * dev_get_by_name_rcu - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_name_hash(net, name);
+
+ hlist_for_each_entry_rcu(dev, head, name_hlist)
+ if (!strncmp(dev->name, name, IFNAMSIZ))
+ return dev;
+
return NULL;
}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
* dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name. This can be called from any
@@ -489,20 +717,22 @@ struct net_device *__dev_get_by_name(const char *name)
* matching device is found.
*/
-struct net_device *dev_get_by_name(const char *name)
+struct net_device *dev_get_by_name(struct net *net, const char *name)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_name(name);
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
if (dev)
dev_hold(dev);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return dev;
}
+EXPORT_SYMBOL(dev_get_by_name);
/**
* __dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns %NULL if the device
@@ -512,22 +742,47 @@ struct net_device *dev_get_by_name(const char *name)
* or @dev_base_lock.
*/
-struct net_device *__dev_get_by_index(int ifindex)
+struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
- struct hlist_node *p;
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
- hlist_for_each(p, dev_index_hash(ifindex)) {
- struct net_device *dev
- = hlist_entry(p, struct net_device, index_hlist);
+ hlist_for_each_entry(dev, head, index_hlist)
if (dev->ifindex == ifindex)
return dev;
- }
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_index);
+
+/**
+ * dev_get_by_index_rcu - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist)
+ if (dev->ifindex == ifindex)
+ return dev;
+
return NULL;
}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
/**
* dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns NULL if the device
@@ -536,124 +791,184 @@ struct net_device *__dev_get_by_index(int ifindex)
* dev_put to indicate they have finished with it.
*/
-struct net_device *dev_get_by_index(int ifindex)
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(ifindex);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
dev_hold(dev);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return dev;
}
+EXPORT_SYMBOL(dev_get_by_index);
/**
- * dev_getbyhwaddr - find a device by its hardware address
+ * netdev_get_name - get a netdevice name, knowing its ifindex.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @ifindex: the ifindex of the interface to get the name from.
+ *
+ * The use of raw_seqcount_begin() and cond_resched() before
+ * retrying is required as we want to give the writers a chance
+ * to complete when CONFIG_PREEMPT is not set.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+ struct net_device *dev;
+ unsigned int seq;
+
+retry:
+ seq = raw_seqcount_begin(&devnet_rename_seq);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ strcpy(name, dev->name);
+ rcu_read_unlock();
+ if (read_seqcount_retry(&devnet_rename_seq, seq)) {
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+/**
+ * dev_getbyhwaddr_rcu - find a device by its hardware address
+ * @net: the applicable net namespace
* @type: media type of device
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
- * is not found or a pointer to the device. The caller must hold the
- * rtnl semaphore. The returned device has not had its ref count increased
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
+ * The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
- * BUGS:
- * If the API was consistent this would be __dev_get_by_hwaddr
*/
-struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
+struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
+ const char *ha)
{
struct net_device *dev;
- ASSERT_RTNL();
-
- for (dev = dev_base; dev; dev = dev->next)
+ for_each_netdev_rcu(net, dev)
if (dev->type == type &&
!memcmp(dev->dev_addr, ha, dev->addr_len))
- break;
- return dev;
-}
+ return dev;
-EXPORT_SYMBOL(dev_getbyhwaddr);
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
-struct net_device *dev_getfirstbyhwtype(unsigned short type)
+struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev;
- rtnl_lock();
- for (dev = dev_base; dev; dev = dev->next) {
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev->type == type)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_getfirstbyhwtype);
+
+struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+ struct net_device *dev, *ret = NULL;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev)
if (dev->type == type) {
dev_hold(dev);
+ ret = dev;
break;
}
- }
- rtnl_unlock();
- return dev;
+ rcu_read_unlock();
+ return ret;
}
-
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
- * dev_get_by_flags - find any device with given flags
+ * dev_get_by_flags_rcu - find any device with given flags
+ * @net: the applicable net namespace
* @if_flags: IFF_* values
* @mask: bitmask of bits in if_flags to check
*
* Search for any interface with the given flags. Returns NULL if a device
- * is not found or a pointer to the device. The device returned has
- * had a reference added and the pointer is safe until the user calls
- * dev_put to indicate they have finished with it.
+ * is not found or a pointer to the device. Must be called inside
+ * rcu_read_lock(), and result refcount is unchanged.
*/
-struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
+ unsigned short mask)
{
- struct net_device *dev;
+ struct net_device *dev, *ret;
- read_lock(&dev_base_lock);
- for (dev = dev_base; dev != NULL; dev = dev->next) {
+ ret = NULL;
+ for_each_netdev_rcu(net, dev) {
if (((dev->flags ^ if_flags) & mask) == 0) {
- dev_hold(dev);
+ ret = dev;
break;
}
}
- read_unlock(&dev_base_lock);
- return dev;
+ return ret;
}
+EXPORT_SYMBOL(dev_get_by_flags_rcu);
/**
* dev_valid_name - check if name is okay for network device
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work
+ * to allow sysfs to work. We also disallow any kind of
+ * whitespace.
*/
-int dev_valid_name(const char *name)
+bool dev_valid_name(const char *name)
{
- return !(*name == '\0'
- || !strcmp(name, ".")
- || !strcmp(name, "..")
- || strchr(name, '/'));
+ if (*name == '\0')
+ return false;
+ if (strlen(name) >= IFNAMSIZ)
+ return false;
+ if (!strcmp(name, ".") || !strcmp(name, ".."))
+ return false;
+
+ while (*name) {
+ if (*name == '/' || isspace(*name))
+ return false;
+ name++;
+ }
+ return true;
}
+EXPORT_SYMBOL(dev_valid_name);
/**
- * dev_alloc_name - allocate a name for a device
- * @dev: device
+ * __dev_alloc_name - allocate a name for a device
+ * @net: network namespace to allocate the device name in
* @name: name format string
+ * @buf: scratch buffer and result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
- * id. Not efficient for many devices, not called a lot. The caller
- * must hold the dev_base or rtnl lock while allocating the name and
- * adding the device in order to avoid duplicates. Returns the number
- * of the unit assigned or a negative errno code.
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
*/
-int dev_alloc_name(struct net_device *dev, const char *name)
+static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
int i = 0;
- char buf[IFNAMSIZ];
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
- long *inuse;
+ unsigned long *inuse;
struct net_device *d;
p = strnchr(name, IFNAMSIZ-1, '%');
@@ -667,18 +982,18 @@ int dev_alloc_name(struct net_device *dev, const char *name)
return -EINVAL;
/* Use one page as a bit array of possible slots */
- inuse = (long *) get_zeroed_page(GFP_ATOMIC);
+ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
if (!inuse)
return -ENOMEM;
- for (d = dev_base; d; d = d->next) {
+ for_each_netdev(net, d) {
if (!sscanf(d->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
/* avoid cases where sscanf is not exact inverse of printf */
- snprintf(buf, sizeof(buf), name, i);
+ snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, d->name, IFNAMSIZ))
set_bit(i, inuse);
}
@@ -687,11 +1002,10 @@ int dev_alloc_name(struct net_device *dev, const char *name)
free_page((unsigned long) inuse);
}
- snprintf(buf, sizeof(buf), name, i);
- if (!__dev_get_by_name(buf)) {
- strlcpy(dev->name, buf, IFNAMSIZ);
+ if (buf != name)
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!__dev_get_by_name(net, buf))
return i;
- }
/* It is possible to run out of possible slots
* when the name is long and there isn't enough space left
@@ -700,6 +1014,66 @@ int dev_alloc_name(struct net_device *dev, const char *name)
return -ENFILE;
}
+/**
+ * dev_alloc_name - allocate a name for a device
+ * @dev: device
+ * @name: name format string
+ *
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+int dev_alloc_name(struct net_device *dev, const char *name)
+{
+ char buf[IFNAMSIZ];
+ struct net *net;
+ int ret;
+
+ BUG_ON(!dev_net(dev));
+ net = dev_net(dev);
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+EXPORT_SYMBOL(dev_alloc_name);
+
+static int dev_alloc_name_ns(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ char buf[IFNAMSIZ];
+ int ret;
+
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+
+static int dev_get_valid_name(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ BUG_ON(!net);
+
+ if (!dev_valid_name(name))
+ return -EINVAL;
+
+ if (strchr(name, '%'))
+ return dev_alloc_name_ns(net, dev, name);
+ else if (__dev_get_by_name(net, name))
+ return -EEXIST;
+ else if (dev->name != name)
+ strlcpy(dev->name, name, IFNAMSIZ);
+
+ return 0;
+}
/**
* dev_change_name - change name of a device
@@ -709,48 +1083,119 @@ int dev_alloc_name(struct net_device *dev, const char *name)
* Change name of a device, can pass format strings "eth%d".
* for wildcarding.
*/
-int dev_change_name(struct net_device *dev, char *newname)
+int dev_change_name(struct net_device *dev, const char *newname)
{
+ char oldname[IFNAMSIZ];
int err = 0;
+ int ret;
+ struct net *net;
ASSERT_RTNL();
+ BUG_ON(!dev_net(dev));
+ net = dev_net(dev);
if (dev->flags & IFF_UP)
return -EBUSY;
- if (!dev_valid_name(newname))
- return -EINVAL;
+ write_seqcount_begin(&devnet_rename_seq);
- if (strchr(newname, '%')) {
- err = dev_alloc_name(dev, newname);
- if (err < 0)
- return err;
- strcpy(newname, dev->name);
+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
+ write_seqcount_end(&devnet_rename_seq);
+ return 0;
}
- else if (__dev_get_by_name(newname))
- return -EEXIST;
- else
- strlcpy(dev->name, newname, IFNAMSIZ);
- err = class_device_rename(&dev->class_dev, dev->name);
- if (!err) {
- hlist_del(&dev->name_hlist);
- hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
- notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+ memcpy(oldname, dev->name, IFNAMSIZ);
+
+ err = dev_get_valid_name(net, dev, newname);
+ if (err < 0) {
+ write_seqcount_end(&devnet_rename_seq);
+ return err;
+ }
+
+rollback:
+ ret = device_rename(&dev->dev, dev->name);
+ if (ret) {
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ write_seqcount_end(&devnet_rename_seq);
+ return ret;
+ }
+
+ write_seqcount_end(&devnet_rename_seq);
+
+ netdev_adjacent_rename_links(dev, oldname);
+
+ write_lock_bh(&dev_base_lock);
+ hlist_del_rcu(&dev->name_hlist);
+ write_unlock_bh(&dev_base_lock);
+
+ synchronize_rcu();
+
+ write_lock_bh(&dev_base_lock);
+ hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ write_unlock_bh(&dev_base_lock);
+
+ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
+ ret = notifier_to_errno(ret);
+
+ if (ret) {
+ /* err >= 0 after dev_alloc_name() or stores the first errno */
+ if (err >= 0) {
+ err = ret;
+ write_seqcount_begin(&devnet_rename_seq);
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ memcpy(oldname, newname, IFNAMSIZ);
+ goto rollback;
+ } else {
+ pr_err("%s: name change rollback failed: %d\n",
+ dev->name, ret);
+ }
}
return err;
}
/**
- * netdev_features_change - device changes fatures
+ * dev_set_alias - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from info
+ *
+ * Set ifalias for a device,
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ char *new_ifalias;
+
+ ASSERT_RTNL();
+
+ if (len >= IFALIASZ)
+ return -EINVAL;
+
+ if (!len) {
+ kfree(dev->ifalias);
+ dev->ifalias = NULL;
+ return 0;
+ }
+
+ new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
+ if (!new_ifalias)
+ return -ENOMEM;
+ dev->ifalias = new_ifalias;
+
+ strlcpy(dev->ifalias, alias, len+1);
+ return len;
+}
+
+
+/**
+ * netdev_features_change - device changes features
* @dev: device to cause notification
*
* Called to indicate a device has changed features.
*/
void netdev_features_change(struct net_device *dev)
{
- notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
+ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
@@ -765,40 +1210,77 @@ EXPORT_SYMBOL(netdev_features_change);
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
- notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = 0;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
+EXPORT_SYMBOL(netdev_state_change);
/**
- * dev_load - load a network module
- * @name: name of interface
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
*
- * If a network interface is not present and the process has suitable
- * privileges this function loads the module. If module loading is not
- * available in this kernel then it becomes a nop.
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
*/
+void netdev_notify_peers(struct net_device *dev)
+{
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(netdev_notify_peers);
-void dev_load(const char *name)
+static int __dev_open(struct net_device *dev)
{
- struct net_device *dev;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_name(name);
- read_unlock(&dev_base_lock);
+ ASSERT_RTNL();
- if (!dev && capable(CAP_SYS_MODULE))
- request_module("%s", name);
-}
+ if (!netif_device_present(dev))
+ return -ENODEV;
-static int default_rebuild_header(struct sk_buff *skb)
-{
- printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
- skb->dev ? skb->dev->name : "NULL!!!");
- kfree_skb(skb);
- return 1;
-}
+ /* Block netpoll from trying to do any rx path servicing.
+ * If we don't do this there is a chance ndo_poll_controller
+ * or ndo_poll may be running while we open the device
+ */
+ netpoll_poll_disable(dev);
+ ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ if (ops->ndo_validate_addr)
+ ret = ops->ndo_validate_addr(dev);
+
+ if (!ret && ops->ndo_open)
+ ret = ops->ndo_open(dev);
+
+ netpoll_poll_enable(dev);
+
+ if (ret)
+ clear_bit(__LINK_STATE_START, &dev->state);
+ else {
+ dev->flags |= IFF_UP;
+ net_dmaengine_get();
+ dev_set_rx_mode(dev);
+ dev_activate(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ }
+
+ return ret;
+}
/**
* dev_open - prepare an interface for use.
@@ -814,57 +1296,99 @@ static int default_rebuild_header(struct sk_buff *skb)
*/
int dev_open(struct net_device *dev)
{
- int ret = 0;
-
- /*
- * Is it already up?
- */
+ int ret;
if (dev->flags & IFF_UP)
return 0;
- /*
- * Is it even present?
- */
- if (!netif_device_present(dev))
- return -ENODEV;
+ ret = __dev_open(dev);
+ if (ret < 0)
+ return ret;
- /*
- * Call device private open method
- */
- set_bit(__LINK_STATE_START, &dev->state);
- if (dev->open) {
- ret = dev->open(dev);
- if (ret)
- clear_bit(__LINK_STATE_START, &dev->state);
- }
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_UP, dev);
- /*
- * If it went open OK then:
- */
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
- if (!ret) {
- /*
- * Set the flags.
- */
- dev->flags |= IFF_UP;
+static int __dev_close_many(struct list_head *head)
+{
+ struct net_device *dev;
- /*
- * Initialize multicasting status
- */
- dev_mc_upload(dev);
+ ASSERT_RTNL();
+ might_sleep();
- /*
- * Wakeup transmit queue engine
+ list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+
+ clear_bit(__LINK_STATE_START, &dev->state);
+
+ /* Synchronize to scheduled poll. We cannot touch poll list, it
+ * can be even on different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of it's
+ * napi_struct instances on this device.
*/
- dev_activate(dev);
+ smp_mb__after_atomic(); /* Commit netif_running(). */
+ }
+
+ dev_deactivate_many(head);
+
+ list_for_each_entry(dev, head, close_list) {
+ const struct net_device_ops *ops = dev->netdev_ops;
/*
- * ... and announce new interface.
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
*/
- notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
+ if (ops->ndo_stop)
+ ops->ndo_stop(dev);
+
+ dev->flags &= ~IFF_UP;
+ net_dmaengine_put();
+ netpoll_poll_enable(dev);
}
- return ret;
+
+ return 0;
+}
+
+static int __dev_close(struct net_device *dev)
+{
+ int retval;
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ retval = __dev_close_many(&single);
+ list_del(&single);
+
+ return retval;
+}
+
+static int dev_close_many(struct list_head *head)
+{
+ struct net_device *dev, *tmp;
+
+ /* Remove the devices that don't need to be closed */
+ list_for_each_entry_safe(dev, tmp, head, close_list)
+ if (!(dev->flags & IFF_UP))
+ list_del_init(&dev->close_list);
+
+ __dev_close_many(head);
+
+ list_for_each_entry_safe(dev, tmp, head, close_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ list_del_init(&dev->close_list);
+ }
+
+ return 0;
}
/**
@@ -878,60 +1402,57 @@ int dev_open(struct net_device *dev)
*/
int dev_close(struct net_device *dev)
{
- if (!(dev->flags & IFF_UP))
- return 0;
-
- /*
- * Tell people we are going down, so that they can
- * prepare to death, when device is still operating.
- */
- notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
-
- dev_deactivate(dev);
-
- clear_bit(__LINK_STATE_START, &dev->state);
-
- /* Synchronize to scheduled poll. We cannot touch poll list,
- * it can be even on different cpu. So just clear netif_running(),
- * and wait when poll really will happen. Actually, the best place
- * for this is inside dev->stop() after device stopped its irq
- * engine, but this requires more changes in devices. */
+ if (dev->flags & IFF_UP) {
+ LIST_HEAD(single);
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
- while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
- /* No hurry. */
- msleep(1);
+ list_add(&dev->close_list, &single);
+ dev_close_many(&single);
+ list_del(&single);
}
+ return 0;
+}
+EXPORT_SYMBOL(dev_close);
- /*
- * Call the device specific close. This cannot fail.
- * Only if device is UP
- *
- * We allow it to be called even after a DETACH hot-plug
- * event.
- */
- if (dev->stop)
- dev->stop(dev);
+/**
+ * dev_disable_lro - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
/*
- * Device is now down.
+ * If we're trying to disable lro on a vlan device
+ * use the underlying physical device instead
*/
+ if (is_vlan_dev(dev))
+ dev = vlan_dev_real_dev(dev);
- dev->flags &= ~IFF_UP;
+ /* the same for macvlan devices */
+ if (netif_is_macvlan(dev))
+ dev = macvlan_dev_real_dev(dev);
- /*
- * Tell people we are down
- */
- notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
- return 0;
+ if (unlikely(dev->features & NETIF_F_LRO))
+ netdev_WARN(dev, "failed to disable LRO!\n");
}
+EXPORT_SYMBOL(dev_disable_lro);
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+ struct net_device *dev)
+{
+ struct netdev_notifier_info info;
-/*
- * Device change register/unregister. These are not inline or static
- * as we export them to the world.
- */
+ netdev_notifier_info_init(&info, dev);
+ return nb->notifier_call(nb, val, &info);
+}
+
+static int dev_boot_phase = 1;
/**
* register_netdevice_notifier - register a network notifier block
@@ -943,28 +1464,62 @@ int dev_close(struct net_device *dev)
* is returned on a failure.
*
* When registered all registration and up events are replayed
- * to the new notifier to allow device to have a race free
+ * to the new notifier to allow device to have a race free
* view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
{
struct net_device *dev;
+ struct net_device *last;
+ struct net *net;
int err;
rtnl_lock();
- err = notifier_chain_register(&netdev_chain, nb);
- if (!err) {
- for (dev = dev_base; dev; dev = dev->next) {
- nb->notifier_call(nb, NETDEV_REGISTER, dev);
+ err = raw_notifier_chain_register(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+ if (dev_boot_phase)
+ goto unlock;
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ goto rollback;
- if (dev->flags & IFF_UP)
- nb->notifier_call(nb, NETDEV_UP, dev);
+ if (!(dev->flags & IFF_UP))
+ continue;
+
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
}
}
+
+unlock:
rtnl_unlock();
return err;
+
+rollback:
+ last = dev;
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev == last)
+ goto outroll;
+
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+
+outroll:
+ raw_notifier_chain_unregister(&netdev_chain, nb);
+ goto unlock;
}
+EXPORT_SYMBOL(register_netdevice_notifier);
/**
* unregister_netdevice_notifier - unregister a network notifier block
@@ -974,57 +1529,213 @@ int register_netdevice_notifier(struct notifier_block *nb)
* register_netdevice_notifier(). The notifier is unlinked into the
* kernel structures and may then be reused. A negative errno code
* is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
{
- return notifier_chain_unregister(&netdev_chain, nb);
+ struct net_device *dev;
+ struct net *net;
+ int err;
+
+ rtnl_lock();
+ err = raw_notifier_chain_unregister(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+unlock:
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier);
+
+/**
+ * call_netdevice_notifiers_info - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info)
+{
+ ASSERT_RTNL();
+ netdev_notifier_info_init(info, dev);
+ return raw_notifier_call_chain(&netdev_chain, val, info);
}
/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
- * @v: pointer passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
*
* Call all network notifier blocks. Parameters and return value
- * are as for notifier_call_chain().
+ * are as for raw_notifier_call_chain().
*/
-int call_netdevice_notifiers(unsigned long val, void *v)
+int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- return notifier_call_chain(&netdev_chain, val, v);
+ struct netdev_notifier_info info;
+
+ return call_netdevice_notifiers_info(val, dev, &info);
}
+EXPORT_SYMBOL(call_netdevice_notifiers);
-/* When > 0 there are consumers of rx skb time stamps */
-static atomic_t netstamp_needed = ATOMIC_INIT(0);
+static struct static_key netstamp_needed __read_mostly;
+#ifdef HAVE_JUMP_LABEL
+/* We are not allowed to call static_key_slow_dec() from irq context
+ * If net_disable_timestamp() is called from irq context, defer the
+ * static_key_slow_dec() calls.
+ */
+static atomic_t netstamp_needed_deferred;
+#endif
void net_enable_timestamp(void)
{
- atomic_inc(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+
+ if (deferred) {
+ while (--deferred)
+ static_key_slow_dec(&netstamp_needed);
+ return;
+ }
+#endif
+ static_key_slow_inc(&netstamp_needed);
}
+EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
- atomic_dec(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_inc(&netstamp_needed_deferred);
+ return;
+ }
+#endif
+ static_key_slow_dec(&netstamp_needed);
}
+EXPORT_SYMBOL(net_disable_timestamp);
-void __net_timestamp(struct sk_buff *skb)
+static inline void net_timestamp_set(struct sk_buff *skb)
{
- struct timeval tv;
+ skb->tstamp.tv64 = 0;
+ if (static_key_false(&netstamp_needed))
+ __net_timestamp(skb);
+}
+
+#define net_timestamp_check(COND, SKB) \
+ if (static_key_false(&netstamp_needed)) { \
+ if ((COND) && !(SKB)->tstamp.tv64) \
+ __net_timestamp(SKB); \
+ } \
- do_gettimeofday(&tv);
- skb_set_timestamp(skb, &tv);
+bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
+{
+ unsigned int len;
+
+ if (!(dev->flags & IFF_UP))
+ return false;
+
+ len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+ if (skb->len <= len)
+ return true;
+
+ /* if TSO is enabled, we don't care about the length as the packet
+ * could be forwarded without being segmented before
+ */
+ if (skb_is_gso(skb))
+ return true;
+
+ return false;
}
-EXPORT_SYMBOL(__net_timestamp);
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
-static inline void net_timestamp(struct sk_buff *skb)
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (atomic_read(&netstamp_needed))
- __net_timestamp(skb);
- else {
- skb->tstamp.off_sec = 0;
- skb->tstamp.off_usec = 0;
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
}
+
+ if (unlikely(!is_skb_forwardable(dev, skb))) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_scrub_packet(skb, true);
+ skb->protocol = eth_type_trans(skb, dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
+
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped, but freed)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
+static inline int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ return -ENOMEM;
+ atomic_inc(&skb->users);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
+{
+ if (!ptype->af_packet_priv || !skb->sk)
+ return false;
+
+ if (ptype->id_match)
+ return ptype->id_match(ptype, skb->sk);
+ else if ((struct sock *)ptype->af_packet_priv == skb->sk)
+ return true;
+
+ return false;
}
/*
@@ -1032,11 +1743,11 @@ static inline void net_timestamp(struct sk_buff *skb)
* taps currently in use.
*/
-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
-
- net_timestamp(skb);
+ struct sk_buff *skb2 = NULL;
+ struct packet_type *pt_prev = NULL;
rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1044,179 +1755,1086 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
* they originated from - MvS (miquels@drinkel.ow.org)
*/
if ((ptype->dev == dev || !ptype->dev) &&
- (ptype->af_packet_priv == NULL ||
- (struct sock *)ptype->af_packet_priv != skb->sk)) {
- struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
+ (!skb_loop_sk(ptype, skb))) {
+ if (pt_prev) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
break;
+ net_timestamp_set(skb2);
+
/* skb->nh should be correctly
set by sender, so that the second statement is
just protection against buggy protocols.
*/
- skb2->mac.raw = skb2->data;
-
- if (skb2->nh.raw < skb2->data ||
- skb2->nh.raw > skb2->tail) {
- if (net_ratelimit())
- printk(KERN_CRIT "protocol %04x is "
- "buggy, dev %s\n",
- skb2->protocol, dev->name);
- skb2->nh.raw = skb2->data;
+ skb_reset_mac_header(skb2);
+
+ if (skb_network_header(skb2) < skb2->data ||
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+ net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+ ntohs(skb2->protocol),
+ dev->name);
+ skb_reset_network_header(skb2);
}
- skb2->h.raw = skb2->nh.raw;
+ skb2->transport_header = skb2->network_header;
skb2->pkt_type = PACKET_OUTGOING;
- ptype->func(skb2, skb->dev, ptype, skb->dev);
+ pt_prev = ptype;
}
}
+ if (pt_prev)
+ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
+/**
+ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
+#ifdef CONFIG_XPS
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P) \
+ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int cpu, u16 index)
+{
+ struct xps_map *map = NULL;
+ int pos;
+
+ if (dev_maps)
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] == index) {
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
+ } else {
+ RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
+ kfree_rcu(map, rcu);
+ map = NULL;
+ }
+ break;
+ }
+ }
+
+ return map;
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ struct xps_dev_maps *dev_maps;
+ int cpu, i;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = index; i < dev->num_tx_queues; i++) {
+ if (!remove_xps_queue(dev_maps, cpu, i))
+ break;
+ }
+ if (i == dev->num_tx_queues)
+ active = true;
+ }
+
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ for (i = index; i < dev->num_tx_queues; i++)
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map,
+ int cpu, u16 index)
+{
+ struct xps_map *new_map;
+ int alloc_len = XPS_MIN_MAP_ALLOC;
+ int i, pos;
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] != index)
+ continue;
+ return map;
+ }
+
+ /* Need to add queue to this CPU's existing map */
+ if (map) {
+ if (pos < map->alloc_len)
+ return map;
+
+ alloc_len = map->alloc_len * 2;
+ }
+
+ /* Need to allocate new map to store queue on this CPU's map */
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!new_map)
+ return NULL;
+
+ for (i = 0; i < pos; i++)
+ new_map->queues[i] = map->queues[i];
+ new_map->alloc_len = alloc_len;
+ new_map->len = pos;
+
+ return new_map;
+}
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_map *map, *new_map;
+ int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
+ int cpu, numa_node_id = -2;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ /* allocate memory for queue storage */
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, mask))
+ continue;
+
+ if (!new_dev_maps)
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+
+ map = expand_xps_map(map, cpu, index);
+ if (!map)
+ goto error;
+
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ if (!new_dev_maps)
+ goto out_no_new_maps;
+
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
+ /* add queue to CPU maps */
+ int pos = 0;
+
+ map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ while ((pos < map->len) && (map->queues[pos] != index))
+ pos++;
+
+ if (pos == map->len)
+ map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
+#endif
+ } else if (dev_maps) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ }
+
+ rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+
+ /* Cleanup old maps */
+ if (dev_maps) {
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (map && map != new_map)
+ kfree_rcu(map, rcu);
+ }
+
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ dev_maps = new_dev_maps;
+ active = true;
+
+out_no_new_maps:
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ? numa_node_id :
+ NUMA_NO_NODE);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ /* removes queue from unused CPUs */
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
+ continue;
+
+ if (remove_xps_queue(dev_maps, cpu, index))
+ active = true;
+ }
+
+ /* free map if not active */
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+
+ return 0;
+error:
+ /* remove any maps that we added */
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
+
+ mutex_unlock(&xps_map_mutex);
+
+ kfree(new_dev_maps);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
+
+#endif
+/*
+ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
+ * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
+ */
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+{
+ int rc;
+
+ if (txq < 1 || txq > dev->num_tx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+
+ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+ txq);
+ if (rc)
+ return rc;
+
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
+ if (txq < dev->real_num_tx_queues) {
+ qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, txq);
+#endif
+ }
+ }
+
+ dev->real_num_tx_queues = txq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_tx_queues);
+
+#ifdef CONFIG_SYSFS
+/**
+ * netif_set_real_num_rx_queues - set actual number of RX queues used
+ * @dev: Network device
+ * @rxq: Actual number of RX queues
+ *
+ * This must be called either with the rtnl_lock held or before
+ * registration of the net device. Returns 0 on success, or a
+ * negative error code. If called before registration, it always
+ * succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+ int rc;
+
+ if (rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED) {
+ ASSERT_RTNL();
+
+ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+ rxq);
+ if (rc)
+ return rc;
+ }
+
+ dev->real_num_rx_queues = rxq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * This routine should set an upper limit on the number of RSS queues
+ * used by default by multiqueue devices.
+ */
+int netif_get_num_default_rss_queues(void)
+{
+ return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
+static inline void __netif_reschedule(struct Qdisc *q)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sd = &__get_cpu_var(softnet_data);
+ q->next_sched = NULL;
+ *sd->output_queue_tailp = q;
+ sd->output_queue_tailp = &q->next_sched;
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+ __netif_reschedule(q);
+}
+EXPORT_SYMBOL(__netif_schedule);
+
+struct dev_kfree_skb_cb {
+ enum skb_free_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+ return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ unsigned long flags;
+
+ if (likely(atomic_read(&skb->users) == 1)) {
+ smp_rmb();
+ atomic_set(&skb->users, 0);
+ } else if (likely(!atomic_dec_and_test(&skb->users))) {
+ return;
+ }
+ get_kfree_skb_cb(skb)->reason = reason;
+ local_irq_save(flags);
+ skb->next = __this_cpu_read(softnet_data.completion_queue);
+ __this_cpu_write(softnet_data.completion_queue, skb);
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_irq);
+
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ if (in_irq() || irqs_disabled())
+ __dev_kfree_skb_irq(skb, reason);
+ else
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_any);
+
+
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from system and therefore no longer available.
+ */
+void netif_device_detach(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
+ netif_running(dev)) {
+ netif_tx_stop_all_queues(dev);
+ }
+}
+EXPORT_SYMBOL(netif_device_detach);
+
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached from system and restart if needed.
+ */
+void netif_device_attach(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
+ netif_running(dev)) {
+ netif_tx_wake_all_queues(dev);
+ __netdev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(netif_device_attach);
+
+static void skb_warn_bad_offload(const struct sk_buff *skb)
+{
+ static const netdev_features_t null_features = 0;
+ struct net_device *dev = skb->dev;
+ const char *driver = "";
+
+ if (!net_ratelimit())
+ return;
+
+ if (dev && dev->dev.parent)
+ driver = dev_driver_string(dev->dev.parent);
+
+ WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
+ "gso_type=%d ip_summed=%d\n",
+ driver, dev ? &dev->features : &null_features,
+ skb->sk ? &skb->sk->sk_route_caps : &null_features,
+ skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
+ skb_shinfo(skb)->gso_type, skb->ip_summed);
+}
+
/*
* Invalidate hardware checksum when packet is to be mangled, and
* complete checksum manually on outgoing path.
*/
-int skb_checksum_help(struct sk_buff *skb, int inward)
+int skb_checksum_help(struct sk_buff *skb)
{
- unsigned int csum;
- int ret = 0, offset = skb->h.raw - skb->data;
+ __wsum csum;
+ int ret = 0, offset;
- if (inward) {
- skb->ip_summed = CHECKSUM_NONE;
- goto out;
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ goto out_set_summed;
+
+ if (unlikely(skb_shinfo(skb)->gso_size)) {
+ skb_warn_bad_offload(skb);
+ return -EINVAL;
}
- if (skb_cloned(skb)) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
if (ret)
goto out;
}
- BUG_ON(offset > (int)skb->len);
- csum = skb_checksum(skb, offset, skb->len-offset, 0);
+ offset = skb_checksum_start_offset(skb);
+ BUG_ON(offset >= skb_headlen(skb));
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset = skb->tail - skb->h.raw;
- BUG_ON(offset <= 0);
- BUG_ON(skb->csum + 2 > offset);
+ offset += skb->csum_offset;
+ BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(__sum16))) {
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (ret)
+ goto out;
+ }
- *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
-out:
+out:
return ret;
}
+EXPORT_SYMBOL(skb_checksum_help);
+
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
+{
+ unsigned int vlan_depth = skb->mac_len;
+ __be16 type = skb->protocol;
+
+ /* Tunnel gso handlers can set protocol to ethernet. */
+ if (type == htons(ETH_P_TEB)) {
+ struct ethhdr *eth;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+ return 0;
+
+ eth = (struct ethhdr *)skb_mac_header(skb);
+ type = eth->h_proto;
+ }
+
+ /* if skb->protocol is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+ if (vlan_depth) {
+ if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
+ return 0;
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
+ }
+ do {
+ struct vlan_hdr *vh;
+
+ if (unlikely(!pskb_may_pull(skb,
+ vlan_depth + VLAN_HLEN)))
+ return 0;
+
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (type == htons(ETH_P_8021Q) ||
+ type == htons(ETH_P_8021AD));
+ }
+
+ *depth = vlan_depth;
+
+ return type;
+}
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ int err;
+
+ err = ptype->callbacks.gso_send_check(skb);
+ segs = ERR_PTR(err);
+ if (err || skb_gso_ok(skb, features))
+ break;
+ __skb_push(skb, (skb->data -
+ skb_network_header(skb)));
+ }
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL;
+ else
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ skb_warn_bad_offload(skb);
+
+ if (skb_header_cloned(skb) &&
+ (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return ERR_PTR(err);
+ }
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb_mac_gso_segment(skb, features);
+}
+EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
if (net_ratelimit()) {
- printk(KERN_ERR "%s: hw csum failure.\n",
- dev ? dev->name : "<unknown>");
+ pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
dump_stack();
}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
-#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
* 1. IOMMU is present and allows to map all the memory.
* 2. No high memory really exists on this machine.
*/
-static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
+#ifdef CONFIG_HIGHMEM
int i;
+ if (!(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (PageHighMem(skb_frag_page(frag)))
+ return 1;
+ }
+ }
+
+ if (PCI_DMA_BUS_IS_PHYS) {
+ struct device *pdev = dev->dev.parent;
- if (dev->features & NETIF_F_HIGHDMA)
+ if (!pdev)
+ return 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ dma_addr_t addr = page_to_phys(skb_frag_page(frag));
+ if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+struct dev_gso_cb {
+ void (*destructor)(struct sk_buff *skb);
+};
+
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+ struct dev_gso_cb *cb;
+
+ kfree_skb_list(skb->next);
+ skb->next = NULL;
+
+ cb = DEV_GSO_CB(skb);
+ if (cb->destructor)
+ cb->destructor(skb);
+}
+
+/**
+ * dev_gso_segment - Perform emulated hardware segmentation on skb.
+ * @skb: buffer to segment
+ * @features: device features as applicable to this skb
+ *
+ * This function segments the given skb and stores the list of segments
+ * in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
+{
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, features);
+
+ /* Verifying header integrity only. */
+ if (!segs)
return 0;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- if (PageHighMem(skb_shinfo(skb)->frags[i].page))
- return 1;
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
+
+ skb->next = segs;
+ DEV_GSO_CB(skb)->destructor = skb->destructor;
+ skb->destructor = dev_gso_skb_destructor;
return 0;
}
+
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#ifdef CONFIG_NET_MPLS_GSO
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
#else
-#define illegal_highdma(dev, skb) (0)
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
+}
#endif
-/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ netdev_features_t features)
{
- unsigned int size;
- u8 *data;
- long offset;
- struct skb_shared_info *ninfo;
- int headerlen = skb->data - skb->head;
- int expand = (skb->tail + skb->data_len) - skb->end;
+ int tmp;
+ __be16 type;
- if (skb_shared(skb))
- BUG();
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
- if (expand <= 0)
- expand = 0;
+ if (skb->ip_summed != CHECKSUM_NONE &&
+ !can_checksum_protocol(features, type)) {
+ features &= ~NETIF_F_ALL_CSUM;
+ } else if (illegal_highdma(skb->dev, skb)) {
+ features &= ~NETIF_F_SG;
+ }
- size = skb->end - skb->head + expand;
- size = SKB_DATA_ALIGN(size);
- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
- if (!data)
- return -ENOMEM;
+ return features;
+}
- /* Copy entire thing */
- if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
- BUG();
+netdev_features_t netif_skb_features(struct sk_buff *skb)
+{
+ __be16 protocol = skb->protocol;
+ netdev_features_t features = skb->dev->features;
- /* Set up shinfo */
- ninfo = (struct skb_shared_info*)(data + size);
- atomic_set(&ninfo->dataref, 1);
- ninfo->tso_size = skb_shinfo(skb)->tso_size;
- ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
- ninfo->nr_frags = 0;
- ninfo->frag_list = NULL;
+ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ features &= ~NETIF_F_GSO_MASK;
- /* Offset between the two in bytes */
- offset = data - skb->head;
+ if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+ } else if (!vlan_tx_tag_present(skb)) {
+ return harmonize_features(skb, features);
+ }
- /* Free old data. */
- skb_release_data(skb);
+ features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
- skb->head = data;
- skb->end = data + size;
+ if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
+ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
+ NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
- /* Set up new pointers */
- skb->h.raw += offset;
- skb->nh.raw += offset;
- skb->mac.raw += offset;
- skb->tail += offset;
- skb->data += offset;
+ return harmonize_features(skb, features);
+}
+EXPORT_SYMBOL(netif_skb_features);
- /* We are no longer a clone, even if we were. */
- skb->cloned = 0;
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int rc = NETDEV_TX_OK;
+ unsigned int skb_len;
- skb->tail += skb->data_len;
- skb->data_len = 0;
- return 0;
+ if (likely(!skb->next)) {
+ netdev_features_t features;
+
+ /*
+ * If device doesn't need skb->dst, release it right now while
+ * its hot in this cpu cache
+ */
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+
+ features = netif_skb_features(skb);
+
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (unlikely(!skb))
+ goto out;
+
+ skb->vlan_tci = 0;
+ }
+
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (netif_needs_gso(skb, features)) {
+ if (unlikely(dev_gso_segment(skb, features)))
+ goto out_kfree_skb;
+ if (skb->next)
+ goto gso;
+ } else {
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* If packet is not checksummed and device does not
+ * support checksumming for this protocol, complete
+ * checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (!(features & NETIF_F_ALL_CSUM) &&
+ skb_checksum_help(skb))
+ goto out_kfree_skb;
+ }
+ }
+
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
+ skb_len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = ops->ndo_start_xmit(skb, dev);
+ trace_net_dev_xmit(skb, rc, dev, skb_len);
+ if (rc == NETDEV_TX_OK)
+ txq_trans_update(txq);
+ return rc;
+ }
+
+gso:
+ do {
+ struct sk_buff *nskb = skb->next;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(nskb, dev);
+
+ skb_len = nskb->len;
+ trace_net_dev_start_xmit(nskb, dev);
+ rc = ops->ndo_start_xmit(nskb, dev);
+ trace_net_dev_xmit(nskb, rc, dev, skb_len);
+ if (unlikely(rc != NETDEV_TX_OK)) {
+ if (rc & ~NETDEV_TX_MASK)
+ goto out_kfree_gso_skb;
+ nskb->next = skb->next;
+ skb->next = nskb;
+ return rc;
+ }
+ txq_trans_update(txq);
+ if (unlikely(netif_xmit_stopped(txq) && skb->next))
+ return NETDEV_TX_BUSY;
+ } while (skb->next);
+
+out_kfree_gso_skb:
+ if (likely(skb->next == NULL)) {
+ skb->destructor = DEV_GSO_CB(skb)->destructor;
+ consume_skb(skb);
+ return rc;
+ }
+out_kfree_skb:
+ kfree_skb(skb);
+out:
+ return rc;
}
+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
-#define HARD_TX_LOCK(dev, cpu) { \
- if ((dev->features & NETIF_F_LLTX) == 0) { \
- spin_lock(&dev->xmit_lock); \
- dev->xmit_lock_owner = cpu; \
- } \
+static void qdisc_pkt_len_init(struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+
+ /* To get more precise estimation of bytes sent on wire,
+ * we add to pkt_len the headers size of all segments
+ */
+ if (shinfo->gso_size) {
+ unsigned int hdr_len;
+ u16 gso_segs = shinfo->gso_segs;
+
+ /* mac layer + network layer */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ hdr_len += tcp_hdrlen(skb);
+ else
+ hdr_len += sizeof(struct udphdr);
+
+ if (shinfo->gso_type & SKB_GSO_DODGY)
+ gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
+ }
}
-#define HARD_TX_UNLOCK(dev) { \
- if ((dev->features & NETIF_F_LLTX) == 0) { \
- dev->xmit_lock_owner = -1; \
- spin_unlock(&dev->xmit_lock); \
- } \
+static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ spinlock_t *root_lock = qdisc_lock(q);
+ bool contended;
+ int rc;
+
+ qdisc_pkt_len_init(skb);
+ qdisc_calculate_pkt_len(skb, q);
+ /*
+ * Heuristic to force contended enqueues to serialize on a
+ * separate lock before trying to get qdisc main lock.
+ * This permits __QDISC_STATE_RUNNING owner to get the lock more often
+ * and dequeue packets faster.
+ */
+ contended = qdisc_is_running(q);
+ if (unlikely(contended))
+ spin_lock(&q->busylock);
+
+ spin_lock(root_lock);
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ kfree_skb(skb);
+ rc = NET_XMIT_DROP;
+ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ qdisc_run_begin(q)) {
+ /*
+ * This is a work-conserving queue; there are no old skbs
+ * waiting to be sent out; and the qdisc is not running -
+ * xmit the skb directly.
+ */
+ if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+ skb_dst_force(skb);
+
+ qdisc_bstats_update(q, skb);
+
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+ if (unlikely(contended)) {
+ spin_unlock(&q->busylock);
+ contended = false;
+ }
+ __qdisc_run(q);
+ } else
+ qdisc_run_end(q);
+
+ rc = NET_XMIT_SUCCESS;
+ } else {
+ skb_dst_force(skb);
+ rc = q->enqueue(skb, q) & NET_XMIT_MASK;
+ if (qdisc_run_begin(q)) {
+ if (unlikely(contended)) {
+ spin_unlock(&q->busylock);
+ contended = false;
+ }
+ __qdisc_run(q);
+ }
+ }
+ spin_unlock(root_lock);
+ if (unlikely(contended))
+ spin_unlock(&q->busylock);
+ return rc;
}
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+static void skb_update_prio(struct sk_buff *skb)
+{
+ struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+
+ if (!skb->priority && skb->sk && map) {
+ unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+
+ if (prioidx < map->priomap_len)
+ skb->priority = map->priomap[prioidx];
+ }
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
+static DEFINE_PER_CPU(int, xmit_recursion);
+#define RECURSION_LIMIT 10
+
+/**
+ * dev_loopback_xmit - loop back @skb
+ * @skb: buffer to transmit
+ */
+int dev_loopback_xmit(struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ __skb_pull(skb, skb_network_offset(skb));
+ skb->pkt_type = PACKET_LOOPBACK;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ WARN_ON(!skb_dst(skb));
+ skb_dst_force(skb);
+ netif_rx_ni(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dev_loopback_xmit);
+
/**
- * dev_queue_xmit - transmit a buffer
+ * __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
+ * @accel_priv: private data used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
@@ -1239,78 +2857,39 @@ int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
-
-int dev_queue_xmit(struct sk_buff *skb)
+static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
- if (skb_shinfo(skb)->frag_list &&
- !(dev->features & NETIF_F_FRAGLIST) &&
- __skb_linearize(skb, GFP_ATOMIC))
- goto out_kfree_skb;
+ skb_reset_mac_header(skb);
- /* Fragmented skb is linearized if device does not support SG,
- * or if at least one of fragments is in highmem and device
- * does not support DMA from it.
+ /* Disable soft irqs for various locks below. Also
+ * stops preemption for RCU.
*/
- if (skb_shinfo(skb)->nr_frags &&
- (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
- __skb_linearize(skb, GFP_ATOMIC))
- goto out_kfree_skb;
+ rcu_read_lock_bh();
- /* If packet is not checksummed and device does not support
- * checksumming for this protocol, complete checksumming here.
- */
- if (skb->ip_summed == CHECKSUM_HW &&
- (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
- (!(dev->features & NETIF_F_IP_CSUM) ||
- skb->protocol != htons(ETH_P_IP))))
- if (skb_checksum_help(skb, 0))
- goto out_kfree_skb;
-
- spin_lock_prefetch(&dev->queue_lock);
+ skb_update_prio(skb);
- /* Disable soft irqs for various locks below. Also
- * stops preemption for RCU.
- */
- local_bh_disable();
-
- /* Updates of qdisc are serialized by queue_lock.
- * The struct Qdisc which is pointed to by qdisc is now a
- * rcu structure - it may be accessed without acquiring
- * a lock (but the structure may be stale.) The freeing of the
- * qdisc will be deferred until it's known that there are no
- * more references to it.
- *
- * If the qdisc has an enqueue function, we still need to
- * hold the queue_lock before calling it, since queue_lock
- * also serializes access to the device queue.
- */
+ txq = netdev_pick_tx(dev, skb, accel_priv);
+ q = rcu_dereference_bh(txq->qdisc);
- q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
+ trace_net_dev_queue(skb);
if (q->enqueue) {
- /* Grab device queue */
- spin_lock(&dev->queue_lock);
-
- rc = q->enqueue(skb, q);
-
- qdisc_run(dev);
-
- spin_unlock(&dev->queue_lock);
- rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+ rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
- Really, it is unlikely that xmit_lock protection is necessary here.
- (f.e. loopback and IP tunnels are clean ignoring statistics
+ Really, it is unlikely that netif_tx_lock protection is necessary
+ here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
@@ -1321,139 +2900,459 @@ int dev_queue_xmit(struct sk_buff *skb)
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- if (dev->xmit_lock_owner != cpu) {
+ if (txq->xmit_lock_owner != cpu) {
- HARD_TX_LOCK(dev, cpu);
+ if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+ goto recursion_alert;
- if (!netif_queue_stopped(dev)) {
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
+ HARD_TX_LOCK(dev, txq, cpu);
- rc = 0;
- if (!dev->hard_start_xmit(skb, dev)) {
- HARD_TX_UNLOCK(dev);
+ if (!netif_xmit_stopped(txq)) {
+ __this_cpu_inc(xmit_recursion);
+ rc = dev_hard_start_xmit(skb, dev, txq);
+ __this_cpu_dec(xmit_recursion);
+ if (dev_xmit_complete(rc)) {
+ HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
- HARD_TX_UNLOCK(dev);
- if (net_ratelimit())
- printk(KERN_CRIT "Virtual device %s asks to "
- "queue packet!\n", dev->name);
+ HARD_TX_UNLOCK(dev, txq);
+ net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
+ dev->name);
} else {
/* Recursion is detected! It is possible,
- * unfortunately */
- if (net_ratelimit())
- printk(KERN_CRIT "Dead loop on virtual device "
- "%s, fix it urgently!\n", dev->name);
+ * unfortunately
+ */
+recursion_alert:
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
}
}
rc = -ENETDOWN;
- local_bh_enable();
+ rcu_read_unlock_bh();
-out_kfree_skb:
+ atomic_long_inc(&dev->tx_dropped);
kfree_skb(skb);
return rc;
out:
- local_bh_enable();
+ rcu_read_unlock_bh();
return rc;
}
+int dev_queue_xmit(struct sk_buff *skb)
+{
+ return __dev_queue_xmit(skb, NULL);
+}
+EXPORT_SYMBOL(dev_queue_xmit);
+
+int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+{
+ return __dev_queue_xmit(skb, accel_priv);
+}
+EXPORT_SYMBOL(dev_queue_xmit_accel);
+
/*=======================================================================
Receiver routines
=======================================================================*/
-int netdev_max_backlog = 1000;
-int netdev_budget = 300;
-int weight_p = 64; /* old backlog weight */
+int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
+int netdev_tstamp_prequeue __read_mostly = 1;
+int netdev_budget __read_mostly = 300;
+int weight_p __read_mostly = 64; /* old backlog weight */
+
+/* Called with irq disabled */
+static inline void ____napi_schedule(struct softnet_data *sd,
+ struct napi_struct *napi)
+{
+ list_add_tail(&napi->poll_list, &sd->poll_list);
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+struct static_key rps_needed __read_mostly;
+
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ if (next_cpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, next_cpu).input_queue_head;
+ }
+
+ rflow->cpu = next_cpu;
+ return rflow;
+}
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
+{
+ struct netdev_rx_queue *rxqueue;
+ struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_sock_flow_table *sock_flow_table;
+ int cpu = -1;
+ u16 tcpu;
+ u32 hash;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+ goto done;
+ }
+ rxqueue = dev->_rx + index;
+ } else
+ rxqueue = dev->_rx;
+
+ map = rcu_dereference(rxqueue->rps_map);
+ if (map) {
+ if (map->len == 1 &&
+ !rcu_access_pointer(rxqueue->rps_flow_table)) {
+ tcpu = map->cpus[0];
+ if (cpu_online(tcpu))
+ cpu = tcpu;
+ goto done;
+ }
+ } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+ goto done;
+ }
+
+ skb_reset_network_header(skb);
+ hash = skb_get_hash(skb);
+ if (!hash)
+ goto done;
+
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ if (flow_table && sock_flow_table) {
+ u16 next_cpu;
+ struct rps_dev_flow *rflow;
+
+ rflow = &flow_table->flows[hash & flow_table->mask];
+ tcpu = rflow->cpu;
+
+ next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
+
+ /*
+ * If the desired CPU (where last recvmsg was done) is
+ * different from current CPU (one in the rx-queue flow
+ * table entry), switch if one of the following holds:
+ * - Current CPU is unset (equal to RPS_NO_CPU).
+ * - Current CPU is offline.
+ * - The current CPU's queue tail has advanced beyond the
+ * last packet that was enqueued using this table entry.
+ * This guarantees that all previous packets for the flow
+ * have been dequeued, thus preserving in order delivery.
+ */
+ if (unlikely(tcpu != next_cpu) &&
+ (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+ ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ rflow->last_qtail)) >= 0)) {
+ tcpu = next_cpu;
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ }
+
+ if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+ *rflowp = rflow;
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+ if (map) {
+ tcpu = map->cpus[((u64) hash * map->len) >> 32];
+
+ if (cpu_online(tcpu)) {
+ cpu = tcpu;
+ goto done;
+ }
+ }
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+done:
+ return cpu;
+}
+#ifdef CONFIG_RFS_ACCEL
/**
- * netif_rx - post buffer to the network code
- * @skb: buffer to post
- *
- * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
- *
- * return values:
- * NET_RX_SUCCESS (no congestion)
- * NET_RX_CN_LOW (low congestion)
- * NET_RX_CN_MOD (moderate congestion)
- * NET_RX_CN_HIGH (high congestion)
- * NET_RX_DROP (packet was dropped)
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
*
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
*/
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ int cpu;
-int netif_rx(struct sk_buff *skb)
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
+/* Called from hardirq (IPI) context */
+static void rps_trigger_softirq(void *data)
{
- struct softnet_data *queue;
- unsigned long flags;
+ struct softnet_data *sd = data;
- /* if netpoll wants it, pretend we never saw it */
- if (netpoll_rx(skb))
- return NET_RX_DROP;
+ ____napi_schedule(sd, &sd->backlog);
+ sd->received_rps++;
+}
- if (!skb->tstamp.off_sec)
- net_timestamp(skb);
+#endif /* CONFIG_RPS */
+
+/*
+ * Check if this softnet_data structure is another cpu one
+ * If yes, queue it to our IPI list and return 1
+ * If no, return 0
+ */
+static int rps_ipi_queued(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *mysd = &__get_cpu_var(softnet_data);
+
+ if (sd != mysd) {
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return 1;
+ }
+#endif /* CONFIG_RPS */
+ return 0;
+}
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+ struct softnet_data *sd;
+ unsigned int old_flow, new_flow;
+
+ if (qlen < (netdev_max_backlog >> 1))
+ return false;
+
+ sd = &__get_cpu_var(softnet_data);
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl) {
+ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ old_flow = fl->history[fl->history_head];
+ fl->history[fl->history_head] = new_flow;
+
+ fl->history_head++;
+ fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+ if (likely(fl->buckets[old_flow]))
+ fl->buckets[old_flow]--;
+
+ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+ fl->count++;
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+#endif
+ return false;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+ unsigned int *qtail)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+ unsigned int qlen;
+
+ sd = &per_cpu(softnet_data, cpu);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
+ rps_lock(sd);
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
- dev_hold(skb->dev);
- __skb_queue_tail(&queue->input_pkt_queue, skb);
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ input_queue_tail_incr_save(sd, qtail);
+ rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
- netif_rx_schedule(&queue->backlog_dev);
+ /* Schedule NAPI for backlog device
+ * We can use non atomic operation since we own the queue lock
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
+ if (!rps_ipi_queued(sd))
+ ____napi_schedule(sd, &sd->backlog);
+ }
goto enqueue;
}
- __get_cpu_var(netdev_rx_stat).dropped++;
+ sd->dropped++;
+ rps_unlock(sd);
+
local_irq_restore(flags);
+ atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
+static int netif_rx_internal(struct sk_buff *skb)
+{
+ int ret;
+
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+ trace_netif_rx(skb);
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
+ if (cpu < 0)
+ cpu = smp_processor_id();
+
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+ rcu_read_unlock();
+ preempt_enable();
+ } else
+#endif
+ {
+ unsigned int qtail;
+ ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+ put_cpu();
+ }
+ return ret;
+}
+
+/**
+ * netif_rx - post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process. It always succeeds. The buffer
+ * may be dropped during processing for congestion control or by the
+ * protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+ trace_netif_rx_entry(skb);
+
+ return netif_rx_internal(skb);
+}
+EXPORT_SYMBOL(netif_rx);
+
int netif_rx_ni(struct sk_buff *skb)
{
int err;
+ trace_netif_rx_ni_entry(skb);
+
preempt_disable();
- err = netif_rx(skb);
+ err = netif_rx_internal(skb);
if (local_softirq_pending())
do_softirq();
preempt_enable();
return err;
}
-
EXPORT_SYMBOL(netif_rx_ni);
-static inline struct net_device *skb_bond(struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
-
- if (dev->master)
- skb->dev = dev->master;
-
- return dev;
-}
-
static void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
@@ -1470,70 +3369,57 @@ static void net_tx_action(struct softirq_action *h)
struct sk_buff *skb = clist;
clist = clist->next;
- BUG_TRAP(!atomic_read(&skb->users));
+ WARN_ON(atomic_read(&skb->users));
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+ trace_consume_skb(skb);
+ else
+ trace_kfree_skb(skb, net_tx_action);
__kfree_skb(skb);
}
}
if (sd->output_queue) {
- struct net_device *head;
+ struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
+ sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
while (head) {
- struct net_device *dev = head;
- head = head->next_sched;
+ struct Qdisc *q = head;
+ spinlock_t *root_lock;
- smp_mb__before_clear_bit();
- clear_bit(__LINK_STATE_SCHED, &dev->state);
+ head = head->next_sched;
- if (spin_trylock(&dev->queue_lock)) {
- qdisc_run(dev);
- spin_unlock(&dev->queue_lock);
+ root_lock = qdisc_lock(q);
+ if (spin_trylock(root_lock)) {
+ smp_mb__before_atomic();
+ clear_bit(__QDISC_STATE_SCHED,
+ &q->state);
+ qdisc_run(q);
+ spin_unlock(root_lock);
} else {
- netif_schedule(dev);
+ if (!test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state)) {
+ __netif_reschedule(q);
+ } else {
+ smp_mb__before_atomic();
+ clear_bit(__QDISC_STATE_SCHED,
+ &q->state);
+ }
}
}
}
}
-static __inline__ int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
-{
- atomic_inc(&skb->users);
- return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-}
-
-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
-int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
-struct net_bridge;
-struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
- unsigned char *addr);
-void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
-
-static __inline__ int handle_bridge(struct sk_buff **pskb,
- struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
-{
- struct net_bridge_port *port;
-
- if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
- (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
- return 0;
-
- if (*pt_prev) {
- *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
- *pt_prev = NULL;
- }
-
- return br_handle_frame_hook(port, pskb);
-}
-#else
-#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+ (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+/* This hook is defined here for ATM LANE */
+int (*br_fdb_test_addr_hook)(struct net_device *dev,
+ unsigned char *addr) __read_mostly;
+EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
#ifdef CONFIG_NET_CLS_ACT
@@ -1541,67 +3427,172 @@ static __inline__ int handle_bridge(struct sk_buff **pskb,
* when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
* a compare and 2 stores extra right now if we dont have it on
* but have CONFIG_NET_CLS_ACT
- * NOTE: This doesnt stop any functionality; if you dont have
- * the ingress scheduler, you just cant add policies on ingress.
+ * NOTE: This doesn't stop any functionality; if you dont have
+ * the ingress scheduler, you just can't add policies on ingress.
*
*/
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
{
- struct Qdisc *q;
struct net_device *dev = skb->dev;
+ u32 ttl = G_TC_RTTL(skb->tc_verd);
int result = TC_ACT_OK;
-
- if (dev->qdisc_ingress) {
- __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
- if (MAX_RED_LOOP < ttl++) {
- printk("Redir loop detected Dropping packet (%s->%s)\n",
- skb->input_dev->name, skb->dev->name);
- return TC_ACT_SHOT;
- }
-
- skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
+ struct Qdisc *q;
- skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
+ if (unlikely(MAX_RED_LOOP < ttl++)) {
+ net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
+ skb->skb_iif, dev->ifindex);
+ return TC_ACT_SHOT;
+ }
- spin_lock(&dev->ingress_lock);
- if ((q = dev->qdisc_ingress) != NULL)
- result = q->enqueue(skb, q);
- spin_unlock(&dev->ingress_lock);
+ skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+ q = rxq->qdisc;
+ if (q != &noop_qdisc) {
+ spin_lock(qdisc_lock(q));
+ if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
+ result = qdisc_enqueue_root(skb, q);
+ spin_unlock(qdisc_lock(q));
}
return result;
}
+
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+ struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+ if (!rxq || rxq->qdisc == &noop_qdisc)
+ goto out;
+
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ switch (ing_filter(skb, rxq)) {
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ kfree_skb(skb);
+ return NULL;
+ }
+
+out:
+ skb->tc_verd = 0;
+ return skb;
+}
#endif
-int netif_receive_skb(struct sk_buff *skb)
+/**
+ * netdev_rx_handler_register - register receive handler
+ * @dev: device to register a handler for
+ * @rx_handler: receive handler to register
+ * @rx_handler_data: data pointer that is used by rx handler
+ *
+ * Register a receive handler for a device. This handler will then be
+ * called from __netif_receive_skb. A negative errno code is returned
+ * on a failure.
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+ rx_handler_func_t *rx_handler,
+ void *rx_handler_data)
+{
+ ASSERT_RTNL();
+
+ if (dev->rx_handler)
+ return -EBUSY;
+
+ /* Note: rx_handler_data must be set before rx_handler */
+ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+ rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ * netdev_rx_handler_unregister - unregister receive handler
+ * @dev: device to unregister a handler from
+ *
+ * Unregister a receive handler from a device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+
+ ASSERT_RTNL();
+ RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
+ * section has a guarantee to see a non NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
+ RCU_INIT_POINTER(dev->rx_handler_data, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
+ rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
+ struct net_device *null_or_dev;
+ bool deliver_exact = false;
int ret = NET_RX_DROP;
- unsigned short type;
-
- /* if we've gotten here through NAPI, check netpoll */
- if (skb->dev->poll && netpoll_rx(skb))
- return NET_RX_DROP;
+ __be16 type;
- if (!skb->tstamp.off_sec)
- net_timestamp(skb);
+ net_timestamp_check(!netdev_tstamp_prequeue, skb);
- if (!skb->input_dev)
- skb->input_dev = skb->dev;
+ trace_netif_receive_skb(skb);
- orig_dev = skb_bond(skb);
+ orig_dev = skb->dev;
- __get_cpu_var(netdev_rx_stat).total++;
-
- skb->h.raw = skb->nh.raw = skb->data;
- skb->mac_len = skb->nh.raw - skb->mac.raw;
+ skb_reset_network_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
pt_prev = NULL;
rcu_read_lock();
+another_round:
+ skb->skb_iif = skb->dev->ifindex;
+
+ __this_cpu_inc(softnet_data.processed);
+
+ if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ skb = vlan_untag(skb);
+ if (unlikely(!skb))
+ goto unlock;
+ }
+
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -1609,51 +3600,93 @@ int netif_receive_skb(struct sk_buff *skb)
}
#endif
+ if (pfmemalloc)
+ goto skip_taps;
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
- if (pt_prev)
+ if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
+skip_taps:
#ifdef CONFIG_NET_CLS_ACT
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL; /* noone else should process this after*/
- } else {
- skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
- }
+ skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto unlock;
+ncls:
+#endif
- ret = ing_filter(skb);
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+ goto drop;
- if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
- kfree_skb(skb);
- goto out;
+ if (vlan_tx_tag_present(skb)) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ if (vlan_do_receive(&skb))
+ goto another_round;
+ else if (unlikely(!skb))
+ goto unlock;
}
- skb->tc_verd = 0;
-ncls:
-#endif
+ rx_handler = rcu_dereference(skb->dev->rx_handler);
+ if (rx_handler) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ switch (rx_handler(&skb)) {
+ case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
+ goto unlock;
+ case RX_HANDLER_ANOTHER:
+ goto another_round;
+ case RX_HANDLER_EXACT:
+ deliver_exact = true;
+ case RX_HANDLER_PASS:
+ break;
+ default:
+ BUG();
+ }
+ }
- handle_diverter(skb);
+ if (unlikely(vlan_tx_tag_present(skb))) {
+ if (vlan_tx_tag_get_id(skb))
+ skb->pkt_type = PACKET_OTHERHOST;
+ /* Note: we might in the future use prio bits
+ * and set skb->priority like in vlan_do_receive()
+ * For the time being, just ignore Priority Code Point
+ */
+ skb->vlan_tci = 0;
+ }
- if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
- goto out;
+ /* deliver only exact match when indicated */
+ null_or_dev = deliver_exact ? skb->dev : NULL;
type = skb->protocol;
- list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
+ list_for_each_entry_rcu(ptype,
+ &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
- (!ptype->dev || ptype->dev == skb->dev)) {
- if (pt_prev)
+ (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
+ ptype->dev == orig_dev)) {
+ if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ goto drop;
+ else
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
+drop:
+ atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
@@ -1661,439 +3694,1568 @@ ncls:
ret = NET_RX_DROP;
}
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+ int ret;
+
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+ unsigned long pflags = current->flags;
+
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ current->flags |= PF_MEMALLOC;
+ ret = __netif_receive_skb_core(skb, true);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ } else
+ ret = __netif_receive_skb_core(skb, false);
+
+ return ret;
+}
+
+static int netif_receive_skb_internal(struct sk_buff *skb)
+{
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+ if (skb_defer_rx_timestamp(skb))
+ return NET_RX_SUCCESS;
+
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu, ret;
+
+ rcu_read_lock();
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ rcu_read_unlock();
+ return ret;
+ }
+ rcu_read_unlock();
+ }
+#endif
+ return __netif_receive_skb(skb);
+}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+ trace_netif_receive_skb_entry(skb);
+
+ return netif_receive_skb_internal(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb);
+
+/* Network device is going away, flush any packets still pending
+ * Called with irqs disabled.
+ */
+static void flush_backlog(void *arg)
+{
+ struct net_device *dev = arg;
+ struct softnet_data *sd = &__get_cpu_var(softnet_data);
+ struct sk_buff *skb, *tmp;
+
+ rps_lock(sd);
+ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+ if (skb->dev == dev) {
+ __skb_unlink(skb, &sd->input_pkt_queue);
+ kfree_skb(skb);
+ input_queue_head_incr(sd);
+ }
+ }
+ rps_unlock(sd);
+
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ if (skb->dev == dev) {
+ __skb_unlink(skb, &sd->process_queue);
+ kfree_skb(skb);
+ input_queue_head_incr(sd);
+ }
+ }
+}
+
+static int napi_gro_complete(struct sk_buff *skb)
+{
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = ptype->callbacks.gro_complete(skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
out:
+ return netif_receive_skb_internal(skb);
+}
+
+/* napi->gro_list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ struct sk_buff *skb, *prev = NULL;
+
+ /* scan list and build reverse chain */
+ for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
+ skb->prev = prev;
+ prev = skb;
+ }
+
+ for (skb = prev; skb; skb = prev) {
+ skb->next = NULL;
+
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+
+ prev = skb->prev;
+ napi_gro_complete(skb);
+ napi->gro_count--;
+ }
+
+ napi->gro_list = NULL;
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct sk_buff *p;
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+
+ for (p = napi->gro_list; p; p = p->next) {
+ unsigned long diffs;
+
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
+ pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int same_flow;
+ enum gro_result ret;
+ int grow;
+
+ if (!(skb->dev->features & NETIF_F_GRO))
+ goto normal;
+
+ if (skb_is_gso(skb) || skb_has_frag_list(skb))
+ goto normal;
+
+ gro_list_prepare(napi, skb);
+ NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 0;
+ NAPI_GRO_CB(skb)->free = 0;
+ NAPI_GRO_CB(skb)->udp_mark = 0;
+
+ pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+ break;
+ }
rcu_read_unlock();
+
+ if (&ptype->list == head)
+ goto normal;
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ struct sk_buff *nskb = *pp;
+
+ *pp = nskb->next;
+ nskb->next = NULL;
+ napi_gro_complete(nskb);
+ napi->gro_count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
+ struct sk_buff *nskb = napi->gro_list;
+
+ /* locate the end of the list to select the 'oldest' flow */
+ while (nskb->next) {
+ pp = &nskb->next;
+ nskb = *pp;
+ }
+ *pp = NULL;
+ nskb->next = NULL;
+ napi_gro_complete(nskb);
+ } else {
+ napi->gro_count++;
+ }
+ NAPI_GRO_CB(skb)->count = 1;
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ skb->next = napi->gro_list;
+ napi->gro_list = skb;
+ ret = GRO_HELD;
+
+pull:
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+ok:
return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ goto pull;
}
-static int process_backlog(struct net_device *backlog_dev, int *budget)
+struct packet_offload *gro_find_receive_by_type(__be16 type)
{
- int work = 0;
- int quota = min(backlog_dev->quota, *budget);
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
- unsigned long start_time = jiffies;
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
- backlog_dev->weight = weight_p;
- for (;;) {
- struct sk_buff *skb;
- struct net_device *dev;
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
- local_irq_disable();
- skb = __skb_dequeue(&queue->input_pkt_queue);
- if (!skb)
- goto job_done;
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ if (netif_receive_skb_internal(skb))
+ ret = GRO_DROP;
+ break;
+
+ case GRO_DROP:
+ kfree_skb(skb);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ kmem_cache_free(skbuff_head_cache, skb);
+ else
+ __kfree_skb(skb);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb);
+
+ return napi_skb_finish(dev_gro_receive(napi, skb), skb);
+}
+EXPORT_SYMBOL(napi_gro_receive);
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+ napi->skb = skb;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
+ ret = GRO_DROP;
+ break;
+
+ case GRO_DROP:
+ case GRO_MERGED_FREE:
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ break;
+ }
+
+ return ret;
+}
+
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header_fast(skb, 0);
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ if (!skb)
+ return GRO_DROP;
+
+ trace_napi_gro_frags_entry(skb);
+
+ return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/*
+ * net_rps_action_and_irq_enable sends any pending IPI's for rps.
+ * Note: called with local irq disabled, but exits with local irq enabled.
+ */
+static void net_rps_action_and_irq_enable(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *remsd = sd->rps_ipi_list;
+
+ if (remsd) {
+ sd->rps_ipi_list = NULL;
+
+ local_irq_enable();
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu,
+ &remsd->csd);
+ remsd = next;
+ }
+ } else
+#endif
local_irq_enable();
+}
- dev = skb->dev;
+static int process_backlog(struct napi_struct *napi, int quota)
+{
+ int work = 0;
+ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
- netif_receive_skb(skb);
+#ifdef CONFIG_RPS
+ /* Check if we have pending ipi, its better to send them now,
+ * not waiting net_rx_action() end.
+ */
+ if (sd->rps_ipi_list) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+#endif
+ napi->weight = weight_p;
+ local_irq_disable();
+ while (1) {
+ struct sk_buff *skb;
- dev_put(dev);
+ while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_irq_enable();
+ __netif_receive_skb(skb);
+ local_irq_disable();
+ input_queue_head_incr(sd);
+ if (++work >= quota) {
+ local_irq_enable();
+ return work;
+ }
+ }
- work++;
+ rps_lock(sd);
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
+ /*
+ * Inline a custom version of __napi_complete().
+ * only current cpu owns and manipulates this napi,
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
+ * and we dont need an smp_mb() memory barrier.
+ */
+ list_del(&napi->poll_list);
+ napi->state = 0;
+ rps_unlock(sd);
- if (work >= quota || jiffies - start_time > 1)
break;
+ }
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
+ rps_unlock(sd);
}
+ local_irq_enable();
- backlog_dev->quota -= work;
- *budget -= work;
- return -1;
+ return work;
+}
-job_done:
- backlog_dev->quota -= work;
- *budget -= work;
+/**
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
+ *
+ * The entry's receive function will be scheduled to run
+ */
+void __napi_schedule(struct napi_struct *n)
+{
+ unsigned long flags;
- list_del(&backlog_dev->poll_list);
- smp_mb__before_clear_bit();
- netif_poll_enable(backlog_dev);
+ local_irq_save(flags);
+ ____napi_schedule(&__get_cpu_var(softnet_data), n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__napi_schedule);
- local_irq_enable();
- return 0;
+void __napi_complete(struct napi_struct *n)
+{
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ BUG_ON(n->gro_list);
+
+ list_del(&n->poll_list);
+ smp_mb__before_atomic();
+ clear_bit(NAPI_STATE_SCHED, &n->state);
+}
+EXPORT_SYMBOL(__napi_complete);
+
+void napi_complete(struct napi_struct *n)
+{
+ unsigned long flags;
+
+ /*
+ * don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu
+ */
+ if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+ return;
+
+ napi_gro_flush(n, false);
+ local_irq_save(flags);
+ __napi_complete(n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(napi_complete);
+
+/* must be called under rcu_read_lock(), as we dont take a reference */
+struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(napi_by_id);
+
+void napi_hash_add(struct napi_struct *napi)
+{
+ if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0 is not a valid id, we also skip an id that is taken
+ * we expect both events to be extremely rare
+ */
+ napi->napi_id = 0;
+ while (!napi->napi_id) {
+ napi->napi_id = ++napi_gen_id;
+ if (napi_by_id(napi->napi_id))
+ napi->napi_id = 0;
+ }
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
+ }
}
+EXPORT_SYMBOL_GPL(napi_hash_add);
+
+/* Warning : caller is responsible to make sure rcu grace period
+ * is respected before freeing memory containing @napi
+ */
+void napi_hash_del(struct napi_struct *napi)
+{
+ spin_lock(&napi_hash_lock);
+
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+ hlist_del_rcu(&napi->napi_hash_node);
+
+ spin_unlock(&napi_hash_lock);
+}
+EXPORT_SYMBOL_GPL(napi_hash_del);
+
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int weight)
+{
+ INIT_LIST_HEAD(&napi->poll_list);
+ napi->gro_count = 0;
+ napi->gro_list = NULL;
+ napi->skb = NULL;
+ napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ pr_err_once("netif_napi_add() called with weight %d on device %s\n",
+ weight, dev->name);
+ napi->weight = weight;
+ list_add(&napi->dev_list, &dev->napi_list);
+ napi->dev = dev;
+#ifdef CONFIG_NETPOLL
+ spin_lock_init(&napi->poll_lock);
+ napi->poll_owner = -1;
+#endif
+ set_bit(NAPI_STATE_SCHED, &napi->state);
+}
+EXPORT_SYMBOL(netif_napi_add);
+
+void netif_napi_del(struct napi_struct *napi)
+{
+ list_del_init(&napi->dev_list);
+ napi_free_frags(napi);
+
+ kfree_skb_list(napi->gro_list);
+ napi->gro_list = NULL;
+ napi->gro_count = 0;
+}
+EXPORT_SYMBOL(netif_napi_del);
static void net_rx_action(struct softirq_action *h)
{
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
- unsigned long start_time = jiffies;
+ struct softnet_data *sd = &__get_cpu_var(softnet_data);
+ unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
- while (!list_empty(&queue->poll_list)) {
- struct net_device *dev;
+ while (!list_empty(&sd->poll_list)) {
+ struct napi_struct *n;
+ int work, weight;
- if (budget <= 0 || jiffies - start_time > 1)
+ /* If softirq window is exhuasted then punt.
+ * Allow this to run for 2 jiffies since which will allow
+ * an average latency of 1.5/HZ.
+ */
+ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
- dev = list_entry(queue->poll_list.next,
- struct net_device, poll_list);
- have = netpoll_poll_lock(dev);
+ /* Even though interrupts have been re-enabled, this
+ * access is safe because interrupts can only add new
+ * entries to the tail of this list, and only ->poll()
+ * calls can remove this head entry from the list.
+ */
+ n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
- if (dev->quota <= 0 || dev->poll(dev, &budget)) {
- netpoll_poll_unlock(have);
- local_irq_disable();
- list_del(&dev->poll_list);
- list_add_tail(&dev->poll_list, &queue->poll_list);
- if (dev->quota < 0)
- dev->quota += dev->weight;
- else
- dev->quota = dev->weight;
- } else {
- netpoll_poll_unlock(have);
- dev_put(dev);
- local_irq_disable();
+ have = netpoll_poll_lock(n);
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n);
}
+
+ WARN_ON_ONCE(work > weight);
+
+ budget -= work;
+
+ local_irq_disable();
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(work == weight)) {
+ if (unlikely(napi_disable_pending(n))) {
+ local_irq_enable();
+ napi_complete(n);
+ local_irq_disable();
+ } else {
+ if (n->gro_list) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ local_irq_enable();
+ napi_gro_flush(n, HZ >= 1000);
+ local_irq_disable();
+ }
+ list_move_tail(&n->poll_list, &sd->poll_list);
+ }
+ }
+
+ netpoll_poll_unlock(have);
}
out:
- local_irq_enable();
+ net_rps_action_and_irq_enable(sd);
+
+#ifdef CONFIG_NET_DMA
+ /*
+ * There may not be any more sk_buffs coming right now, so push
+ * any pending DMA copies to hardware
+ */
+ dma_issue_pending_all();
+#endif
+
return;
softnet_break:
- __get_cpu_var(netdev_rx_stat).time_squeeze++;
+ sd->time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
-static gifconf_func_t * gifconf_list [NPROTO];
+struct netdev_adjacent {
+ struct net_device *dev;
+
+ /* upper master flag, there can only be one master device per list */
+ bool master;
+
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
+
+ /* private field for the users */
+ void *private;
+
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ list_for_each_entry(adj, adj_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
+ }
+ return NULL;
+}
/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
*
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks only immediate upper device,
+ * not through a complete stack of devices. The caller must hold the RTNL lock.
*/
-int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
+bool netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
+ ASSERT_RTNL();
+ return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev);
-/*
- * Map an interface index to its name (SIOCGIFNAME)
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
*/
+static bool netdev_has_any_upper_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
-/*
- * We need this ioctl for efficient implementation of the
- * if_indextoname() function required by the IPv6 API. Without
- * it, we would have to search all the interfaces to find a
- * match. --pb
+ return !list_empty(&dev->all_adj_list.upper);
+}
+
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
*/
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
-static int dev_ifname(struct ifreq __user *arg)
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
+
+void *netdev_adjacent_get_private(struct list_head *adj_list)
{
- struct net_device *dev;
- struct ifreq ifr;
+ struct netdev_adjacent *adj;
- /*
- * Fetch the caller's info block.
- */
+ adj = list_entry(adj_list, struct netdev_adjacent, list);
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
+ return adj->private;
+}
+EXPORT_SYMBOL(netdev_adjacent_get_private);
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(ifr.ifr_ifindex);
- if (!dev) {
- read_unlock(&dev_base_lock);
- return -ENODEV;
- }
+/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
- strcpy(ifr.ifr_name, dev->name);
- read_unlock(&dev_base_lock);
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
- if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
- return -EFAULT;
- return 0;
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
-/*
- * Perform a SIOCGIFCONF call. This structure will change
- * size eventually, and there is nothing I can do about it.
- * Thus we will need a 'compatibility mode'.
+/**
+ * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
*/
+struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->all_adj_list.upper)
+ return NULL;
-static int dev_ifconf(char __user *arg)
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+/**
+ * netdev_lower_get_next_private - Get the next ->private from the
+ * lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold either hold the
+ * RTNL lock or its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
+ */
+void *netdev_lower_get_next_private(struct net_device *dev,
+ struct list_head **iter)
{
- struct ifconf ifc;
- struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
+ struct netdev_adjacent *lower;
- /*
- * Fetch the caller's info block.
- */
+ lower = list_entry(*iter, struct netdev_adjacent, list);
- if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
- return -EFAULT;
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
- pos = ifc.ifc_buf;
- len = ifc.ifc_len;
+ *iter = lower->list.next;
- /*
- * Loop over the interfaces, and write an info block for each.
- */
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private);
- total = 0;
- for (dev = dev_base; dev; dev = dev->next) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total);
- if (done < 0)
- return -EFAULT;
- total += done;
- }
- }
- }
+/**
+ * netdev_lower_get_next_private_rcu - Get the next ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_next_private_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
- /*
- * All done. Write the updated control block back to the caller.
- */
- ifc.ifc_len = total;
+ WARN_ON_ONCE(!rcu_read_lock_held());
- /*
- * Both BSD and Solaris return 0 here, so we do too.
- */
- return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->private;
}
+EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
-#ifdef CONFIG_PROC_FS
-/*
- * This is invoked by the /proc filesystem handler to display a device
- * in detail.
+/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
*/
-static __inline__ struct net_device *dev_get_idx(loff_t pos)
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
- struct net_device *dev;
- loff_t i;
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
- for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
- return i == pos ? dev : NULL;
+ *iter = &lower->list;
+
+ return lower->dev;
}
+EXPORT_SYMBOL(netdev_lower_get_next);
-void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+/**
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ *
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
- read_lock(&dev_base_lock);
- return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ struct netdev_adjacent *lower;
+
+ lower = list_first_or_null_rcu(&dev->adj_list.lower,
+ struct netdev_adjacent, list);
+ if (lower)
+ return lower->private;
+ return NULL;
}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
-void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
+ struct netdev_adjacent *upper;
+
+ upper = list_first_or_null_rcu(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (upper && likely(upper->master))
+ return upper->dev;
+ return NULL;
}
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
-void dev_seq_stop(struct seq_file *seq, void *v)
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
{
- read_unlock(&dev_base_lock);
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", adj_dev->name);
+ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
+ linkname);
}
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
+ char *name,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", name);
+ sysfs_remove_link(&(dev->dev.kobj), linkname);
+}
+
+#define netdev_adjacent_is_neigh_list(dev, dev_list) \
+ (dev_list == &dev->adj_list.upper || \
+ dev_list == &dev->adj_list.lower)
-static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list,
+ void *private, bool master)
{
- if (dev->get_stats) {
- struct net_device_stats *stats = dev->get_stats(dev);
+ struct netdev_adjacent *adj;
+ int ret;
- seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
- "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
- dev->name, stats->rx_bytes, stats->rx_packets,
- stats->rx_errors,
- stats->rx_dropped + stats->rx_missed_errors,
- stats->rx_fifo_errors,
- stats->rx_length_errors + stats->rx_over_errors +
- stats->rx_crc_errors + stats->rx_frame_errors,
- stats->rx_compressed, stats->multicast,
- stats->tx_bytes, stats->tx_packets,
- stats->tx_errors, stats->tx_dropped,
- stats->tx_fifo_errors, stats->collisions,
- stats->tx_carrier_errors +
- stats->tx_aborted_errors +
- stats->tx_window_errors +
- stats->tx_heartbeat_errors,
- stats->tx_compressed);
- } else
- seq_printf(seq, "%6s: No statistics available.\n", dev->name);
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (adj) {
+ adj->ref_nr++;
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->ref_nr = 1;
+ adj->private = private;
+ dev_hold(adj_dev);
+
+ pr_debug("dev_hold for %s, because of link added from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+
+ if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
+ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
+ if (ret)
+ goto free_adj;
+ }
+
+ /* Ensure that master link is always the first item in list. */
+ if (master) {
+ ret = sysfs_create_link(&(dev->dev.kobj),
+ &(adj_dev->dev.kobj), "master");
+ if (ret)
+ goto remove_symlinks;
+
+ list_add_rcu(&adj->list, dev_list);
+ } else {
+ list_add_tail_rcu(&adj->list, dev_list);
+ }
+
+ return 0;
+
+remove_symlinks:
+ if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+free_adj:
+ kfree(adj);
+ dev_put(adj_dev);
+
+ return ret;
}
-/*
- * Called from the PROCfs module. This now uses the new arbitrary sized
- * /proc/net interface to create /proc/net/dev
- */
-static int dev_seq_show(struct seq_file *seq, void *v)
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
{
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Inter-| Receive "
- " | Transmit\n"
- " face |bytes packets errs drop fifo frame "
- "compressed multicast|bytes packets errs "
- "drop fifo colls carrier compressed\n");
- else
- dev_seq_printf_stats(seq, v);
- return 0;
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (!adj) {
+ pr_err("tried to remove device %s from %s\n",
+ dev->name, adj_dev->name);
+ BUG();
+ }
+
+ if (adj->ref_nr > 1) {
+ pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
+ adj->ref_nr-1);
+ adj->ref_nr--;
+ return;
+ }
+
+ if (adj->master)
+ sysfs_remove_link(&(dev->dev.kobj), "master");
+
+ if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+
+ list_del_rcu(&adj->list);
+ pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+ dev_put(adj_dev);
+ kfree_rcu(adj, rcu);
}
-static struct netif_rx_stats *softnet_get_online(loff_t *pos)
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list,
+ void *private, bool master)
{
- struct netif_rx_stats *rc = NULL;
+ int ret;
- while (*pos < NR_CPUS)
- if (cpu_online(*pos)) {
- rc = &per_cpu(netdev_rx_stat, *pos);
- break;
- } else
- ++*pos;
- return rc;
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
+ master);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
+ false);
+ if (ret) {
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ return ret;
+ }
+
+ return 0;
}
-static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+static int __netdev_adjacent_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
{
- return softnet_get_online(pos);
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower,
+ NULL, false);
}
-static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list)
{
- ++*pos;
- return softnet_get_online(pos);
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
-static void softnet_seq_stop(struct seq_file *seq, void *v)
+static void __netdev_adjacent_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
{
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower);
}
-static int softnet_seq_show(struct seq_file *seq, void *v)
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private, bool master)
{
- struct netif_rx_stats *s = v;
+ int ret = __netdev_adjacent_dev_link(dev, upper_dev);
+
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
+ if (ret) {
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ return ret;
+ }
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- s->total, s->dropped, s->time_squeeze, 0,
- 0, 0, 0, 0, /* was fastroute */
- s->cpu_collision );
return 0;
}
-static struct seq_operations dev_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = dev_seq_show,
-};
-
-static int dev_seq_open(struct inode *inode, struct file *file)
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+ struct net_device *upper_dev)
{
- return seq_open(file, &dev_seq_ops);
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower);
}
-static struct file_operations dev_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+static int __netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev, bool master,
+ void *private)
+{
+ struct netdev_adjacent *i, *j, *to_i, *to_j;
+ int ret = 0;
-static struct seq_operations softnet_seq_ops = {
- .start = softnet_seq_start,
- .next = softnet_seq_next,
- .stop = softnet_seq_stop,
- .show = softnet_seq_show,
-};
+ ASSERT_RTNL();
-static int softnet_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &softnet_seq_ops);
-}
+ if (dev == upper_dev)
+ return -EBUSY;
-static struct file_operations softnet_seq_fops = {
- .owner = THIS_MODULE,
- .open = softnet_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+ /* To prevent loops, check if dev is not upper device to upper_dev. */
+ if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
+ return -EBUSY;
-#ifdef WIRELESS_EXT
-extern int wireless_proc_init(void);
-#else
-#define wireless_proc_init() 0
-#endif
+ if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
+ return -EEXIST;
+
+ if (master && netdev_master_upper_dev_get(dev))
+ return -EBUSY;
+
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+ master);
+ if (ret)
+ return ret;
+
+ /* Now that we linked these devs, make all the upper_dev's
+ * all_adj_list.upper visible to every dev's all_adj_list.lower an
+ * versa, and don't forget the devices itself. All of these
+ * links are non-neighbours.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("Interlinking %s with %s, non-neighbour\n",
+ i->dev->name, j->dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ if (ret)
+ goto rollback_mesh;
+ }
+ }
+
+ /* add dev to every upper_dev's upper device */
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("linking %s's upper device %s with %s\n",
+ upper_dev->name, i->dev->name, dev->name);
+ ret = __netdev_adjacent_dev_link(dev, i->dev);
+ if (ret)
+ goto rollback_upper_mesh;
+ }
+
+ /* add upper_dev to every dev's lower device */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ pr_debug("linking %s's lower device %s with %s\n", dev->name,
+ i->dev->name, upper_dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ if (ret)
+ goto rollback_lower_mesh;
+ }
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+ return 0;
+
+rollback_lower_mesh:
+ to_i = i;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ }
+
+ i = NULL;
+
+rollback_upper_mesh:
+ to_i = i;
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+ }
+
+ i = j = NULL;
+
+rollback_mesh:
+ to_i = i;
+ to_j = j;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i && j == to_j)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ }
+ if (i == to_i)
+ break;
+ }
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ return ret;
+}
-static int __init dev_proc_init(void)
+/**
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
{
- int rc = -ENOMEM;
+ return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
+}
+EXPORT_SYMBOL(netdev_upper_dev_link);
- if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
- goto out;
- if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
- goto out_dev;
- if (wireless_proc_init())
- goto out_softnet;
- rc = 0;
-out:
- return rc;
-out_softnet:
- proc_net_remove("softnet_stat");
-out_dev:
- proc_net_remove("dev");
- goto out;
+/**
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
+ */
+int netdev_master_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
-#else
-#define dev_proc_init() 0
-#endif /* CONFIG_PROC_FS */
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
+int netdev_master_upper_dev_link_private(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, private);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
/**
- * netdev_set_master - set up master/slave pair
- * @slave: slave device
- * @master: new master device
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
*
- * Changes the master device of the slave. Pass %NULL to break the
- * bonding. The caller must hold the RTNL semaphore. On a failure
- * a negative errno code is returned. On success the reference counts
- * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
- * function returns zero.
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
*/
-int netdev_set_master(struct net_device *slave, struct net_device *master)
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_adjacent *i, *j;
+ ASSERT_RTNL();
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ /* Here is the tricky part. We must remove all dev's lower
+ * devices from all upper_dev's upper devices and vice
+ * versa, to maintain the graph relationship.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+ /* remove also the devices itself from lower/upper device
+ * list
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+}
+EXPORT_SYMBOL(netdev_upper_dev_unlink);
+
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
+{
+ struct netdev_adjacent *iter;
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ }
+}
+
+void *netdev_lower_dev_get_private(struct net_device *dev,
+ struct net_device *lower_dev)
+{
+ struct netdev_adjacent *lower;
+
+ if (!lower_dev)
+ return NULL;
+ lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+ if (!lower)
+ return NULL;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
+
+int dev_get_nest_level(struct net_device *dev,
+ bool (*type_check)(struct net_device *dev))
{
- struct net_device *old = slave->master;
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
ASSERT_RTNL();
- if (master) {
- if (old)
- return -EBUSY;
- dev_hold(master);
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower, type_check);
+ if (max_nest < nest)
+ max_nest = nest;
}
- slave->master = master;
-
- synchronize_net();
+ if (type_check(dev))
+ max_nest++;
- if (old)
- dev_put(old);
+ return max_nest;
+}
+EXPORT_SYMBOL(dev_get_nest_level);
- if (master)
- slave->flags |= IFF_SLAVE;
- else
- slave->flags &= ~IFF_SLAVE;
+static void dev_change_rx_flags(struct net_device *dev, int flags)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
- rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
+ if (ops->ndo_change_rx_flags)
+ ops->ndo_change_rx_flags(dev, flags);
+}
+
+static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags;
+ kuid_t uid;
+ kgid_t gid;
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_PROMISC;
+ dev->promiscuity += inc;
+ if (dev->promiscuity == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes overflow, untouch promisc and return error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_PROMISC;
+ else {
+ dev->promiscuity -= inc;
+ pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
+ dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags != old_flags) {
+ pr_info("device %s %s promiscuous mode\n",
+ dev->name,
+ dev->flags & IFF_PROMISC ? "entered" : "left");
+ if (audit_enabled) {
+ current_uid_gid(&uid, &gid);
+ audit_log(current->audit_context, GFP_ATOMIC,
+ AUDIT_ANOM_PROMISCUOUS,
+ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
+ dev->name, (dev->flags & IFF_PROMISC),
+ (old_flags & IFF_PROMISC),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ audit_get_sessionid(current));
+ }
+
+ dev_change_rx_flags(dev, IFF_PROMISC);
+ }
+ if (notify)
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC);
return 0;
}
@@ -2102,25 +5264,56 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
* @dev: device
* @inc: modifier
*
- * Add or remove promsicuity from a device. While the count in the device
+ * Add or remove promiscuity from a device. While the count in the device
* remains above zero the interface remains promiscuous. Once it hits zero
* the device reverts back to normal filtering operation. A negative inc
* value is used to drop promiscuity on the device.
+ * Return 0 if successful or a negative errno code on error.
*/
-void dev_set_promiscuity(struct net_device *dev, int inc)
+int dev_set_promiscuity(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
+ int err;
- if ((dev->promiscuity += inc) == 0)
- dev->flags &= ~IFF_PROMISC;
- else
- dev->flags |= IFF_PROMISC;
- if (dev->flags != old_flags) {
- dev_mc_upload(dev);
- printk(KERN_INFO "device %s %s promiscuous mode\n",
- dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
- "left");
+ err = __dev_set_promiscuity(dev, inc, true);
+ if (err < 0)
+ return err;
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_ALLMULTI;
+ dev->allmulti += inc;
+ if (dev->allmulti == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes overflow, untouch allmulti and return error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_ALLMULTI;
+ else {
+ dev->allmulti -= inc;
+ pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
+ dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags ^ old_flags) {
+ dev_change_rx_flags(dev, IFF_ALLMULTI);
+ dev_set_rx_mode(dev);
+ if (notify)
+ __dev_notify_flags(dev, old_flags,
+ dev->gflags ^ old_gflags);
}
+ return 0;
}
/**
@@ -2133,39 +5326,93 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
* to all interfaces. Once it hits zero the device reverts back to normal
* filtering operation. A negative @inc value is used to drop the counter
* when releasing a resource needing all multicasts.
+ * Return 0 if successful or a negative errno code on error.
*/
-void dev_set_allmulti(struct net_device *dev, int inc)
+int dev_set_allmulti(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ return __dev_set_allmulti(dev, inc, true);
+}
+EXPORT_SYMBOL(dev_set_allmulti);
- dev->flags |= IFF_ALLMULTI;
- if ((dev->allmulti += inc) == 0)
- dev->flags &= ~IFF_ALLMULTI;
- if (dev->flags ^ old_flags)
- dev_mc_upload(dev);
+/*
+ * Upload unicast and multicast address lists to device and
+ * configure RX filtering. When the device doesn't support unicast
+ * filtering it is put in promiscuous mode while unicast addresses
+ * are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
+ /* Unicast addresses changes may only happen under the rtnl,
+ * therefore calling __dev_set_promiscuity here is safe.
+ */
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ __dev_set_promiscuity(dev, 1, false);
+ dev->uc_promisc = true;
+ } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ __dev_set_promiscuity(dev, -1, false);
+ dev->uc_promisc = false;
+ }
+ }
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
}
-unsigned dev_get_flags(const struct net_device *dev)
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+/**
+ * dev_get_flags - get flags reported to userspace
+ * @dev: device
+ *
+ * Get the combination of flag bits exported through APIs to userspace.
+ */
+unsigned int dev_get_flags(const struct net_device *dev)
{
- unsigned flags;
+ unsigned int flags;
flags = (dev->flags & ~(IFF_PROMISC |
IFF_ALLMULTI |
- IFF_RUNNING)) |
+ IFF_RUNNING |
+ IFF_LOWER_UP |
+ IFF_DORMANT)) |
(dev->gflags & (IFF_PROMISC |
IFF_ALLMULTI));
- if (netif_running(dev) && netif_carrier_ok(dev))
- flags |= IFF_RUNNING;
+ if (netif_running(dev)) {
+ if (netif_oper_up(dev))
+ flags |= IFF_RUNNING;
+ if (netif_carrier_ok(dev))
+ flags |= IFF_LOWER_UP;
+ if (netif_dormant(dev))
+ flags |= IFF_DORMANT;
+ }
return flags;
}
+EXPORT_SYMBOL(dev_get_flags);
-int dev_change_flags(struct net_device *dev, unsigned flags)
+int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
+ unsigned int old_flags = dev->flags;
int ret;
- int old_flags = dev->flags;
+
+ ASSERT_RTNL();
/*
* Set the flags on our device.
@@ -2181,7 +5428,10 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
* Load in the correct multicast list now the flags have changed.
*/
- dev_mc_upload(dev);
+ if ((old_flags ^ flags) & IFF_MULTICAST)
+ dev_change_rx_flags(dev, IFF_MULTICAST);
+
+ dev_set_rx_mode(dev);
/*
* Have we downed the interface. We handle IFF_UP ourselves
@@ -2191,21 +5441,21 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
ret = 0;
if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
- ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
+ ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
if (!ret)
- dev_mc_upload(dev);
+ dev_set_rx_mode(dev);
}
- if (dev->flags & IFF_UP &&
- ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
- IFF_VOLATILE)))
- notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
-
if ((flags ^ dev->gflags) & IFF_PROMISC) {
- int inc = (flags & IFF_PROMISC) ? +1 : -1;
+ int inc = (flags & IFF_PROMISC) ? 1 : -1;
+ unsigned int old_flags = dev->flags;
+
dev->gflags ^= IFF_PROMISC;
- dev_set_promiscuity(dev, inc);
+
+ if (__dev_set_promiscuity(dev, inc, false) >= 0)
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
@@ -2213,20 +5463,84 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
- int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
+ int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
+
dev->gflags ^= IFF_ALLMULTI;
- dev_set_allmulti(dev, inc);
+ __dev_set_allmulti(dev, inc, false);
+ }
+
+ return ret;
+}
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges)
+{
+ unsigned int changes = dev->flags ^ old_flags;
+
+ if (gchanges)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+
+ if (changes & IFF_UP) {
+ if (dev->flags & IFF_UP)
+ call_netdevice_notifiers(NETDEV_UP, dev);
+ else
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ }
+
+ if (dev->flags & IFF_UP &&
+ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = changes;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
}
+}
+
+/**
+ * dev_change_flags - change device settings
+ * @dev: device
+ * @flags: device state flags
+ *
+ * Change settings on device based state flags. The flags are
+ * in the userspace exported format.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags)
+{
+ int ret;
+ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
- if (old_flags ^ dev->flags)
- rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
+ ret = __dev_change_flags(dev, flags);
+ if (ret < 0)
+ return ret;
+ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
+ __dev_notify_flags(dev, old_flags, changes);
return ret;
}
+EXPORT_SYMBOL(dev_change_flags);
+
+static int __dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+/**
+ * dev_set_mtu - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ *
+ * Change the maximum transfer size of the network device.
+ */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
- int err;
+ int err, orig_mtu;
if (new_mtu == dev->mtu)
return 0;
@@ -2238,405 +5552,444 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (!netif_device_present(dev))
return -ENODEV;
- err = 0;
- if (dev->change_mtu)
- err = dev->change_mtu(dev, new_mtu);
- else
- dev->mtu = new_mtu;
- if (!err && dev->flags & IFF_UP)
- notifier_call_chain(&netdev_chain,
- NETDEV_CHANGEMTU, dev);
+ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ orig_mtu = dev->mtu;
+ err = __dev_set_mtu(dev, new_mtu);
+
+ if (!err) {
+ err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ /* setting mtu back and notifying everyone again,
+ * so that they have a chance to revert changes.
+ */
+ __dev_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ }
+ }
return err;
}
+EXPORT_SYMBOL(dev_set_mtu);
+/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
+ * dev_set_mac_address - Change Media Access Control Address
+ * @dev: device
+ * @sa: new address
+ *
+ * Change the hardware (MAC) address of the device
+ */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
+ const struct net_device_ops *ops = dev->netdev_ops;
int err;
- if (!dev->set_mac_address)
+ if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
if (sa->sa_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = dev->set_mac_address(dev, sa);
- if (!err)
- notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
- return err;
+ err = ops->ndo_set_mac_address(dev, sa);
+ if (err)
+ return err;
+ dev->addr_assign_type = NET_ADDR_SET;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ return 0;
}
+EXPORT_SYMBOL(dev_set_mac_address);
-/*
- * Perform the SIOCxIFxxx calls.
+/**
+ * dev_change_carrier - Change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
*/
-static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
- int err;
- struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
+ const struct net_device_ops *ops = dev->netdev_ops;
- if (!dev)
+ if (!ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
return -ENODEV;
+ return ops->ndo_change_carrier(dev, new_carrier);
+}
+EXPORT_SYMBOL(dev_change_carrier);
- switch (cmd) {
- case SIOCGIFFLAGS: /* Get interface flags */
- ifr->ifr_flags = dev_get_flags(dev);
- return 0;
+/**
+ * dev_get_phys_port_id - Get device physical port ID
+ * @dev: device
+ * @ppid: port ID
+ *
+ * Get device physical port ID
+ */
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_port_id *ppid)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
- case SIOCSIFFLAGS: /* Set interface flags */
- return dev_change_flags(dev, ifr->ifr_flags);
+ if (!ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_id(dev, ppid);
+}
+EXPORT_SYMBOL(dev_get_phys_port_id);
- case SIOCGIFMETRIC: /* Get the metric on the interface
- (currently unused) */
- ifr->ifr_metric = 0;
- return 0;
+/**
+ * dev_new_index - allocate an ifindex
+ * @net: the applicable net namespace
+ *
+ * Returns a suitable unique value for a new device interface
+ * number. The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
+ */
+static int dev_new_index(struct net *net)
+{
+ int ifindex = net->ifindex;
+ for (;;) {
+ if (++ifindex <= 0)
+ ifindex = 1;
+ if (!__dev_get_by_index(net, ifindex))
+ return net->ifindex = ifindex;
+ }
+}
- case SIOCSIFMETRIC: /* Set the metric on the interface
- (currently unused) */
- return -EOPNOTSUPP;
+/* Delayed registration/unregisteration */
+static LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
- case SIOCGIFMTU: /* Get the MTU of a device */
- ifr->ifr_mtu = dev->mtu;
- return 0;
+static void net_set_todo(struct net_device *dev)
+{
+ list_add_tail(&dev->todo_list, &net_todo_list);
+ dev_net(dev)->dev_unreg_count++;
+}
- case SIOCSIFMTU: /* Set the MTU of a device */
- return dev_set_mtu(dev, ifr->ifr_mtu);
+static void rollback_registered_many(struct list_head *head)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
- case SIOCSIFHWADDR:
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
- case SIOCSIFHWBROADCAST:
- if (ifr->ifr_hwaddr.sa_family != dev->type)
- return -EINVAL;
- memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- notifier_call_chain(&netdev_chain,
- NETDEV_CHANGEADDR, dev);
- return 0;
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
- case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head);
- case SIOCSIFMAP:
- if (dev->set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev->set_config(dev, &ifr->ifr_map);
- }
- return -EOPNOTSUPP;
-
- case SIOCADDMULTI:
- if (!dev->set_multicast_list ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
- dev->addr_len, 1);
-
- case SIOCDELMULTI:
- if (!dev->set_multicast_list ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
- dev->addr_len, 1);
-
- case SIOCGIFINDEX:
- ifr->ifr_ifindex = dev->ifindex;
- return 0;
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
- case SIOCGIFTXQLEN:
- ifr->ifr_qlen = dev->tx_queue_len;
- return 0;
+ dev->reg_state = NETREG_UNREGISTERING;
+ }
- case SIOCSIFTXQLEN:
- if (ifr->ifr_qlen < 0)
- return -EINVAL;
- dev->tx_queue_len = ifr->ifr_qlen;
- return 0;
+ synchronize_net();
- case SIOCSIFNAME:
- ifr->ifr_newname[IFNAMSIZ-1] = '\0';
- return dev_change_name(dev, ifr->ifr_newname);
+ list_for_each_entry(dev, head, unreg_list) {
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+
+ /* Notify protocols, that we are about to destroy
+ this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
/*
- * Unknown or private ioctl
+ * Flush the unicast and multicast chains
*/
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
- default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
- cmd == SIOCBONDRELEASE ||
- cmd == SIOCBONDSETHWADDR ||
- cmd == SIOCBONDSLAVEINFOQUERY ||
- cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCWANDEV) {
- err = -EOPNOTSUPP;
- if (dev->do_ioctl) {
- if (netif_device_present(dev))
- err = dev->do_ioctl(dev, ifr,
- cmd);
- else
- err = -ENODEV;
- }
- } else
- err = -EINVAL;
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
}
- return err;
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list)
+ dev_put(dev);
}
-/*
- * This function handles all "interface"-type I/O control requests. The actual
- * 'doing' part of this is dev_ifsioc above.
- */
+static void rollback_registered(struct net_device *dev)
+{
+ LIST_HEAD(single);
-/**
- * dev_ioctl - network device ioctl
- * @cmd: command to issue
- * @arg: pointer to a struct ifreq in user space
- *
- * Issue ioctl functions to devices. This is normally called by the
- * user space syscall interfaces but can sometimes be useful for
- * other purposes. The return value is the return from the syscall if
- * positive or a negative errno code on error.
- */
+ list_add(&dev->unreg_list, &single);
+ rollback_registered_many(&single);
+ list_del(&single);
+}
-int dev_ioctl(unsigned int cmd, void __user *arg)
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
- struct ifreq ifr;
- int ret;
- char *colon;
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_warn(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
- /* One special case: SIOCGIFCONF takes ifconf argument
- and requires shared lock, because it sleeps writing
- to user space.
- */
+ /* TSO requires that SG is present as well. */
+ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
+ features &= ~NETIF_F_ALL_TSO;
+ }
- if (cmd == SIOCGIFCONF) {
- rtnl_shlock();
- ret = dev_ifconf((char __user *) arg);
- rtnl_shunlock();
- return ret;
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
}
- if (cmd == SIOCGIFNAME)
- return dev_ifname((struct ifreq __user *)arg);
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
+ }
- ifr.ifr_name[IFNAMSIZ-1] = 0;
+ /* TSO ECN requires that TSO is present as well. */
+ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+ features &= ~NETIF_F_TSO_ECN;
- colon = strchr(ifr.ifr_name, ':');
- if (colon)
- *colon = 0;
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
- /*
- * See which interface the caller is talking about.
- */
+ /* UFO needs SG and checksumming */
+ if (features & NETIF_F_UFO) {
+ /* maybe split UFO into V4 and V6? */
+ if (!((features & NETIF_F_GEN_CSUM) ||
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
+ == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_dbg(dev,
+ "Dropping NETIF_F_UFO since no checksum offload features.\n");
+ features &= ~NETIF_F_UFO;
+ }
- switch (cmd) {
- /*
- * These ioctl calls:
- * - can be done by all.
- * - atomic and do not require locking.
- * - return a value
- */
- case SIOCGIFFLAGS:
- case SIOCGIFMETRIC:
- case SIOCGIFMTU:
- case SIOCGIFHWADDR:
- case SIOCGIFSLAVE:
- case SIOCGIFMAP:
- case SIOCGIFINDEX:
- case SIOCGIFTXQLEN:
- dev_load(ifr.ifr_name);
- read_lock(&dev_base_lock);
- ret = dev_ifsioc(&ifr, cmd);
- read_unlock(&dev_base_lock);
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
+ if (!(features & NETIF_F_SG)) {
+ netdev_dbg(dev,
+ "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
+ features &= ~NETIF_F_UFO;
+ }
+ }
- case SIOCETHTOOL:
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ethtool(&ifr);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (dev->netdev_ops->ndo_busy_poll)
+ features |= NETIF_F_BUSY_POLL;
+ else
+#endif
+ features &= ~NETIF_F_BUSY_POLL;
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - return a value
- */
- case SIOCGMIIPHY:
- case SIOCGMIIREG:
- case SIOCSIFNAME:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(&ifr, cmd);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
+ return features;
+}
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - do not return a value
- */
- case SIOCSIFFLAGS:
- case SIOCSIFMETRIC:
- case SIOCSIFMTU:
- case SIOCSIFMAP:
- case SIOCSIFHWADDR:
- case SIOCSIFSLAVE:
- case SIOCADDMULTI:
- case SIOCDELMULTI:
- case SIOCSIFHWBROADCAST:
- case SIOCSIFTXQLEN:
- case SIOCSMIIREG:
- case SIOCBONDENSLAVE:
- case SIOCBONDRELEASE:
- case SIOCBONDSETHWADDR:
- case SIOCBONDSLAVEINFOQUERY:
- case SIOCBONDINFOQUERY:
- case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(&ifr, cmd);
- rtnl_unlock();
- return ret;
+int __netdev_update_features(struct net_device *dev)
+{
+ netdev_features_t features;
+ int err = 0;
- case SIOCGIFMEM:
- /* Get the per device memory space. We can add this but
- * currently do not support it */
- case SIOCSIFMEM:
- /* Set the per device memory buffer space.
- * Not applicable in our case */
- case SIOCSIFLINK:
- return -EINVAL;
+ ASSERT_RTNL();
- /*
- * Unknown or private ioctl.
- */
- default:
- if (cmd == SIOCWANDEV ||
- (cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15)) {
- dev_load(ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(&ifr, cmd);
- rtnl_unlock();
- if (!ret && copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- return ret;
- }
-#ifdef WIRELESS_EXT
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
- /* If command is `set a parameter', or
- * `get the encoding parameters', check if
- * the user has the right to do it */
- if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- }
- dev_load(ifr.ifr_name);
- rtnl_lock();
- /* Follow me in net/core/wireless.c */
- ret = wireless_process_ioctl(&ifr, cmd);
- rtnl_unlock();
- if (IW_IS_GET(cmd) &&
- copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- return ret;
- }
-#endif /* WIRELESS_EXT */
- return -EINVAL;
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ if (dev->features == features)
+ return 0;
+
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+
+ if (unlikely(err < 0)) {
+ netdev_err(dev,
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
+ return -1;
}
+
+ if (!err)
+ dev->features = features;
+
+ return 1;
}
+/**
+ * netdev_update_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications if it
+ * has changed. Should be called after driver or hardware dependent
+ * conditions might have changed that influence the features.
+ */
+void netdev_update_features(struct net_device *dev)
+{
+ if (__netdev_update_features(dev))
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_update_features);
/**
- * dev_new_index - allocate an ifindex
+ * netdev_change_features - recalculate device features
+ * @dev: the device to check
*
- * Returns a suitable unique value for a new device interface
- * number. The caller must hold the rtnl semaphore or the
- * dev_base_lock to be sure it remains unique.
+ * Recalculate dev->features set and send notifications even
+ * if they have not changed. Should be called instead of
+ * netdev_update_features() if also dev->vlan_features might
+ * have changed to allow the changes to be propagated to stacked
+ * VLAN devices.
*/
-static int dev_new_index(void)
+void netdev_change_features(struct net_device *dev)
{
- static int ifindex;
- for (;;) {
- if (++ifindex <= 0)
- ifindex = 1;
- if (!__dev_get_by_index(ifindex))
- return ifindex;
+ __netdev_update_features(dev);
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_change_features);
+
+/**
+ * netif_stacked_transfer_operstate - transfer operstate
+ * @rootdev: the root or lower level device to transfer state from
+ * @dev: the device to transfer operstate to
+ *
+ * Transfer operational state from root to device. This is normally
+ * called when a stacking relationship exists between the root
+ * device and the device(a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev)
+{
+ if (rootdev->operstate == IF_OPER_DORMANT)
+ netif_dormant_on(dev);
+ else
+ netif_dormant_off(dev);
+
+ if (netif_carrier_ok(rootdev)) {
+ if (!netif_carrier_ok(dev))
+ netif_carrier_on(dev);
+ } else {
+ if (netif_carrier_ok(dev))
+ netif_carrier_off(dev);
}
}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
-static int dev_boot_phase = 1;
+#ifdef CONFIG_SYSFS
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+ unsigned int i, count = dev->num_rx_queues;
+ struct netdev_rx_queue *rx;
-/* Delayed registration/unregisteration */
-static DEFINE_SPINLOCK(net_todo_list_lock);
-static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
+ BUG_ON(count < 1);
+
+ rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+ if (!rx)
+ return -ENOMEM;
+
+ dev->_rx = rx;
-static inline void net_set_todo(struct net_device *dev)
+ for (i = 0; i < count; i++)
+ rx[i].dev = dev;
+ return 0;
+}
+#endif
+
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue, void *_unused)
{
- spin_lock(&net_todo_list_lock);
- list_add_tail(&dev->todo_list, &net_todo_list);
- spin_unlock(&net_todo_list_lock);
+ /* Initialize queue lock */
+ spin_lock_init(&queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ queue->xmit_lock_owner = -1;
+ netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
+ queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
+}
+
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ kvfree(dev->_tx);
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+ unsigned int count = dev->num_tx_queues;
+ struct netdev_queue *tx;
+ size_t sz = count * sizeof(*tx);
+
+ BUG_ON(count < 1 || count > 0xffff);
+
+ tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!tx) {
+ tx = vzalloc(sz);
+ if (!tx)
+ return -ENOMEM;
+ }
+ dev->_tx = tx;
+
+ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ spin_lock_init(&dev->tx_global_lock);
+
+ return 0;
}
/**
@@ -2658,98 +6011,89 @@ static inline void net_set_todo(struct net_device *dev)
int register_netdevice(struct net_device *dev)
{
- struct hlist_head *head;
- struct hlist_node *p;
int ret;
+ struct net *net = dev_net(dev);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
+ might_sleep();
+
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+ BUG_ON(!net);
- spin_lock_init(&dev->queue_lock);
- spin_lock_init(&dev->xmit_lock);
- dev->xmit_lock_owner = -1;
-#ifdef CONFIG_NET_CLS_ACT
- spin_lock_init(&dev->ingress_lock);
-#endif
-
- ret = alloc_divert_blk(dev);
- if (ret)
- goto out;
+ spin_lock_init(&dev->addr_list_lock);
+ netdev_set_addr_lockdep_class(dev);
dev->iflink = -1;
+ ret = dev_get_valid_name(net, dev, dev->name);
+ if (ret < 0)
+ goto out;
+
/* Init, if this function is available */
- if (dev->init) {
- ret = dev->init(dev);
+ if (dev->netdev_ops->ndo_init) {
+ ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
- goto out_err;
+ goto out;
}
}
-
- if (!dev_valid_name(dev->name)) {
+
+ if (((dev->hw_features | dev->features) &
+ NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
+ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
+ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL;
- goto out_err;
+ goto err_uninit;
}
- dev->ifindex = dev_new_index();
+ ret = -EBUSY;
+ if (!dev->ifindex)
+ dev->ifindex = dev_new_index(net);
+ else if (__dev_get_by_index(net, dev->ifindex))
+ goto err_uninit;
+
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
- /* Check for existence of name */
- head = dev_name_hash(dev->name);
- hlist_for_each(p, head) {
- struct net_device *d
- = hlist_entry(p, struct net_device, name_hlist);
- if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
- ret = -EEXIST;
- goto out_err;
- }
- }
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->features |= NETIF_F_SOFT_FEATURES;
+ dev->wanted_features = dev->features & dev->hw_features;
- /* Fix illegal SG+CSUM combinations. */
- if ((dev->features & NETIF_F_SG) &&
- !(dev->features & (NETIF_F_IP_CSUM |
- NETIF_F_NO_CSUM |
- NETIF_F_HW_CSUM))) {
- printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_SG;
+ if (!(dev->flags & IFF_LOOPBACK)) {
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
}
- /* TSO requires that SG is present as well. */
- if ((dev->features & NETIF_F_TSO) &&
- !(dev->features & NETIF_F_SG)) {
- printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_TSO;
- }
- if (dev->features & NETIF_F_UFO) {
- if (!(dev->features & NETIF_F_HW_CSUM)) {
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
- "NETIF_F_HW_CSUM feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_UFO;
- }
- if (!(dev->features & NETIF_F_SG)) {
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
- "NETIF_F_SG feature.\n",
- dev->name);
- dev->features &= ~NETIF_F_UFO;
- }
- }
+ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
+ */
+ dev->vlan_features |= NETIF_F_HIGHDMA;
- /*
- * nil rebuild_header routine,
- * that should be never called and used as just bug trap.
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG;
+
+ /* Make NETIF_F_SG inheritable to MPLS.
*/
+ dev->mpls_features |= NETIF_F_SG;
+
+ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto err_uninit;
+
+ ret = netdev_register_kobject(dev);
+ if (ret)
+ goto err_uninit;
+ dev->reg_state = NETREG_REGISTERED;
- if (!dev->rebuild_header)
- dev->rebuild_header = default_rebuild_header;
+ __netdev_update_features(dev);
/*
* Default initial state at registry is that the
@@ -2758,30 +6102,85 @@ int register_netdevice(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
- dev->next = NULL;
+ linkwatch_init_dev(dev);
+
dev_init_scheduler(dev);
- write_lock_bh(&dev_base_lock);
- *dev_tail = dev;
- dev_tail = &dev->next;
- hlist_add_head(&dev->name_hlist, head);
- hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
dev_hold(dev);
- dev->reg_state = NETREG_REGISTERING;
- write_unlock_bh(&dev_base_lock);
+ list_netdevice(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
- /* Notify protocols, that a new device appeared. */
- notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
+ /* If the device has permanent device address, driver should
+ * set dev_addr and also addr_assign_type should be set to
+ * NET_ADDR_PERM (default value).
+ */
+ if (dev->addr_assign_type == NET_ADDR_PERM)
+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
- /* Finish registration after unlock */
- net_set_todo(dev);
- ret = 0;
+ /* Notify protocols, that a new device appeared. */
+ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ rollback_registered(dev);
+ dev->reg_state = NETREG_UNREGISTERED;
+ }
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
-out_err:
- free_divert_blk(dev);
+
+err_uninit:
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
goto out;
}
+EXPORT_SYMBOL(register_netdevice);
+
+/**
+ * init_dummy_netdev - init a dummy network device for NAPI
+ * @dev: device to init
+ *
+ * This takes a network device structure and initialize the minimum
+ * amount of fields so it can be used to schedule NAPI polls without
+ * registering a full blown interface. This is to be used by drivers
+ * that need to tie several hardware interfaces to a single NAPI
+ * poll scheduler due to HW limitations.
+ */
+int init_dummy_netdev(struct net_device *dev)
+{
+ /* Clear everything. Note we don't initialize spinlocks
+ * are they aren't supposed to be taken by any of the
+ * NAPI code and this dummy netdev is supposed to be
+ * only ever used for NAPI polls
+ */
+ memset(dev, 0, sizeof(struct net_device));
+
+ /* make sure we BUG if trying to hit standard
+ * register/unregister code path
+ */
+ dev->reg_state = NETREG_DUMMY;
+
+ /* NAPI wants this */
+ INIT_LIST_HEAD(&dev->napi_list);
+
+ /* a dummy interface is started by default */
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ /* Note : We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount.
+ */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(init_dummy_netdev);
+
/**
* register_netdev - register a network device
@@ -2792,7 +6191,7 @@ out_err:
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
- * This is a wrapper around register_netdev that takes the rtnl semaphore
+ * This is a wrapper around register_netdevice that takes the rtnl semaphore
* and expands the device name if you passed a format string to
* alloc_netdev.
*/
@@ -2801,35 +6200,25 @@ int register_netdev(struct net_device *dev)
int err;
rtnl_lock();
-
- /*
- * If the name is a format string the caller wants us to do a
- * name allocation.
- */
- if (strchr(dev->name, '%')) {
- err = dev_alloc_name(dev, dev->name);
- if (err < 0)
- goto out;
- }
-
- /*
- * Back compatibility hook. Kill this one in 2.5
- */
- if (dev->name[0] == 0 || dev->name[0] == ' ') {
- err = dev_alloc_name(dev, "eth%d");
- if (err < 0)
- goto out;
- }
-
err = register_netdevice(dev);
-out:
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(register_netdev);
-/*
+int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
+/**
* netdev_wait_allrefs - wait until all references are gone.
+ * @dev: target net_device
*
* This is called when unregistering network devices.
*
@@ -2837,21 +6226,30 @@ EXPORT_SYMBOL(register_netdev);
* for netdevice notification, and cleanup and put back the
* reference if they receive an UNREGISTER event.
* We can get stuck here if buggy protocols don't correctly
- * call dev_put.
+ * call dev_put.
*/
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
+ int refcnt;
+
+ linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 0) {
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
- rtnl_shlock();
+ rtnl_lock();
/* Rebroadcast unregister notification */
- notifier_call_chain(&netdev_chain,
- NETDEV_UNREGISTER, dev);
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ __rtnl_unlock();
+ rcu_barrier();
+ rtnl_lock();
+
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) {
/* We must not have linkwatch events
@@ -2863,18 +6261,18 @@ static void netdev_wait_allrefs(struct net_device *dev)
linkwatch_run_queue();
}
- rtnl_shunlock();
+ __rtnl_unlock();
rebroadcast_time = jiffies;
}
msleep(250);
+ refcnt = netdev_refcnt_read(dev);
+
if (time_after(jiffies, warning_time + 10 * HZ)) {
- printk(KERN_EMERG "unregister_netdevice: "
- "waiting for %s to become free. Usage "
- "count = %d\n",
- dev->name, atomic_read(&dev->refcnt));
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -2894,274 +6292,522 @@ static void netdev_wait_allrefs(struct net_device *dev)
* free_netdev(y1);
* free_netdev(y2);
*
- * We are invoked by rtnl_unlock() after it drops the semaphore.
+ * We are invoked by rtnl_unlock().
* This allows us to deal with problems:
- * 1) We can create/delete sysfs objects which invoke hotplug
+ * 1) We can delete sysfs objects which invoke hotplug
* without deadlocking with linkwatch via keventd.
* 2) Since we run with the RTNL semaphore not held, we can sleep
* safely in order to wait for the netdev refcnt to drop to zero.
+ *
+ * We must not return until all unregister events added during
+ * the interval the lock was held have been completed.
*/
-static DECLARE_MUTEX(net_todo_run_mutex);
void netdev_run_todo(void)
{
- struct list_head list = LIST_HEAD_INIT(list);
- int err;
+ struct list_head list;
+ /* Snapshot list, allow later requests */
+ list_replace_init(&net_todo_list, &list);
- /* Need to guard against multiple cpu's getting out of order. */
- down(&net_todo_run_mutex);
+ __rtnl_unlock();
- /* Not safe to do outside the semaphore. We must not return
- * until all unregister events invoked by the local processor
- * have been completed (either by this todo run, or one on
- * another cpu).
- */
- if (list_empty(&net_todo_list))
- goto out;
- /* Snapshot list, allow later requests */
- spin_lock(&net_todo_list_lock);
- list_splice_init(&net_todo_list, &list);
- spin_unlock(&net_todo_list_lock);
-
+ /* Wait for rcu callbacks to finish before next phase */
+ if (!list_empty(&list))
+ rcu_barrier();
+
while (!list_empty(&list)) {
struct net_device *dev
- = list_entry(list.next, struct net_device, todo_list);
+ = list_first_entry(&list, struct net_device, todo_list);
list_del(&dev->todo_list);
- switch(dev->reg_state) {
- case NETREG_REGISTERING:
- err = netdev_register_sysfs(dev);
- if (err)
- printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
- dev->name, err);
- dev->reg_state = NETREG_REGISTERED;
- break;
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ __rtnl_unlock();
+
+ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
+ pr_err("network todo '%s' but state %d\n",
+ dev->name, dev->reg_state);
+ dump_stack();
+ continue;
+ }
- case NETREG_UNREGISTERING:
- netdev_unregister_sysfs(dev);
- dev->reg_state = NETREG_UNREGISTERED;
+ dev->reg_state = NETREG_UNREGISTERED;
- netdev_wait_allrefs(dev);
+ on_each_cpu(flush_backlog, dev, 1);
- /* paranoia */
- BUG_ON(atomic_read(&dev->refcnt));
- BUG_TRAP(!dev->ip_ptr);
- BUG_TRAP(!dev->ip6_ptr);
- BUG_TRAP(!dev->dn_ptr);
+ netdev_wait_allrefs(dev);
+ /* paranoia */
+ BUG_ON(netdev_refcnt_read(dev));
+ WARN_ON(rcu_access_pointer(dev->ip_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+ WARN_ON(dev->dn_ptr);
- /* It must be the very last action,
- * after this 'dev' may point to freed up memory.
- */
- if (dev->destructor)
- dev->destructor(dev);
- break;
+ if (dev->destructor)
+ dev->destructor(dev);
- default:
- printk(KERN_ERR "network todo '%s' but state %d\n",
- dev->name, dev->reg_state);
- break;
- }
+ /* Report a network device has been unregistered */
+ rtnl_lock();
+ dev_net(dev)->dev_unreg_count--;
+ __rtnl_unlock();
+ wake_up(&netdev_unregistering_wq);
+
+ /* Free network device */
+ kobject_put(&dev->dev.kobj);
}
+}
-out:
- up(&net_todo_run_mutex);
+/* Convert net_device_stats to rtnl_link_stats64. They have the same
+ * fields in the same order, with only the type differing.
+ */
+void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+ const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+ BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+ size_t i, n = sizeof(*stats64) / sizeof(u64);
+ const unsigned long *src = (const unsigned long *)netdev_stats;
+ u64 *dst = (u64 *)stats64;
+
+ BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+ sizeof(*stats64) / sizeof(u64));
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
+#endif
}
+EXPORT_SYMBOL(netdev_stats_to_stats64);
/**
- * alloc_netdev - allocate network device
+ * dev_get_stats - get network device statistics
+ * @dev: device to get statistics from
+ * @storage: place to store stats
+ *
+ * Get network statistics from device. Return @storage.
+ * The device driver may provide its own method by setting
+ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ * otherwise the internal statistics structure is used.
+ */
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *storage)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_get_stats64) {
+ memset(storage, 0, sizeof(*storage));
+ ops->ndo_get_stats64(dev, storage);
+ } else if (ops->ndo_get_stats) {
+ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else {
+ netdev_stats_to_stats64(storage, &dev->stats);
+ }
+ storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+ return storage;
+}
+EXPORT_SYMBOL(dev_get_stats);
+
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+ struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (queue)
+ return queue;
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+ if (!queue)
+ return NULL;
+ netdev_init_one_queue(dev, queue, NULL);
+ queue->qdisc = &noop_qdisc;
+ queue->qdisc_sleeping = &noop_qdisc;
+ rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+ return queue;
+}
+
+static const struct ethtool_ops default_ethtool_ops;
+
+void netdev_set_default_ethtool_ops(struct net_device *dev,
+ const struct ethtool_ops *ops)
+{
+ if (dev->ethtool_ops == &default_ethtool_ops)
+ dev->ethtool_ops = ops;
+}
+EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+
+void netdev_freemem(struct net_device *dev)
+{
+ char *addr = (char *)dev - dev->padded;
+
+ kvfree(addr);
+}
+
+/**
+ * alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *))
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ unsigned int txqs, unsigned int rxqs)
{
- void *p;
struct net_device *dev;
- int alloc_size;
+ size_t alloc_size;
+ struct net_device *p;
- /* ensure 32-byte alignment of both the device and private area */
- alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
- alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
+ BUG_ON(strlen(name) >= sizeof(dev->name));
- p = kmalloc(alloc_size, GFP_KERNEL);
- if (!p) {
- printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
+ if (txqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
+ return NULL;
+ }
+
+#ifdef CONFIG_SYSFS
+ if (rxqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
- memset(p, 0, alloc_size);
+#endif
+
+ alloc_size = sizeof(struct net_device);
+ if (sizeof_priv) {
+ /* ensure 32-byte alignment of private area */
+ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
+ alloc_size += sizeof_priv;
+ }
+ /* ensure 32-byte alignment of whole construct */
+ alloc_size += NETDEV_ALIGN - 1;
+
+ p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!p)
+ p = vzalloc(alloc_size);
+ if (!p)
+ return NULL;
- dev = (struct net_device *)
- (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
+ dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
- if (sizeof_priv)
- dev->priv = netdev_priv(dev);
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
+ goto free_dev;
+
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
+ dev_mc_init(dev);
+ dev_uc_init(dev);
+
+ dev_net_set(dev, &init_net);
+
+ dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_segs = GSO_MAX_SEGS;
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->close_list);
+ INIT_LIST_HEAD(&dev->link_watch_list);
+ INIT_LIST_HEAD(&dev->adj_list.upper);
+ INIT_LIST_HEAD(&dev->adj_list.lower);
+ INIT_LIST_HEAD(&dev->all_adj_list.upper);
+ INIT_LIST_HEAD(&dev->all_adj_list.lower);
+ dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
+
+ dev->num_tx_queues = txqs;
+ dev->real_num_tx_queues = txqs;
+ if (netif_alloc_netdev_queues(dev))
+ goto free_all;
+
+#ifdef CONFIG_SYSFS
+ dev->num_rx_queues = rxqs;
+ dev->real_num_rx_queues = rxqs;
+ if (netif_alloc_rx_queues(dev))
+ goto free_all;
+#endif
+
strcpy(dev->name, name);
+ dev->group = INIT_NETDEV_GROUP;
+ if (!dev->ethtool_ops)
+ dev->ethtool_ops = &default_ethtool_ops;
return dev;
+
+free_all:
+ free_netdev(dev);
+ return NULL;
+
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
+free_dev:
+ netdev_freemem(dev);
+ return NULL;
}
-EXPORT_SYMBOL(alloc_netdev);
+EXPORT_SYMBOL(alloc_netdev_mqs);
/**
* free_netdev - free network device
* @dev: device
*
- * This function does the last stage of destroying an allocated device
- * interface. The reference to the device object is released.
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released.
* If this is the last reference then it will be freed.
*/
void free_netdev(struct net_device *dev)
{
+ struct napi_struct *p, *n;
+
+ release_net(dev_net(dev));
+
+ netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
- /* Compatiablity with error handling in drivers */
+ kfree(dev->_rx);
+#endif
+
+ kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+
+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ netif_napi_del(p);
+
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
+ /* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
- kfree((char *)dev - dev->padded);
+ netdev_freemem(dev);
return;
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
dev->reg_state = NETREG_RELEASED;
- /* will free via class release */
- class_device_put(&dev->class_dev);
-#else
- kfree((char *)dev - dev->padded);
-#endif
+ /* will free via device release */
+ put_device(&dev->dev);
}
-
-/* Synchronize with packet receive processing. */
-void synchronize_net(void)
+EXPORT_SYMBOL(free_netdev);
+
+/**
+ * synchronize_net - Synchronize with packet receive processing
+ *
+ * Wait for packets currently being received to be done.
+ * Does not block later packets from starting.
+ */
+void synchronize_net(void)
{
might_sleep();
- synchronize_rcu();
+ if (rtnl_is_locked())
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
}
+EXPORT_SYMBOL(synchronize_net);
/**
- * unregister_netdevice - remove device from the kernel
+ * unregister_netdevice_queue - remove device from the kernel
* @dev: device
+ * @head: list
*
* This function shuts down a device interface and removes it
- * from the kernel tables. On success 0 is returned, on a failure
- * a negative errno code is returned.
+ * from the kernel tables.
+ * If head not NULL, device is queued to be unregistered later.
*
* Callers must hold the rtnl semaphore. You may want
* unregister_netdev() instead of this.
*/
-int unregister_netdevice(struct net_device *dev)
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
- struct net_device *d, **dp;
-
- BUG_ON(dev_boot_phase);
ASSERT_RTNL();
- /* Some devices call without registering for initialization unwind. */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
- "was registered\n", dev->name, dev);
- return -ENODEV;
+ if (head) {
+ list_move_tail(&dev->unreg_list, head);
+ } else {
+ rollback_registered(dev);
+ /* Finish processing unregister after unlock */
+ net_set_todo(dev);
}
+}
+EXPORT_SYMBOL(unregister_netdevice_queue);
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
+/**
+ * unregister_netdevice_many - unregister many devices
+ * @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure stack wont be corrupted later.
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+ struct net_device *dev;
- /* If device is running, close it first. */
- if (dev->flags & IFF_UP)
- dev_close(dev);
-
- /* And unlink it from device chain. */
- for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
- if (d == dev) {
- write_lock_bh(&dev_base_lock);
- hlist_del(&dev->name_hlist);
- hlist_del(&dev->index_hlist);
- if (dev_tail == &dev->next)
- dev_tail = dp;
- *dp = d->next;
- write_unlock_bh(&dev_base_lock);
- break;
- }
+ if (!list_empty(head)) {
+ rollback_registered_many(head);
+ list_for_each_entry(dev, head, unreg_list)
+ net_set_todo(dev);
+ list_del(head);
}
- if (!d) {
- printk(KERN_ERR "unregister net_device: '%s' not found\n",
- dev->name);
- return -ENODEV;
+}
+EXPORT_SYMBOL(unregister_netdevice_many);
+
+/**
+ * unregister_netdev - remove device from the kernel
+ * @dev: device
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables.
+ *
+ * This is just a wrapper for unregister_netdevice that takes
+ * the rtnl semaphore. In general you want to use this and not
+ * unregister_netdevice.
+ */
+void unregister_netdev(struct net_device *dev)
+{
+ rtnl_lock();
+ unregister_netdevice(dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(unregister_netdev);
+
+/**
+ * dev_change_net_namespace - move device to different nethost namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a netagive errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ */
+
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ /* Don't allow namespace local devices to be moved. */
+ err = -EINVAL;
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ goto out;
+
+ /* Ensure the device has been registrered */
+ if (dev->reg_state != NETREG_REGISTERED)
+ goto out;
+
+ /* Get out if there is nothing todo */
+ err = 0;
+ if (net_eq(dev_net(dev), net))
+ goto out;
+
+ /* Pick the destination device name, and ensure
+ * we can use it in the destination network namespace.
+ */
+ err = -EEXIST;
+ if (__dev_get_by_name(net, dev->name)) {
+ /* We get here if we can't use the current device name */
+ if (!pat)
+ goto out;
+ if (dev_get_valid_name(net, dev, pat) < 0)
+ goto out;
}
- dev->reg_state = NETREG_UNREGISTERING;
+ /*
+ * And now a mini version of register_netdevice unregister_netdevice.
+ */
+
+ /* If device is running close it first. */
+ dev_close(dev);
+
+ /* And unlink it from device chain */
+ err = -ENODEV;
+ unlist_netdevice(dev);
synchronize_net();
/* Shutdown queueing discipline. */
dev_shutdown(dev);
-
/* Notify protocols, that we are about to destroy
this device. They should clean all the things.
+
+ Note that dev->reg_state stays at NETREG_REGISTERED.
+ This is wanted because this way 8021q and macvlan know
+ the device is just moving and can keep their slaves up.
*/
- notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
-
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ rcu_barrier();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+
/*
- * Flush the multicast chain
+ * Flush the unicast and multicast chains
*/
- dev_mc_discard(dev);
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
- if (dev->uninit)
- dev->uninit(dev);
+ /* Send a netdev-removed uevent to the old namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
- /* Notifier chain MUST detach us from master device. */
- BUG_TRAP(!dev->master);
+ /* Actually switch the network namespace */
+ dev_net_set(dev, net);
- free_divert_blk(dev);
+ /* If there is an ifindex conflict assign a new one */
+ if (__dev_get_by_index(net, dev->ifindex)) {
+ int iflink = (dev->iflink == dev->ifindex);
+ dev->ifindex = dev_new_index(net);
+ if (iflink)
+ dev->iflink = dev->ifindex;
+ }
- /* Finish processing unregister after unlock */
- net_set_todo(dev);
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
- synchronize_net();
+ /* Fixup kobjects */
+ err = device_rename(&dev->dev, dev->name);
+ WARN_ON(err);
- dev_put(dev);
- return 0;
-}
+ /* Add the device back in the hashes */
+ list_netdevice(dev);
-/**
- * unregister_netdev - remove device from the kernel
- * @dev: device
- *
- * This function shuts down a device interface and removes it
- * from the kernel tables. On success 0 is returned, on a failure
- * a negative errno code is returned.
- *
- * This is just a wrapper for unregister_netdevice that takes
- * the rtnl semaphore. In general you want to use this and not
- * unregister_netdevice.
- */
-void unregister_netdev(struct net_device *dev)
-{
- rtnl_lock();
- unregister_netdevice(dev);
- rtnl_unlock();
-}
+ /* Notify protocols, that a new device appeared. */
+ call_netdevice_notifiers(NETDEV_REGISTER, dev);
-EXPORT_SYMBOL(unregister_netdev);
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+
+ synchronize_net();
+ err = 0;
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
-#ifdef CONFIG_HOTPLUG_CPU
static int dev_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *ocpu)
{
struct sk_buff **list_skb;
- struct net_device **list_net;
struct sk_buff *skb;
unsigned int cpu, oldcpu = (unsigned long)ocpu;
struct softnet_data *sd, *oldsd;
- if (action != CPU_DEAD)
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
local_irq_disable();
@@ -3177,26 +6823,301 @@ static int dev_cpu_callback(struct notifier_block *nfb,
*list_skb = oldsd->completion_queue;
oldsd->completion_queue = NULL;
- /* Find end of our output_queue. */
- list_net = &sd->output_queue;
- while (*list_net)
- list_net = &(*list_net)->next_sched;
/* Append output queue from offline CPU. */
- *list_net = oldsd->output_queue;
- oldsd->output_queue = NULL;
+ if (oldsd->output_queue) {
+ *sd->output_queue_tailp = oldsd->output_queue;
+ sd->output_queue_tailp = oldsd->output_queue_tailp;
+ oldsd->output_queue = NULL;
+ oldsd->output_queue_tailp = &oldsd->output_queue;
+ }
+ /* Append NAPI poll list from offline CPU. */
+ if (!list_empty(&oldsd->poll_list)) {
+ list_splice_init(&oldsd->poll_list, &sd->poll_list);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ }
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
- netif_rx(skb);
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+ netif_rx_internal(skb);
+ input_queue_head_incr(oldsd);
+ }
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+ netif_rx_internal(skb);
+ input_queue_head_incr(oldsd);
+ }
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
+/**
+ * netdev_increment_features - increment feature set by one
+ * @all: current feature set
+ * @one: new feature set
+ * @mask: mask feature set
+ *
+ * Computes a new feature set after adding a device with feature set
+ * @one to the master device with current feature set @all. Will not
+ * enable anything that is off in @mask. Returns the new feature set.
+ */
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
+{
+ if (mask & NETIF_F_GEN_CSUM)
+ mask |= NETIF_F_ALL_CSUM;
+ mask |= NETIF_F_VLAN_CHALLENGED;
+
+ all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+ all &= one | ~NETIF_F_ALL_FOR_ALL;
+
+ /* If one device supports hw checksumming, set for all. */
+ if (all & NETIF_F_GEN_CSUM)
+ all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
+
+ return all;
+}
+EXPORT_SYMBOL(netdev_increment_features);
+
+static struct hlist_head * __net_init netdev_create_hash(void)
+{
+ int i;
+ struct hlist_head *hash;
+
+ hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
+ if (hash != NULL)
+ for (i = 0; i < NETDEV_HASHENTRIES; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+/* Initialize per network namespace state */
+static int __net_init netdev_init(struct net *net)
+{
+ if (net != &init_net)
+ INIT_LIST_HEAD(&net->dev_base_head);
+
+ net->dev_name_head = netdev_create_hash();
+ if (net->dev_name_head == NULL)
+ goto err_name;
+
+ net->dev_index_head = netdev_create_hash();
+ if (net->dev_index_head == NULL)
+ goto err_idx;
+
+ return 0;
+
+err_idx:
+ kfree(net->dev_name_head);
+err_name:
+ return -ENOMEM;
+}
+
+/**
+ * netdev_drivername - network driver for the device
+ * @dev: network device
+ *
+ * Determine network driver for device.
+ */
+const char *netdev_drivername(const struct net_device *dev)
+{
+ const struct device_driver *driver;
+ const struct device *parent;
+ const char *empty = "";
+
+ parent = dev->dev.parent;
+ if (!parent)
+ return empty;
+
+ driver = parent->driver;
+ if (driver && driver->name)
+ return driver->name;
+ return empty;
+}
+
+static int __netdev_printk(const char *level, const struct net_device *dev,
+ struct va_format *vaf)
+{
+ int r;
+
+ if (dev && dev->dev.parent) {
+ r = dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), vaf);
+ } else if (dev) {
+ r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
+ } else {
+ r = printk("%s(NULL net_device): %pV", level, vaf);
+ }
+
+ return r;
+}
+
+int netdev_printk(const char *level, const struct net_device *dev,
+ const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int r;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ r = __netdev_printk(level, dev, &vaf);
+
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level) \
+int func(const struct net_device *dev, const char *fmt, ...) \
+{ \
+ int r; \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ r = __netdev_printk(level, dev, &vaf); \
+ \
+ va_end(args); \
+ \
+ return r; \
+} \
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
+static void __net_exit netdev_exit(struct net *net)
+{
+ kfree(net->dev_name_head);
+ kfree(net->dev_index_head);
+}
+
+static struct pernet_operations __net_initdata netdev_net_ops = {
+ .init = netdev_init,
+ .exit = netdev_exit,
+};
+
+static void __net_exit default_device_exit(struct net *net)
+{
+ struct net_device *dev, *aux;
+ /*
+ * Push all migratable network devices back to the
+ * initial network namespace
+ */
+ rtnl_lock();
+ for_each_netdev_safe(net, dev, aux) {
+ int err;
+ char fb_name[IFNAMSIZ];
+
+ /* Ignore unmoveable devices (i.e. loopback) */
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ continue;
+
+ /* Leave virtual devices for the generic cleanup */
+ if (dev->rtnl_link_ops)
+ continue;
+
+ /* Push remaining network devices to init_net */
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+ __func__, dev->name, err);
+ BUG();
+ }
+ }
+ rtnl_unlock();
+}
+
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+ /* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace in net_list.
+ */
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+ /* At exit all network devices most be removed from a network
+ * namespace. Do this in the reverse order of registration.
+ * Do this across as many network namespaces as possible to
+ * improve batching efficiency.
+ */
+ struct net_device *dev;
+ struct net *net;
+ LIST_HEAD(dev_kill_list);
+
+ /* To prevent network device cleanup code from dereferencing
+ * loopback devices or network devices that have been freed
+ * wait here for all pending unregistrations to complete,
+ * before unregistring the loopback device and allowing the
+ * network namespace be freed.
+ *
+ * The netdev todo list containing all network devices
+ * unregistrations that happen in default_device_exit_batch
+ * will run in the rtnl_unlock() at the end of
+ * default_device_exit_batch.
+ */
+ rtnl_lock_unregistering(net_list);
+ list_for_each_entry(net, net_list, exit_list) {
+ for_each_netdev_reverse(net, dev) {
+ if (dev->rtnl_link_ops)
+ dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+ else
+ unregister_netdevice_queue(dev, &dev_kill_list);
+ }
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations __net_initdata default_device_ops = {
+ .exit = default_device_exit,
+ .exit_batch = default_device_exit_batch,
+};
+
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
@@ -3214,100 +7135,67 @@ static int __init net_dev_init(void)
BUG_ON(!dev_boot_phase);
- net_random_init();
-
if (dev_proc_init())
goto out;
- if (netdev_sysfs_init())
+ if (netdev_kobject_init())
goto out;
INIT_LIST_HEAD(&ptype_all);
- for (i = 0; i < 16; i++)
+ for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
- for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
- INIT_HLIST_HEAD(&dev_name_head[i]);
+ INIT_LIST_HEAD(&offload_base);
- for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
- INIT_HLIST_HEAD(&dev_index_head[i]);
+ if (register_pernet_subsys(&netdev_net_ops))
+ goto out;
/*
* Initialise the packet receive queues.
*/
- for (i = 0; i < NR_CPUS; i++) {
- struct softnet_data *queue;
+ for_each_possible_cpu(i) {
+ struct softnet_data *sd = &per_cpu(softnet_data, i);
+
+ skb_queue_head_init(&sd->input_pkt_queue);
+ skb_queue_head_init(&sd->process_queue);
+ INIT_LIST_HEAD(&sd->poll_list);
+ sd->output_queue_tailp = &sd->output_queue;
+#ifdef CONFIG_RPS
+ sd->csd.func = rps_trigger_softirq;
+ sd->csd.info = sd;
+ sd->cpu = i;
+#endif
- queue = &per_cpu(softnet_data, i);
- skb_queue_head_init(&queue->input_pkt_queue);
- queue->completion_queue = NULL;
- INIT_LIST_HEAD(&queue->poll_list);
- set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
- queue->backlog_dev.weight = weight_p;
- queue->backlog_dev.poll = process_backlog;
- atomic_set(&queue->backlog_dev.refcnt, 1);
+ sd->backlog.poll = process_backlog;
+ sd->backlog.weight = weight_p;
}
dev_boot_phase = 0;
- open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
- open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
+ /* The loopback device is special if any other network devices
+ * is present in a network namespace the loopback device must
+ * be present. Since we now dynamically allocate and free the
+ * loopback device ensure this invariant is maintained by
+ * keeping the loopback device as the first device on the
+ * list of network devices. Ensuring the loopback devices
+ * is the first device that appears and the last network device
+ * that disappears.
+ */
+ if (register_pernet_device(&loopback_net_ops))
+ goto out;
+
+ if (register_pernet_device(&default_device_ops))
+ goto out;
+
+ open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+ open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
- dev_mcast_init();
rc = 0;
out:
return rc;
}
subsys_initcall(net_dev_init);
-
-EXPORT_SYMBOL(__dev_get_by_index);
-EXPORT_SYMBOL(__dev_get_by_name);
-EXPORT_SYMBOL(__dev_remove_pack);
-EXPORT_SYMBOL(__skb_linearize);
-EXPORT_SYMBOL(dev_valid_name);
-EXPORT_SYMBOL(dev_add_pack);
-EXPORT_SYMBOL(dev_alloc_name);
-EXPORT_SYMBOL(dev_close);
-EXPORT_SYMBOL(dev_get_by_flags);
-EXPORT_SYMBOL(dev_get_by_index);
-EXPORT_SYMBOL(dev_get_by_name);
-EXPORT_SYMBOL(dev_open);
-EXPORT_SYMBOL(dev_queue_xmit);
-EXPORT_SYMBOL(dev_remove_pack);
-EXPORT_SYMBOL(dev_set_allmulti);
-EXPORT_SYMBOL(dev_set_promiscuity);
-EXPORT_SYMBOL(dev_change_flags);
-EXPORT_SYMBOL(dev_set_mtu);
-EXPORT_SYMBOL(dev_set_mac_address);
-EXPORT_SYMBOL(free_netdev);
-EXPORT_SYMBOL(netdev_boot_setup_check);
-EXPORT_SYMBOL(netdev_set_master);
-EXPORT_SYMBOL(netdev_state_change);
-EXPORT_SYMBOL(netif_receive_skb);
-EXPORT_SYMBOL(netif_rx);
-EXPORT_SYMBOL(register_gifconf);
-EXPORT_SYMBOL(register_netdevice);
-EXPORT_SYMBOL(register_netdevice_notifier);
-EXPORT_SYMBOL(skb_checksum_help);
-EXPORT_SYMBOL(synchronize_net);
-EXPORT_SYMBOL(unregister_netdevice);
-EXPORT_SYMBOL(unregister_netdevice_notifier);
-EXPORT_SYMBOL(net_enable_timestamp);
-EXPORT_SYMBOL(net_disable_timestamp);
-EXPORT_SYMBOL(dev_get_flags);
-
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
-EXPORT_SYMBOL(br_handle_frame_hook);
-EXPORT_SYMBOL(br_fdb_get_hook);
-EXPORT_SYMBOL(br_fdb_put_hook);
-#endif
-
-#ifdef CONFIG_KMOD
-EXPORT_SYMBOL(dev_load);
-#endif
-
-EXPORT_PER_CPU_SYMBOL(softnet_data);
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
new file mode 100644
index 00000000000..b6b230600b9
--- /dev/null
+++ b/net/core/dev_addr_lists.c
@@ -0,0 +1,851 @@
+/*
+ * net/core/dev_addr_lists.c - Functions for handling net device lists
+ * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This file contains functions for working with unicast, multicast and device
+ * addresses lists.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/export.h>
+#include <linux/list.h>
+
+/*
+ * General list handling functions
+ */
+
+static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global,
+ bool sync)
+{
+ struct netdev_hw_addr *ha;
+ int alloc_size;
+
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ ha->refcount = 1;
+ ha->global_use = global;
+ ha->synced = sync ? 1 : 0;
+ ha->sync_cnt = 0;
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
+}
+
+static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync,
+ int sync_count)
+{
+ struct netdev_hw_addr *ha;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (!memcmp(ha->addr, addr, addr_len) &&
+ ha->type == addr_type) {
+ if (global) {
+ /* check if addr is already used as global */
+ if (ha->global_use)
+ return 0;
+ else
+ ha->global_use = true;
+ }
+ if (sync) {
+ if (ha->synced && sync_count)
+ return -EEXIST;
+ else
+ ha->synced++;
+ }
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
+ sync);
+}
+
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
+ 0);
+}
+
+static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *ha, bool global,
+ bool sync)
+{
+ if (global && !ha->global_use)
+ return -ENOENT;
+
+ if (sync && !ha->synced)
+ return -ENOENT;
+
+ if (global)
+ ha->global_use = false;
+
+ if (sync)
+ ha->synced--;
+
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ list->count--;
+ return 0;
+}
+
+static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
+{
+ struct netdev_hw_addr *ha;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (!memcmp(ha->addr, addr, addr_len) &&
+ (ha->type == addr_type || !addr_type))
+ return __hw_addr_del_entry(list, ha, global, sync);
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
+}
+
+static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true, ha->sync_cnt);
+ if (err && err != -EEXIST)
+ return err;
+
+ if (!err) {
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+
+static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true);
+ if (err)
+ return;
+ ha->sync_cnt--;
+ /* address on from list is not marked synced */
+ __hw_addr_del_entry(from_list, ha, false, false);
+}
+
+static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt == ha->refcount) {
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ } else {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ }
+ }
+ return err;
+}
+
+/* This function only works where there is a strict 1-1 relationship
+ * between source and destionation of they synch. If you ever need to
+ * sync addresses to more then 1 destination, you need to use
+ * __hw_addr_sync_multiple().
+ */
+int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (!ha->sync_cnt) {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ } else if (ha->refcount == 1)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+ return err;
+}
+EXPORT_SYMBOL(__hw_addr_sync);
+
+void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync);
+
+/**
+ * __hw_addr_sync_dev - Synchonize device's multicast list
+ * @list: address list to syncronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This funciton is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
+
+/**
+ * __hw_addr_unsync_dev - Remove synchonized addresses from device
+ * @list: address list to remove syncronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
+
+static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ }
+ list->count = 0;
+}
+
+void __hw_addr_init(struct netdev_hw_addr_list *list)
+{
+ INIT_LIST_HEAD(&list->list);
+ list->count = 0;
+}
+EXPORT_SYMBOL(__hw_addr_init);
+
+/*
+ * Device addresses handling functions
+ */
+
+/**
+ * dev_addr_flush - Flush device address list
+ * @dev: device
+ *
+ * Flush device address list and reset ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_flush(&dev->dev_addrs);
+ dev->dev_addr = NULL;
+}
+EXPORT_SYMBOL(dev_addr_flush);
+
+/**
+ * dev_addr_init - Init device address list
+ * @dev: device
+ *
+ * Init device address list and create the first element,
+ * used by ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_init(&dev->dev_addrs);
+ memset(addr, 0, sizeof(addr));
+ err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
+ NETDEV_HW_ADDR_T_LAN);
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_init);
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ * @addr_type: address type
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ * @addr_type: address type
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+ struct netdev_hw_addr *ha;
+
+ ASSERT_RTNL();
+
+ /*
+ * We can not remove the first address from the list because
+ * dev->dev_addr points to that.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == addr_type && ha->refcount == 1)
+ return -ENOENT;
+
+ err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
+ addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/*
+ * Unicast list handling functions
+ */
+
+/**
+ * dev_uc_add_excl - Add a global secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->uc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_UNICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add_excl);
+
+/**
+ * dev_uc_add - Add a secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a secondary unicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add);
+
+/**
+ * dev_uc_del - Release secondary unicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a secondary unicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_del);
+
+/**
+ * dev_uc_sync - Synchronize device's unicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. This function assumes that
+ * addresses will only ever be synced to the @to devices and no other.
+ */
+int dev_uc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync);
+
+/**
+ * dev_uc_sync_multiple - Synchronize device's unicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have been deleted from the source. The source device
+ * must be locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. It allows for a single source
+ * device to be synced to multiple destination devices.
+ */
+int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync_multiple);
+
+/**
+ * dev_uc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_uc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_uc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ netif_addr_lock_bh(from);
+ netif_addr_lock_nested(to);
+ __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_uc_unsync);
+
+/**
+ * dev_uc_flush - Flush unicast addresses
+ * @dev: device
+ *
+ * Flush unicast addresses.
+ */
+void dev_uc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->uc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_uc_flush);
+
+/**
+ * dev_uc_flush - Init unicast address list
+ * @dev: device
+ *
+ * Init unicast address list.
+ */
+void dev_uc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->uc);
+}
+EXPORT_SYMBOL(dev_uc_init);
+
+/*
+ * Multicast list handling functions
+ */
+
+/**
+ * dev_mc_add_excl - Add a global secondary multicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->mc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_add_excl);
+
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+/**
+ * dev_mc_add - Add a multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a multicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_add);
+
+/**
+ * dev_mc_add_global - Add a global multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a global multicast address to the device.
+ */
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_add_global);
+
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+
+/**
+ * dev_mc_del - Delete a multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_del);
+
+/**
+ * dev_mc_del_global - Delete a global multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_del_global);
+
+/**
+ * dev_mc_sync - Synchronize device's multicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+/**
+ * dev_mc_sync_multiple - Synchronize device's multicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices. It allows for a single
+ * source device to be synced to multiple destination devices.
+ */
+int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync_multiple);
+
+/**
+ * dev_mc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_mc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ netif_addr_lock_bh(from);
+ netif_addr_lock_nested(to);
+ __hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
+/**
+ * dev_mc_flush - Flush multicast addresses
+ * @dev: device
+ *
+ * Flush multicast addresses.
+ */
+void dev_mc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->mc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_mc_flush);
+
+/**
+ * dev_mc_flush - Init multicast address list
+ * @dev: device
+ *
+ * Init multicast address list.
+ */
+void dev_mc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->mc);
+}
+EXPORT_SYMBOL(dev_mc_init);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
new file mode 100644
index 00000000000..cf999e09bcd
--- /dev/null
+++ b/net/core/dev_ioctl.c
@@ -0,0 +1,567 @@
+#include <linux/kmod.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/wireless.h>
+#include <net/wext.h>
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct net *net, struct ifreq __user *arg)
+{
+ struct ifreq ifr;
+ int error;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ return 0;
+}
+
+static gifconf_func_t *gifconf_list[NPROTO];
+
+/**
+ * register_gifconf - register a SIOCGIF handler
+ * @family: Address family
+ * @gifconf: Function handler
+ *
+ * Register protocol dependent address dumping routines. The handler
+ * that is passed must not be freed or reused until it has been replaced
+ * by another handler.
+ */
+int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
+{
+ if (family >= NPROTO)
+ return -EINVAL;
+ gifconf_list[family] = gifconf;
+ return 0;
+}
+EXPORT_SYMBOL(register_gifconf);
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+
+static int dev_ifconf(struct net *net, char __user *arg)
+{
+ struct ifconf ifc;
+ struct net_device *dev;
+ char __user *pos;
+ int len;
+ int total;
+ int i;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+
+ /*
+ * Loop over the interfaces, and write an info block for each.
+ */
+
+ total = 0;
+ for_each_netdev(net, dev) {
+ for (i = 0; i < NPROTO; i++) {
+ if (gifconf_list[i]) {
+ int done;
+ if (!pos)
+ done = gifconf_list[i](dev, NULL, 0);
+ else
+ done = gifconf_list[i](dev, pos + total,
+ len - total);
+ if (done < 0)
+ return -EFAULT;
+ total += done;
+ }
+ }
+ }
+
+ /*
+ * All done. Write the updated control block back to the caller.
+ */
+ ifc.ifc_len = total;
+
+ /*
+ * Both BSD and Solaris return 0 here, so we do too.
+ */
+ return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = (short) dev_get_flags(dev);
+ return 0;
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface
+ (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCGIFHWADDR:
+ if (!dev->addr_len)
+ memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
+ else
+ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ ifr->ifr_hwaddr.sa_family = dev->type;
+ return 0;
+
+ case SIOCGIFSLAVE:
+ err = -EINVAL;
+ break;
+
+ case SIOCGIFMAP:
+ ifr->ifr_map.mem_start = dev->mem_start;
+ ifr->ifr_map.mem_end = dev->mem_end;
+ ifr->ifr_map.base_addr = dev->base_addr;
+ ifr->ifr_map.irq = dev->irq;
+ ifr->ifr_map.dma = dev->dma;
+ ifr->ifr_map.port = dev->if_port;
+ return 0;
+
+ case SIOCGIFINDEX:
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN:
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ default:
+ /* dev_ioctl() should ensure this case
+ * is never reached
+ */
+ WARN_ON(1);
+ err = -ENOTTY;
+ break;
+
+ }
+ return err;
+}
+
+static int net_hwtstamp_validate(struct ifreq *ifr)
+{
+ struct hwtstamp_config cfg;
+ enum hwtstamp_tx_types tx_type;
+ enum hwtstamp_rx_filters rx_filter;
+ int tx_type_valid = 0;
+ int rx_filter_valid = 0;
+
+ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ return -EFAULT;
+
+ if (cfg.flags) /* reserved for future extensions */
+ return -EINVAL;
+
+ tx_type = cfg.tx_type;
+ rx_filter = cfg.rx_filter;
+
+ switch (tx_type) {
+ case HWTSTAMP_TX_OFF:
+ case HWTSTAMP_TX_ON:
+ case HWTSTAMP_TX_ONESTEP_SYNC:
+ tx_type_valid = 1;
+ break;
+ }
+
+ switch (rx_filter) {
+ case HWTSTAMP_FILTER_NONE:
+ case HWTSTAMP_FILTER_ALL:
+ case HWTSTAMP_FILTER_SOME:
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ rx_filter_valid = 1;
+ break;
+ }
+
+ if (!tx_type_valid || !rx_filter_valid)
+ return -ERANGE;
+
+ return 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ const struct net_device_ops *ops;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->netdev_ops;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags);
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface
+ (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ return dev_set_mtu(dev, ifr->ifr_mtu);
+
+ case SIOCSIFHWADDR:
+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+
+ case SIOCSIFHWBROADCAST:
+ if (ifr->ifr_hwaddr.sa_family != dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return 0;
+
+ case SIOCSIFMAP:
+ if (ops->ndo_set_config) {
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_set_config(dev, &ifr->ifr_map);
+ }
+ return -EOPNOTSUPP;
+
+ case SIOCADDMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCDELMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCSIFTXQLEN:
+ if (ifr->ifr_qlen < 0)
+ return -EINVAL;
+ dev->tx_queue_len = ifr->ifr_qlen;
+ return 0;
+
+ case SIOCSIFNAME:
+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+ return dev_change_name(dev, ifr->ifr_newname);
+
+ case SIOCSHWTSTAMP:
+ err = net_hwtstamp_validate(ifr);
+ if (err)
+ return err;
+ /* fall through */
+
+ /*
+ * Unknown or private ioctl
+ */
+ default:
+ if ((cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) ||
+ cmd == SIOCBONDENSLAVE ||
+ cmd == SIOCBONDRELEASE ||
+ cmd == SIOCBONDSETHWADDR ||
+ cmd == SIOCBONDSLAVEINFOQUERY ||
+ cmd == SIOCBONDINFOQUERY ||
+ cmd == SIOCBONDCHANGEACTIVE ||
+ cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCBRADDIF ||
+ cmd == SIOCBRDELIF ||
+ cmd == SIOCSHWTSTAMP ||
+ cmd == SIOCGHWTSTAMP ||
+ cmd == SIOCWANDEV) {
+ err = -EOPNOTSUPP;
+ if (ops->ndo_do_ioctl) {
+ if (netif_device_present(dev))
+ err = ops->ndo_do_ioctl(dev, ifr, cmd);
+ else
+ err = -ENODEV;
+ }
+ } else
+ err = -EINVAL;
+
+ }
+ return err;
+}
+
+/**
+ * dev_load - load a network module
+ * @net: the applicable net namespace
+ * @name: name of interface
+ *
+ * If a network interface is not present and the process has suitable
+ * privileges this function loads the module. If module loading is not
+ * available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int no_module;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ rcu_read_unlock();
+
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE)) {
+ if (!request_module("%s", name))
+ pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
+ name);
+ }
+}
+EXPORT_SYMBOL(dev_load);
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc above.
+ */
+
+/**
+ * dev_ioctl - network device ioctl
+ * @net: the applicable net namespace
+ * @cmd: command to issue
+ * @arg: pointer to a struct ifreq in user space
+ *
+ * Issue ioctl functions to devices. This is normally called by the
+ * user space syscall interfaces but can sometimes be useful for
+ * other purposes. The return value is the return from the syscall if
+ * positive or a negative errno code on error.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ int ret;
+ char *colon;
+
+ /* One special case: SIOCGIFCONF takes ifconf argument
+ and requires shared lock, because it sleeps writing
+ to user space.
+ */
+
+ if (cmd == SIOCGIFCONF) {
+ rtnl_lock();
+ ret = dev_ifconf(net, (char __user *) arg);
+ rtnl_unlock();
+ return ret;
+ }
+ if (cmd == SIOCGIFNAME)
+ return dev_ifname(net, (struct ifreq __user *)arg);
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch (cmd) {
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFHWADDR:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(net, ifr.ifr_name);
+ rcu_read_lock();
+ ret = dev_ifsioc_locked(net, &ifr, cmd);
+ rcu_read_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ case SIOCETHTOOL:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ethtool(net, &ifr);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - return a value
+ */
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSIFNAME:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFMAP:
+ case SIOCSIFTXQLEN:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ /*
+ * These ioctl calls:
+ * - require local superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSMIIREG:
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ case SIOCSHWTSTAMP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but
+ * currently do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space.
+ * Not applicable in our case */
+ case SIOCSIFLINK:
+ return -ENOTTY;
+
+ /*
+ * Unknown or private ioctl.
+ */
+ default:
+ if (cmd == SIOCWANDEV ||
+ cmd == SIOCGHWTSTAMP ||
+ (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret && copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ return ret;
+ }
+ /* Take care of Wireless Extensions */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
+ return wext_handle_ioctl(net, &ifr, cmd, arg);
+ return -ENOTTY;
+ }
+}
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
deleted file mode 100644
index 05d60850840..00000000000
--- a/net/core/dev_mcast.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Linux NET3: Multicast List maintenance.
- *
- * Authors:
- * Tim Kordas <tjk@nostromo.eeap.cwru.edu>
- * Richard Underwood <richard@wuzz.demon.co.uk>
- *
- * Stir fried together from the IP multicast and CAP patches above
- * Alan Cox <Alan.Cox@linux.org>
- *
- * Fixes:
- * Alan Cox : Update the device on a real delete
- * rather than any time but...
- * Alan Cox : IFF_ALLMULTI support.
- * Alan Cox : New format set_multicast_list() calls.
- * Gleb Natapov : Remove dev_mc_lock.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/route.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/arp.h>
-
-
-/*
- * Device multicast list maintenance.
- *
- * This is used both by IP and by the user level maintenance functions.
- * Unlike BSD we maintain a usage count on a given multicast address so
- * that a casual user application can add/delete multicasts used by
- * protocols without doing damage to the protocols when it deletes the
- * entries. It also helps IP as it tracks overlapping maps.
- *
- * Device mc lists are changed by bh at least if IPv6 is enabled,
- * so that it must be bh protected.
- *
- * We block accesses to device mc filters with dev->xmit_lock.
- */
-
-/*
- * Update the multicast list into the physical NIC controller.
- */
-
-static void __dev_mc_upload(struct net_device *dev)
-{
- /* Don't do anything till we up the interface
- * [dev_open will call this function so the list will
- * stay sane]
- */
-
- if (!(dev->flags&IFF_UP))
- return;
-
- /*
- * Devices with no set multicast or which have been
- * detached don't get set.
- */
-
- if (dev->set_multicast_list == NULL ||
- !netif_device_present(dev))
- return;
-
- dev->set_multicast_list(dev);
-}
-
-void dev_mc_upload(struct net_device *dev)
-{
- spin_lock_bh(&dev->xmit_lock);
- __dev_mc_upload(dev);
- spin_unlock_bh(&dev->xmit_lock);
-}
-
-/*
- * Delete a device level multicast
- */
-
-int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
-{
- int err = 0;
- struct dev_mc_list *dmi, **dmip;
-
- spin_lock_bh(&dev->xmit_lock);
-
- for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
- /*
- * Find the entry we want to delete. The device could
- * have variable length entries so check these too.
- */
- if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
- alen == dmi->dmi_addrlen) {
- if (glbl) {
- int old_glbl = dmi->dmi_gusers;
- dmi->dmi_gusers = 0;
- if (old_glbl == 0)
- break;
- }
- if (--dmi->dmi_users)
- goto done;
-
- /*
- * Last user. So delete the entry.
- */
- *dmip = dmi->next;
- dev->mc_count--;
-
- kfree(dmi);
-
- /*
- * We have altered the list, so the card
- * loaded filter is now wrong. Fix it
- */
- __dev_mc_upload(dev);
-
- spin_unlock_bh(&dev->xmit_lock);
- return 0;
- }
- }
- err = -ENOENT;
-done:
- spin_unlock_bh(&dev->xmit_lock);
- return err;
-}
-
-/*
- * Add a device level multicast
- */
-
-int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
-{
- int err = 0;
- struct dev_mc_list *dmi, *dmi1;
-
- dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
-
- spin_lock_bh(&dev->xmit_lock);
- for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
- if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
- dmi->dmi_addrlen == alen) {
- if (glbl) {
- int old_glbl = dmi->dmi_gusers;
- dmi->dmi_gusers = 1;
- if (old_glbl)
- goto done;
- }
- dmi->dmi_users++;
- goto done;
- }
- }
-
- if ((dmi = dmi1) == NULL) {
- spin_unlock_bh(&dev->xmit_lock);
- return -ENOMEM;
- }
- memcpy(dmi->dmi_addr, addr, alen);
- dmi->dmi_addrlen = alen;
- dmi->next = dev->mc_list;
- dmi->dmi_users = 1;
- dmi->dmi_gusers = glbl ? 1 : 0;
- dev->mc_list = dmi;
- dev->mc_count++;
-
- __dev_mc_upload(dev);
-
- spin_unlock_bh(&dev->xmit_lock);
- return 0;
-
-done:
- spin_unlock_bh(&dev->xmit_lock);
- kfree(dmi1);
- return err;
-}
-
-/*
- * Discard multicast list when a device is downed
- */
-
-void dev_mc_discard(struct net_device *dev)
-{
- spin_lock_bh(&dev->xmit_lock);
-
- while (dev->mc_list != NULL) {
- struct dev_mc_list *tmp = dev->mc_list;
- dev->mc_list = tmp->next;
- if (tmp->dmi_users > tmp->dmi_gusers)
- printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users);
- kfree(tmp);
- }
- dev->mc_count = 0;
-
- spin_unlock_bh(&dev->xmit_lock);
-}
-
-#ifdef CONFIG_PROC_FS
-static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos)
-{
- struct net_device *dev;
- loff_t off = 0;
-
- read_lock(&dev_base_lock);
- for (dev = dev_base; dev; dev = dev->next) {
- if (off++ == *pos)
- return dev;
- }
- return NULL;
-}
-
-static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct net_device *dev = v;
- ++*pos;
- return dev->next;
-}
-
-static void dev_mc_seq_stop(struct seq_file *seq, void *v)
-{
- read_unlock(&dev_base_lock);
-}
-
-
-static int dev_mc_seq_show(struct seq_file *seq, void *v)
-{
- struct dev_mc_list *m;
- struct net_device *dev = v;
-
- spin_lock_bh(&dev->xmit_lock);
- for (m = dev->mc_list; m; m = m->next) {
- int i;
-
- seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
- dev->name, m->dmi_users, m->dmi_gusers);
-
- for (i = 0; i < m->dmi_addrlen; i++)
- seq_printf(seq, "%02x", m->dmi_addr[i]);
-
- seq_putc(seq, '\n');
- }
- spin_unlock_bh(&dev->xmit_lock);
- return 0;
-}
-
-static struct seq_operations dev_mc_seq_ops = {
- .start = dev_mc_seq_start,
- .next = dev_mc_seq_next,
- .stop = dev_mc_seq_stop,
- .show = dev_mc_seq_show,
-};
-
-static int dev_mc_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &dev_mc_seq_ops);
-}
-
-static struct file_operations dev_mc_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_mc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-#endif
-
-void __init dev_mcast_init(void)
-{
- proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops);
-}
-
-EXPORT_SYMBOL(dev_mc_add);
-EXPORT_SYMBOL(dev_mc_delete);
-EXPORT_SYMBOL(dev_mc_upload);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 00000000000..e70301eb7a4
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,439 @@
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/genetlink.h>
+#include <net/netevent.h>
+
+#include <trace/events/skb.h>
+#include <trace/events/napi.h>
+
+#include <asm/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+/*
+ * Globals, our netlink socket pointer
+ * and the work handle that will send up
+ * netlink alerts
+ */
+static int trace_state = TRACE_OFF;
+static DEFINE_MUTEX(trace_state_mutex);
+
+struct per_cpu_dm_data {
+ spinlock_t lock;
+ struct sk_buff *skb;
+ struct work_struct dm_alert_work;
+ struct timer_list send_timer;
+};
+
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ unsigned long last_rx;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
+static struct genl_family net_drop_monitor_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+};
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+
+static int dm_hit_limit = 64;
+static int dm_delay = 1;
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
+
+static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+ size_t al;
+ struct net_dm_alert_msg *msg;
+ struct nlattr *nla;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ al = sizeof(struct net_dm_alert_msg);
+ al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ al += sizeof(struct nlattr);
+
+ skb = genlmsg_new(al, GFP_KERNEL);
+
+ if (skb) {
+ genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ } else {
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+ }
+
+ spin_lock_irqsave(&data->lock, flags);
+ swap(data->skb, skb);
+ spin_unlock_irqrestore(&data->lock, flags);
+
+ return skb;
+}
+
+static struct genl_multicast_group dropmon_mcgrps[] = {
+ { .name = "events", },
+};
+
+static void send_dm_alert(struct work_struct *work)
+{
+ struct sk_buff *skb;
+ struct per_cpu_dm_data *data;
+
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ skb = reset_per_cpu_data(data);
+
+ if (skb)
+ genlmsg_multicast(&net_drop_monitor_family, skb, 0,
+ 0, GFP_KERNEL);
+}
+
+/*
+ * This is the timer function to delay the sending of an alert
+ * in the event that more drops will arrive during the
+ * hysteresis period.
+ */
+static void sched_send_work(unsigned long _data)
+{
+ struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
+
+ schedule_work(&data->dm_alert_work);
+}
+
+static void trace_drop_common(struct sk_buff *skb, void *location)
+{
+ struct net_dm_alert_msg *msg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nla;
+ int i;
+ struct sk_buff *dskb;
+ struct per_cpu_dm_data *data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ data = &__get_cpu_var(dm_cpu_data);
+ spin_lock(&data->lock);
+ dskb = data->skb;
+
+ if (!dskb)
+ goto out;
+
+ nlh = (struct nlmsghdr *)dskb->data;
+ nla = genlmsg_data(nlmsg_data(nlh));
+ msg = nla_data(nla);
+ for (i = 0; i < msg->entries; i++) {
+ if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
+ msg->points[i].count++;
+ goto out;
+ }
+ }
+ if (msg->entries == dm_hit_limit)
+ goto out;
+ /*
+ * We need to create a new entry
+ */
+ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
+ nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
+ memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
+ msg->points[msg->entries].count = 1;
+ msg->entries++;
+
+ if (!timer_pending(&data->send_timer)) {
+ data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&data->send_timer);
+ }
+
+out:
+ spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Don't check napi structures with no associated device
+ */
+ if (!napi->dev)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ /*
+ * only add a note to our monitor buffer if:
+ * 1) this is the dev we received on
+ * 2) its after the last_rx delta
+ * 3) our rx_dropped count has gone up
+ */
+ if ((new_stat->dev == napi->dev) &&
+ (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ new_stat->last_rx = jiffies;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int set_all_monitor_traces(int state)
+{
+ int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ mutex_lock(&trace_state_mutex);
+
+ if (state == trace_state) {
+ rc = -EAGAIN;
+ goto out_unlock;
+ }
+
+ switch (state) {
+ case TRACE_ON:
+ if (!try_module_get(THIS_MODULE)) {
+ rc = -ENODEV;
+ break;
+ }
+
+ rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
+ break;
+
+ case TRACE_OFF:
+ rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
+
+ tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
+ }
+ }
+
+ module_put(THIS_MODULE);
+
+ break;
+ default:
+ rc = 1;
+ break;
+ }
+
+ if (!rc)
+ trace_state = state;
+ else
+ rc = -EINPROGRESS;
+
+out_unlock:
+ mutex_unlock(&trace_state_mutex);
+
+ return rc;
+}
+
+
+static int net_dm_cmd_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+static int net_dm_cmd_trace(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ switch (info->genlhdr->cmd) {
+ case NET_DM_CMD_START:
+ return set_all_monitor_traces(TRACE_ON);
+ break;
+ case NET_DM_CMD_STOP:
+ return set_all_monitor_traces(TRACE_OFF);
+ break;
+ }
+
+ return -ENOTSUPP;
+}
+
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ new_stat->last_rx = jiffies;
+ mutex_lock(&trace_state_mutex);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ mutex_unlock(&trace_state_mutex);
+ break;
+ case NETDEV_UNREGISTER:
+ mutex_lock(&trace_state_mutex);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+ new_stat->dev = NULL;
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
+ break;
+ }
+ }
+ }
+ mutex_unlock(&trace_state_mutex);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static const struct genl_ops dropmon_ops[] = {
+ {
+ .cmd = NET_DM_CMD_CONFIG,
+ .doit = net_dm_cmd_config,
+ },
+ {
+ .cmd = NET_DM_CMD_START,
+ .doit = net_dm_cmd_trace,
+ },
+ {
+ .cmd = NET_DM_CMD_STOP,
+ .doit = net_dm_cmd_trace,
+ },
+};
+
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
+static int __init init_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu, rc;
+
+ pr_info("Initializing network drop monitor service\n");
+
+ if (sizeof(void *) > 8) {
+ pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
+ return -ENOSPC;
+ }
+
+ rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
+ dropmon_ops, dropmon_mcgrps);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
+ return rc;
+ }
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
+
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ pr_crit("Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
+ rc = 0;
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ INIT_WORK(&data->dm_alert_work, send_dm_alert);
+ init_timer(&data->send_timer);
+ data->send_timer.data = (unsigned long)data;
+ data->send_timer.function = sched_send_work;
+ spin_lock_init(&data->lock);
+ reset_per_cpu_data(data);
+ }
+
+
+ goto out;
+
+out_unreg:
+ genl_unregister_family(&net_drop_monitor_family);
+out:
+ return rc;
+}
+
+static void exit_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu;
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
+
+ /*
+ * Because of the module_get/put we do in the trace state change path
+ * we are guarnateed not to have any current users when we get here
+ * all we need to do is make sure that we don't have any running timers
+ * or pending schedule calls
+ */
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ /*
+ * At this point, we should have exclusive access
+ * to this struct and can free the skb inside it
+ */
+ kfree_skb(data->skb);
+ }
+
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+}
+
+module_init(init_net_drop_monitor);
+module_exit(exit_net_drop_monitor);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
+MODULE_ALIAS_GENL_FAMILY("NET_DM");
diff --git a/net/core/dst.c b/net/core/dst.c
index 470c05bc4cb..a028409ee43 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -9,60 +9,80 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/netdevice.h>
-#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/sched.h>
+#include <linux/prefetch.h>
#include <net/dst.h>
-/* Locking strategy:
- * 1) Garbage collection state of dead destination cache
- * entries is protected by dst_lock.
- * 2) GC is run only from BH context, and is the only remover
- * of entries.
- * 3) Entries are added to the garbage list from both BH
- * and non-BH context, so local BH disabling is needed.
- * 4) All operations modify state, so a spinlock is used.
+/*
+ * Theory of operations:
+ * 1) We use a list, protected by a spinlock, to add
+ * new entries from both BH and non-BH context.
+ * 2) In order to keep spinlock held for a small delay,
+ * we use a second list where are stored long lived
+ * entries, that are handled by the garbage collect thread
+ * fired by a workqueue.
+ * 3) This list is guarded by a mutex,
+ * so that the gc_task and dst_dev_event() can be synchronized.
*/
-static struct dst_entry *dst_garbage_list;
-#if RT_CACHE_DEBUG >= 2
-static atomic_t dst_total = ATOMIC_INIT(0);
-#endif
-static DEFINE_SPINLOCK(dst_lock);
-static unsigned long dst_gc_timer_expires;
-static unsigned long dst_gc_timer_inc = DST_GC_MAX;
-static void dst_run_gc(unsigned long);
-static void ___dst_free(struct dst_entry * dst);
+/*
+ * We want to keep lock & list close together
+ * to dirty as few cache lines as possible in __dst_free().
+ * As this is not a very strong hint, we dont force an alignment on SMP.
+ */
+static struct {
+ spinlock_t lock;
+ struct dst_entry *list;
+ unsigned long timer_inc;
+ unsigned long timer_expires;
+} dst_garbage = {
+ .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
+ .timer_inc = DST_GC_MAX,
+};
+static void dst_gc_task(struct work_struct *work);
+static void ___dst_free(struct dst_entry *dst);
+
+static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
-static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0);
+static DEFINE_MUTEX(dst_gc_mutex);
+/*
+ * long lived entries are maintained in this list, guarded by dst_gc_mutex
+ */
+static struct dst_entry *dst_busy_list;
-static void dst_run_gc(unsigned long dummy)
+static void dst_gc_task(struct work_struct *work)
{
int delayed = 0;
- int work_performed;
- struct dst_entry * dst, **dstp;
-
- if (!spin_trylock(&dst_lock)) {
- mod_timer(&dst_gc_timer, jiffies + HZ/10);
- return;
- }
-
- del_timer(&dst_gc_timer);
- dstp = &dst_garbage_list;
- work_performed = 0;
- while ((dst = *dstp) != NULL) {
- if (atomic_read(&dst->__refcnt)) {
- dstp = &dst->next;
+ int work_performed = 0;
+ unsigned long expires = ~0L;
+ struct dst_entry *dst, *next, head;
+ struct dst_entry *last = &head;
+
+ mutex_lock(&dst_gc_mutex);
+ next = dst_busy_list;
+
+loop:
+ while ((dst = next) != NULL) {
+ next = dst->next;
+ prefetch(&next->next);
+ cond_resched();
+ if (likely(atomic_read(&dst->__refcnt))) {
+ last->next = dst;
+ last = dst;
delayed++;
continue;
}
- *dstp = dst->next;
- work_performed = 1;
+ work_performed++;
dst = dst_destroy(dst);
if (dst) {
@@ -74,132 +94,160 @@ static void dst_run_gc(unsigned long dummy)
* But we do not have state "obsoleted, but
* referenced by parent", so it is right.
*/
- if (dst->obsolete > 1)
+ if (dst->obsolete > 0)
continue;
___dst_free(dst);
- dst->next = *dstp;
- *dstp = dst;
- dstp = &dst->next;
+ dst->next = next;
+ next = dst;
}
}
- if (!dst_garbage_list) {
- dst_gc_timer_inc = DST_GC_MAX;
- goto out;
+
+ spin_lock_bh(&dst_garbage.lock);
+ next = dst_garbage.list;
+ if (next) {
+ dst_garbage.list = NULL;
+ spin_unlock_bh(&dst_garbage.lock);
+ goto loop;
}
- if (!work_performed) {
- if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
- dst_gc_timer_expires = DST_GC_MAX;
- dst_gc_timer_inc += DST_GC_INC;
- } else {
- dst_gc_timer_inc = DST_GC_INC;
- dst_gc_timer_expires = DST_GC_MIN;
+ last->next = NULL;
+ dst_busy_list = head.next;
+ if (!dst_busy_list)
+ dst_garbage.timer_inc = DST_GC_MAX;
+ else {
+ /*
+ * if we freed less than 1/10 of delayed entries,
+ * we can sleep longer.
+ */
+ if (work_performed <= delayed/10) {
+ dst_garbage.timer_expires += dst_garbage.timer_inc;
+ if (dst_garbage.timer_expires > DST_GC_MAX)
+ dst_garbage.timer_expires = DST_GC_MAX;
+ dst_garbage.timer_inc += DST_GC_INC;
+ } else {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ }
+ expires = dst_garbage.timer_expires;
+ /*
+ * if the next desired timer is more than 4 seconds in the
+ * future then round the timer to whole seconds
+ */
+ if (expires > 4*HZ)
+ expires = round_jiffies_relative(expires);
+ schedule_delayed_work(&dst_gc_work, expires);
}
- dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
-#if RT_CACHE_DEBUG >= 2
- printk("dst_total: %d/%d %ld\n",
- atomic_read(&dst_total), delayed, dst_gc_timer_expires);
-#endif
- add_timer(&dst_gc_timer);
-out:
- spin_unlock(&dst_lock);
+ spin_unlock_bh(&dst_garbage.lock);
+ mutex_unlock(&dst_gc_mutex);
}
-static int dst_discard_in(struct sk_buff *skb)
+int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
+EXPORT_SYMBOL(dst_discard_sk);
-static int dst_discard_out(struct sk_buff *skb)
-{
- kfree_skb(skb);
- return 0;
-}
+const u32 dst_default_metrics[RTAX_MAX + 1] = {
+ /* This initializer is needed to force linker to place this variable
+ * into const section. Otherwise it might end into bss section.
+ * We really want to avoid false sharing on this variable, and catch
+ * any writes on it.
+ */
+ [RTAX_MAX] = 0xdeadbeef,
+};
-void * dst_alloc(struct dst_ops * ops)
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
{
- struct dst_entry * dst;
+ struct dst_entry *dst;
- if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
- if (ops->gc())
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops))
return NULL;
}
- dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC);
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
if (!dst)
return NULL;
- memset(dst, 0, ops->entry_size);
- atomic_set(&dst->__refcnt, 0);
+ dst->child = NULL;
+ dst->dev = dev;
+ if (dev)
+ dev_hold(dev);
dst->ops = ops;
- dst->lastuse = jiffies;
+ dst_init_metrics(dst, dst_default_metrics, true);
+ dst->expires = 0UL;
dst->path = dst;
- dst->input = dst_discard_in;
- dst->output = dst_discard_out;
-#if RT_CACHE_DEBUG >= 2
- atomic_inc(&dst_total);
+ dst->from = NULL;
+#ifdef CONFIG_XFRM
+ dst->xfrm = NULL;
#endif
- atomic_inc(&ops->entries);
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ dst->error = 0;
+ dst->obsolete = initial_obsolete;
+ dst->header_len = 0;
+ dst->trailer_len = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->tclassid = 0;
+#endif
+ atomic_set(&dst->__refcnt, initial_ref);
+ dst->__use = 0;
+ dst->lastuse = jiffies;
+ dst->flags = flags;
+ dst->pending_confirm = 0;
+ dst->next = NULL;
+ if (!(flags & DST_NOCOUNT))
+ dst_entries_add(ops, 1);
return dst;
}
+EXPORT_SYMBOL(dst_alloc);
-static void ___dst_free(struct dst_entry * dst)
+static void ___dst_free(struct dst_entry *dst)
{
/* The first case (dev==NULL) is required, when
protocol module is unloaded.
*/
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
- dst->input = dst_discard_in;
- dst->output = dst_discard_out;
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
}
- dst->obsolete = 2;
+ dst->obsolete = DST_OBSOLETE_DEAD;
}
-void __dst_free(struct dst_entry * dst)
+void __dst_free(struct dst_entry *dst)
{
- spin_lock_bh(&dst_lock);
+ spin_lock_bh(&dst_garbage.lock);
___dst_free(dst);
- dst->next = dst_garbage_list;
- dst_garbage_list = dst;
- if (dst_gc_timer_inc > DST_GC_INC) {
- dst_gc_timer_inc = DST_GC_INC;
- dst_gc_timer_expires = DST_GC_MIN;
- mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
+ dst->next = dst_garbage.list;
+ dst_garbage.list = dst;
+ if (dst_garbage.timer_inc > DST_GC_INC) {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ mod_delayed_work(system_wq, &dst_gc_work,
+ dst_garbage.timer_expires);
}
- spin_unlock_bh(&dst_lock);
+ spin_unlock_bh(&dst_garbage.lock);
}
+EXPORT_SYMBOL(__dst_free);
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
- struct neighbour *neigh;
- struct hh_cache *hh;
smp_rmb();
again:
- neigh = dst->neighbour;
- hh = dst->hh;
child = dst->child;
- dst->hh = NULL;
- if (hh && atomic_dec_and_test(&hh->hh_refcnt))
- kfree(hh);
-
- if (neigh) {
- dst->neighbour = NULL;
- neigh_release(neigh);
- }
-
- atomic_dec(&dst->ops->entries);
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
if (dst->ops->destroy)
dst->ops->destroy(dst);
if (dst->dev)
dev_put(dst->dev);
-#if RT_CACHE_DEBUG >= 2
- atomic_dec(&dst_total);
-#endif
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
@@ -219,6 +267,89 @@ again:
}
return NULL;
}
+EXPORT_SYMBOL(dst_destroy);
+
+static void dst_destroy_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst = dst_destroy(dst);
+ if (dst)
+ __dst_free(dst);
+}
+
+void dst_release(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ WARN_ON(newrefcnt < 0);
+ if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ call_rcu(&dst->rcu_head, dst_destroy_rcu);
+ }
+}
+EXPORT_SYMBOL(dst_release);
+
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
+/**
+ * __skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ * @force: if force is set, use noref version even for DST_NOCACHE entries
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
+{
+ WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ /* If dst not in cache, we must take a reference, because
+ * dst_release() will destroy dst as soon as its refcount becomes zero
+ */
+ if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
+ dst_hold(dst);
+ skb_dst_set(skb, dst);
+ } else {
+ skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+ }
+}
+EXPORT_SYMBOL(__skb_dst_set_noref);
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
@@ -228,8 +359,8 @@ again:
*
* Commented and originally written by Alexey.
*/
-static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
+static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int unregister)
{
if (dst->ops->ifdown)
dst->ops->ifdown(dst, dev, unregister);
@@ -238,33 +369,42 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
return;
if (!unregister) {
- dst->input = dst_discard_in;
- dst->output = dst_discard_out;
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
} else {
- dst->dev = &loopback_dev;
- dev_hold(&loopback_dev);
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
dev_put(dev);
- if (dst->neighbour && dst->neighbour->dev == dev) {
- dst->neighbour->dev = &loopback_dev;
- dev_put(dev);
- dev_hold(&loopback_dev);
- }
}
}
-static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int dst_dev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
{
- struct net_device *dev = ptr;
- struct dst_entry *dst;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dst_entry *dst, *last = NULL;
switch (event) {
- case NETDEV_UNREGISTER:
+ case NETDEV_UNREGISTER_FINAL:
case NETDEV_DOWN:
- spin_lock_bh(&dst_lock);
- for (dst = dst_garbage_list; dst; dst = dst->next) {
+ mutex_lock(&dst_gc_mutex);
+ for (dst = dst_busy_list; dst; dst = dst->next) {
+ last = dst;
dst_ifdown(dst, dev, event != NETDEV_DOWN);
}
- spin_unlock_bh(&dst_lock);
+
+ spin_lock_bh(&dst_garbage.lock);
+ dst = dst_garbage.list;
+ dst_garbage.list = NULL;
+ spin_unlock_bh(&dst_garbage.lock);
+
+ if (last)
+ last->next = dst;
+ else
+ dst_busy_list = dst;
+ for (; dst; dst = dst->next)
+ dst_ifdown(dst, dev, event != NETDEV_DOWN);
+ mutex_unlock(&dst_gc_mutex);
break;
}
return NOTIFY_DONE;
@@ -272,13 +412,10 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void
static struct notifier_block dst_dev_notifier = {
.notifier_call = dst_dev_event,
+ .priority = -10, /* must be called after other network notifiers */
};
void __init dst_init(void)
{
register_netdevice_notifier(&dst_dev_notifier);
}
-
-EXPORT_SYMBOL(__dst_free);
-EXPORT_SYMBOL(dst_alloc);
-EXPORT_SYMBOL(dst_destroy);
diff --git a/net/core/dv.c b/net/core/dv.c
deleted file mode 100644
index cf581407538..00000000000
--- a/net/core/dv.c
+++ /dev/null
@@ -1,549 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * Generic frame diversion
- *
- * Authors:
- * Benoit LOCHER: initial integration within the kernel with support for ethernet
- * Dave Miller: improvement on the code (correctness, performance and source files)
- *
- */
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <net/dst.h>
-#include <net/arp.h>
-#include <net/sock.h>
-#include <net/ipv6.h>
-#include <net/ip.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/checksum.h>
-#include <linux/divert.h>
-#include <linux/sockios.h>
-
-const char sysctl_divert_version[32]="0.46"; /* Current version */
-
-static int __init dv_init(void)
-{
- return 0;
-}
-module_init(dv_init);
-
-/*
- * Allocate a divert_blk for a device. This must be an ethernet nic.
- */
-int alloc_divert_blk(struct net_device *dev)
-{
- int alloc_size = (sizeof(struct divert_blk) + 3) & ~3;
-
- dev->divert = NULL;
- if (dev->type == ARPHRD_ETHER) {
- dev->divert = (struct divert_blk *)
- kmalloc(alloc_size, GFP_KERNEL);
- if (dev->divert == NULL) {
- printk(KERN_INFO "divert: unable to allocate divert_blk for %s\n",
- dev->name);
- return -ENOMEM;
- }
-
- memset(dev->divert, 0, sizeof(struct divert_blk));
- dev_hold(dev);
- }
-
- return 0;
-}
-
-/*
- * Free a divert_blk allocated by the above function, if it was
- * allocated on that device.
- */
-void free_divert_blk(struct net_device *dev)
-{
- if (dev->divert) {
- kfree(dev->divert);
- dev->divert=NULL;
- dev_put(dev);
- }
-}
-
-/*
- * Adds a tcp/udp (source or dest) port to an array
- */
-static int add_port(u16 ports[], u16 port)
-{
- int i;
-
- if (port == 0)
- return -EINVAL;
-
- /* Storing directly in network format for performance,
- * thanks Dave :)
- */
- port = htons(port);
-
- for (i = 0; i < MAX_DIVERT_PORTS; i++) {
- if (ports[i] == port)
- return -EALREADY;
- }
-
- for (i = 0; i < MAX_DIVERT_PORTS; i++) {
- if (ports[i] == 0) {
- ports[i] = port;
- return 0;
- }
- }
-
- return -ENOBUFS;
-}
-
-/*
- * Removes a port from an array tcp/udp (source or dest)
- */
-static int remove_port(u16 ports[], u16 port)
-{
- int i;
-
- if (port == 0)
- return -EINVAL;
-
- /* Storing directly in network format for performance,
- * thanks Dave !
- */
- port = htons(port);
-
- for (i = 0; i < MAX_DIVERT_PORTS; i++) {
- if (ports[i] == port) {
- ports[i] = 0;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-
-/* Some basic sanity checks on the arguments passed to divert_ioctl() */
-static int check_args(struct divert_cf *div_cf, struct net_device **dev)
-{
- char devname[32];
- int ret;
-
- if (dev == NULL)
- return -EFAULT;
-
- /* GETVERSION: all other args are unused */
- if (div_cf->cmd == DIVCMD_GETVERSION)
- return 0;
-
- /* Network device index should reasonably be between 0 and 1000 :) */
- if (div_cf->dev_index < 0 || div_cf->dev_index > 1000)
- return -EINVAL;
-
- /* Let's try to find the ifname */
- sprintf(devname, "eth%d", div_cf->dev_index);
- *dev = dev_get_by_name(devname);
-
- /* dev should NOT be null */
- if (*dev == NULL)
- return -EINVAL;
-
- ret = 0;
-
- /* user issuing the ioctl must be a super one :) */
- if (!capable(CAP_SYS_ADMIN)) {
- ret = -EPERM;
- goto out;
- }
-
- /* Device must have a divert_blk member NOT null */
- if ((*dev)->divert == NULL)
- ret = -EINVAL;
-out:
- dev_put(*dev);
- return ret;
-}
-
-/*
- * control function of the diverter
- */
-#if 0
-#define DVDBG(a) \
- printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a))
-#else
-#define DVDBG(a)
-#endif
-
-int divert_ioctl(unsigned int cmd, struct divert_cf __user *arg)
-{
- struct divert_cf div_cf;
- struct divert_blk *div_blk;
- struct net_device *dev;
- int ret;
-
- switch (cmd) {
- case SIOCGIFDIVERT:
- DVDBG("SIOCGIFDIVERT, copy_from_user");
- if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
- return -EFAULT;
- DVDBG("before check_args");
- ret = check_args(&div_cf, &dev);
- if (ret)
- return ret;
- DVDBG("after checkargs");
- div_blk = dev->divert;
-
- DVDBG("befre switch()");
- switch (div_cf.cmd) {
- case DIVCMD_GETSTATUS:
- /* Now, just give the user the raw divert block
- * for him to play with :)
- */
- if (copy_to_user(div_cf.arg1.ptr, dev->divert,
- sizeof(struct divert_blk)))
- return -EFAULT;
- break;
-
- case DIVCMD_GETVERSION:
- DVDBG("GETVERSION: checking ptr");
- if (div_cf.arg1.ptr == NULL)
- return -EINVAL;
- DVDBG("GETVERSION: copying data to userland");
- if (copy_to_user(div_cf.arg1.ptr,
- sysctl_divert_version, 32))
- return -EFAULT;
- DVDBG("GETVERSION: data copied");
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case SIOCSIFDIVERT:
- if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
- return -EFAULT;
-
- ret = check_args(&div_cf, &dev);
- if (ret)
- return ret;
-
- div_blk = dev->divert;
-
- switch(div_cf.cmd) {
- case DIVCMD_RESET:
- div_blk->divert = 0;
- div_blk->protos = DIVERT_PROTO_NONE;
- memset(div_blk->tcp_dst, 0,
- MAX_DIVERT_PORTS * sizeof(u16));
- memset(div_blk->tcp_src, 0,
- MAX_DIVERT_PORTS * sizeof(u16));
- memset(div_blk->udp_dst, 0,
- MAX_DIVERT_PORTS * sizeof(u16));
- memset(div_blk->udp_src, 0,
- MAX_DIVERT_PORTS * sizeof(u16));
- return 0;
-
- case DIVCMD_DIVERT:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ENABLE:
- if (div_blk->divert)
- return -EALREADY;
- div_blk->divert = 1;
- break;
-
- case DIVARG1_DISABLE:
- if (!div_blk->divert)
- return -EALREADY;
- div_blk->divert = 0;
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_IP:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ENABLE:
- if (div_blk->protos & DIVERT_PROTO_IP)
- return -EALREADY;
- div_blk->protos |= DIVERT_PROTO_IP;
- break;
-
- case DIVARG1_DISABLE:
- if (!(div_blk->protos & DIVERT_PROTO_IP))
- return -EALREADY;
- div_blk->protos &= ~DIVERT_PROTO_IP;
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_TCP:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ENABLE:
- if (div_blk->protos & DIVERT_PROTO_TCP)
- return -EALREADY;
- div_blk->protos |= DIVERT_PROTO_TCP;
- break;
-
- case DIVARG1_DISABLE:
- if (!(div_blk->protos & DIVERT_PROTO_TCP))
- return -EALREADY;
- div_blk->protos &= ~DIVERT_PROTO_TCP;
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_TCPDST:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ADD:
- return add_port(div_blk->tcp_dst,
- div_cf.arg2.uint16);
-
- case DIVARG1_REMOVE:
- return remove_port(div_blk->tcp_dst,
- div_cf.arg2.uint16);
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_TCPSRC:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ADD:
- return add_port(div_blk->tcp_src,
- div_cf.arg2.uint16);
-
- case DIVARG1_REMOVE:
- return remove_port(div_blk->tcp_src,
- div_cf.arg2.uint16);
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_UDP:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ENABLE:
- if (div_blk->protos & DIVERT_PROTO_UDP)
- return -EALREADY;
- div_blk->protos |= DIVERT_PROTO_UDP;
- break;
-
- case DIVARG1_DISABLE:
- if (!(div_blk->protos & DIVERT_PROTO_UDP))
- return -EALREADY;
- div_blk->protos &= ~DIVERT_PROTO_UDP;
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_UDPDST:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ADD:
- return add_port(div_blk->udp_dst,
- div_cf.arg2.uint16);
-
- case DIVARG1_REMOVE:
- return remove_port(div_blk->udp_dst,
- div_cf.arg2.uint16);
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_UDPSRC:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ADD:
- return add_port(div_blk->udp_src,
- div_cf.arg2.uint16);
-
- case DIVARG1_REMOVE:
- return remove_port(div_blk->udp_src,
- div_cf.arg2.uint16);
-
- default:
- return -EINVAL;
- }
-
- break;
-
- case DIVCMD_ICMP:
- switch(div_cf.arg1.int32) {
- case DIVARG1_ENABLE:
- if (div_blk->protos & DIVERT_PROTO_ICMP)
- return -EALREADY;
- div_blk->protos |= DIVERT_PROTO_ICMP;
- break;
-
- case DIVARG1_DISABLE:
- if (!(div_blk->protos & DIVERT_PROTO_ICMP))
- return -EALREADY;
- div_blk->protos &= ~DIVERT_PROTO_ICMP;
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- default:
- return -EINVAL;
- }
-
- break;
-
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-
-/*
- * Check if packet should have its dest mac address set to the box itself
- * for diversion
- */
-
-#define ETH_DIVERT_FRAME(skb) \
- memcpy(eth_hdr(skb), skb->dev->dev_addr, ETH_ALEN); \
- skb->pkt_type=PACKET_HOST
-
-void divert_frame(struct sk_buff *skb)
-{
- struct ethhdr *eth = eth_hdr(skb);
- struct iphdr *iph;
- struct tcphdr *tcph;
- struct udphdr *udph;
- struct divert_blk *divert = skb->dev->divert;
- int i, src, dst;
- unsigned char *skb_data_end = skb->data + skb->len;
-
- /* Packet is already aimed at us, return */
- if (!compare_ether_addr(eth->h_dest, skb->dev->dev_addr))
- return;
-
- /* proto is not IP, do nothing */
- if (eth->h_proto != htons(ETH_P_IP))
- return;
-
- /* Divert all IP frames ? */
- if (divert->protos & DIVERT_PROTO_IP) {
- ETH_DIVERT_FRAME(skb);
- return;
- }
-
- /* Check for possible (maliciously) malformed IP frame (thanks Dave) */
- iph = (struct iphdr *) skb->data;
- if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) {
- printk(KERN_INFO "divert: malformed IP packet !\n");
- return;
- }
-
- switch (iph->protocol) {
- /* Divert all ICMP frames ? */
- case IPPROTO_ICMP:
- if (divert->protos & DIVERT_PROTO_ICMP) {
- ETH_DIVERT_FRAME(skb);
- return;
- }
- break;
-
- /* Divert all TCP frames ? */
- case IPPROTO_TCP:
- if (divert->protos & DIVERT_PROTO_TCP) {
- ETH_DIVERT_FRAME(skb);
- return;
- }
-
- /* Check for possible (maliciously) malformed IP
- * frame (thanx Dave)
- */
- tcph = (struct tcphdr *)
- (((unsigned char *)iph) + (iph->ihl<<2));
- if (((unsigned char *)(tcph+1)) >= skb_data_end) {
- printk(KERN_INFO "divert: malformed TCP packet !\n");
- return;
- }
-
- /* Divert some tcp dst/src ports only ?*/
- for (i = 0; i < MAX_DIVERT_PORTS; i++) {
- dst = divert->tcp_dst[i];
- src = divert->tcp_src[i];
- if ((dst && dst == tcph->dest) ||
- (src && src == tcph->source)) {
- ETH_DIVERT_FRAME(skb);
- return;
- }
- }
- break;
-
- /* Divert all UDP frames ? */
- case IPPROTO_UDP:
- if (divert->protos & DIVERT_PROTO_UDP) {
- ETH_DIVERT_FRAME(skb);
- return;
- }
-
- /* Check for possible (maliciously) malformed IP
- * packet (thanks Dave)
- */
- udph = (struct udphdr *)
- (((unsigned char *)iph) + (iph->ihl<<2));
- if (((unsigned char *)(udph+1)) >= skb_data_end) {
- printk(KERN_INFO
- "divert: malformed UDP packet !\n");
- return;
- }
-
- /* Divert some udp dst/src ports only ? */
- for (i = 0; i < MAX_DIVERT_PORTS; i++) {
- dst = divert->udp_dst[i];
- src = divert->udp_src[i];
- if ((dst && dst == udph->dest) ||
- (src && src == udph->source)) {
- ETH_DIVERT_FRAME(skb);
- return;
- }
- }
- break;
- }
-}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e6f76106a99..17cb912793f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -3,10 +3,12 @@
* Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
*
* This file is where we call all the ethtool_ops commands to get
- * the information ethtool needs. We fall back to calling do_ioctl()
- * for drivers which haven't been converted to ethtool_ops yet.
+ * the information ethtool needs.
*
- * It's GPL, stupid.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
*/
#include <linux/module.h>
@@ -15,9 +17,16 @@
#include <linux/errno.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>
-#include <asm/uaccess.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
-/*
+/*
* Some useful ethtool_ops methods that're device independent.
* If we find that all drivers want to do the same thing here,
* we can turn these into dev_() function calls.
@@ -27,98 +36,317 @@ u32 ethtool_op_get_link(struct net_device *dev)
{
return netif_carrier_ok(dev) ? 1 : 0;
}
+EXPORT_SYMBOL(ethtool_op_get_link);
-u32 ethtool_op_get_tx_csum(struct net_device *dev)
+int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
{
- return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
+ info->so_timestamping =
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ return 0;
}
+EXPORT_SYMBOL(ethtool_op_get_ts_info);
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
+/* Handlers for each ethtool command */
+
+#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
+
+static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
+ [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
+};
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
- if (data)
- dev->features |= NETIF_F_IP_CSUM;
- else
- dev->features &= ~NETIF_F_IP_CSUM;
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 __user *sizeaddr;
+ u32 copy_size;
+ int i;
+
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
+
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
+
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ return -EFAULT;
return 0;
}
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
{
- if (data)
- dev->features |= NETIF_F_HW_CSUM;
- else
- dev->features &= ~NETIF_F_HW_CSUM;
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
- return 0;
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
+
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
+ __netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
}
-u32 ethtool_op_get_sg(struct net_device *dev)
+
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
- return (dev->features & NETIF_F_SG) != 0;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
}
-int ethtool_op_set_sg(struct net_device *dev, u32 data)
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
{
- if (data)
- dev->features |= NETIF_F_SG;
- else
- dev->features &= ~NETIF_F_SG;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
- return 0;
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
}
-u32 ethtool_op_get_tso(struct net_device *dev)
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
{
- return (dev->features & NETIF_F_TSO) != 0;
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GUFO:
+ case ETHTOOL_SUFO:
+ return NETIF_F_UFO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
}
-int ethtool_op_set_tso(struct net_device *dev, u32 data)
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
{
- if (data)
- dev->features |= NETIF_F_TSO;
- else
- dev->features &= ~NETIF_F_TSO;
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
return 0;
}
-int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data)
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
{
- unsigned char len = dev->addr_len;
- if ( addr->size < len )
- return -ETOOSMALL;
-
- addr->size = len;
- memcpy(data, dev->perm_addr, len);
+ struct ethtool_value edata;
+ netdev_features_t mask;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (!mask)
+ return -EOPNOTSUPP;
+
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
+
+ __netdev_update_features(dev);
+
return 0;
}
-
-u32 ethtool_op_get_ufo(struct net_device *dev)
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
+
+static u32 __ethtool_get_flags(struct net_device *dev)
{
- return (dev->features & NETIF_F_UFO) != 0;
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
}
-int ethtool_op_set_ufo(struct net_device *dev, u32 data)
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
{
- if (data)
- dev->features |= NETIF_F_UFO;
- else
- dev->features &= ~NETIF_F_UFO;
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
+ return -EINVAL;
+
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
return 0;
}
-/* Handlers for each ethtool command */
-
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
- struct ethtool_cmd cmd = { ETHTOOL_GSET };
- int err;
+ ASSERT_RTNL();
if (!dev->ethtool_ops->get_settings)
return -EOPNOTSUPP;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
+ memset(cmd, 0, sizeof(struct ethtool_cmd));
+ cmd->cmd = ETHTOOL_GSET;
+ return dev->ethtool_ops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(__ethtool_get_settings);
+
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ int err;
+ struct ethtool_cmd cmd;
+
+ err = __ethtool_get_settings(dev, &cmd);
if (err < 0)
return err;
@@ -140,22 +368,42 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
return dev->ethtool_ops->set_settings(dev, &cmd);
}
-static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
+ void __user *useraddr)
{
struct ethtool_drvinfo info;
- struct ethtool_ops *ops = dev->ethtool_ops;
-
- if (!ops->get_drvinfo)
- return -EOPNOTSUPP;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
memset(&info, 0, sizeof(info));
info.cmd = ETHTOOL_GDRVINFO;
- ops->get_drvinfo(dev, &info);
+ if (ops->get_drvinfo) {
+ ops->get_drvinfo(dev, &info);
+ } else if (dev->dev.parent && dev->dev.parent->driver) {
+ strlcpy(info.bus_info, dev_name(dev->dev.parent),
+ sizeof(info.bus_info));
+ strlcpy(info.driver, dev->dev.parent->driver->name,
+ sizeof(info.driver));
+ } else {
+ return -EOPNOTSUPP;
+ }
- if (ops->self_test_count)
- info.testinfo_len = ops->self_test_count(dev);
- if (ops->get_stats_count)
- info.n_stats = ops->get_stats_count(dev);
+ /*
+ * this method of obtaining string set info is deprecated;
+ * Use ETHTOOL_GSSET_INFO instead.
+ */
+ if (ops->get_sset_count) {
+ int rc;
+
+ rc = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (rc >= 0)
+ info.testinfo_len = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (rc >= 0)
+ info.n_stats = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+ if (rc >= 0)
+ info.n_priv_flags = rc;
+ }
if (ops->get_regs_len)
info.regdump_len = ops->get_regs_len(dev);
if (ops->get_eeprom_len)
@@ -166,10 +414,438 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
return 0;
}
+static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_sset_info info;
+ u64 sset_mask;
+ int i, idx = 0, n_bits = 0, ret, rc;
+ u32 *info_buf = NULL;
+
+ if (copy_from_user(&info, useraddr, sizeof(info)))
+ return -EFAULT;
+
+ /* store copy of mask, because we zero struct later on */
+ sset_mask = info.sset_mask;
+ if (!sset_mask)
+ return 0;
+
+ /* calculate size of return buffer */
+ n_bits = hweight64(sset_mask);
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GSSET_INFO;
+
+ info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /*
+ * fill return buffer based on input bitmask and successful
+ * get_sset_count return
+ */
+ for (i = 0; i < 64; i++) {
+ if (!(sset_mask & (1ULL << i)))
+ continue;
+
+ rc = __ethtool_get_sset_count(dev, i);
+ if (rc >= 0) {
+ info.sset_mask |= (1ULL << i);
+ info_buf[idx++] = rc;
+ }
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ goto out;
+
+ useraddr += offsetof(struct ethtool_sset_info, data);
+ if (copy_to_user(useraddr, info_buf, idx * sizeof(u32)))
+ goto out;
+
+ ret = 0;
+
+out:
+ kfree(info_buf);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!dev->ethtool_ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition. */
+ if (cmd == ETHTOOL_SRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ copy_to_user(useraddr, &info, info_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition. */
+ if (cmd == ETHTOOL_GRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+ rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
+ GFP_USER);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, info_size))
+ goto err_out;
+
+ if (rule_buf) {
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ if (copy_to_user(useraddr, rule_buf,
+ info.rule_cnt * sizeof(u32)))
+ goto err_out;
+ }
+ ret = 0;
+
+err_out:
+ kfree(rule_buf);
+
+ return ret;
+}
+
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ struct ethtool_rxnfc *rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= rx_rings->data)
+ return -EINVAL;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ u32 user_size, dev_size;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &dev_size, sizeof(dev_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < dev_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ indir, dev_size * sizeof(indir[0])))
+ ret = -EFAULT;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_rxnfc rx_rings;
+ u32 user_size, dev_size, i;
+ u32 *indir;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
+
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
+ !ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ dev_size = ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (user_size != 0 && user_size != dev_size)
+ return -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ if (user_size == 0) {
+ for (i = 0; i < dev_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + ringidx_offset,
+ &rx_rings,
+ dev_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ops->set_rxfh(dev, indir, NULL);
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 user_indir_size, user_key_size;
+ u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh rxfh;
+ u32 total_size;
+ u32 indir_bytes;
+ u32 *indir = NULL;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+
+ if (!(dev->ethtool_ops->get_rxfh_indir_size ||
+ dev->ethtool_ops->get_rxfh_key_size) ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
+
+ rxfh.indir_size = dev_indir_size;
+ rxfh.key_size = dev_key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size and key size. Otherwise, if the User size is
+ * not equal to device table size or key size it's an error.
+ */
+ if (!user_indir_size && !user_key_size)
+ return 0;
+
+ if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
+ (user_key_size && (user_key_size != dev_key_size)))
+ return -EINVAL;
+
+ indir_bytes = user_indir_size * sizeof(indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (user_indir_size)
+ indir = (u32 *)rss_config;
+
+ if (user_key_size)
+ hkey = rss_config + indir_bytes;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
+ if (!ret) {
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size))
+ ret = -EFAULT;
+ }
+
+ kfree(rss_config);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings;
+ struct ethtool_rxfh rxfh;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 *indir = NULL, indir_bytes = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+
+ if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
+ !ops->get_rxnfc || !ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
+
+ /* If either indir or hash key is valid, proceed further.
+ * It is not valid to request that both be unchanged.
+ */
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0))
+ return -EINVAL;
+
+ if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ indir_bytes = dev_indir_size * sizeof(indir[0]);
+
+ rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ /* rxfh.indir_size == 0 means reset the indir table to default.
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ indir = (u32 *)rss_config;
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + rss_cfg_offset,
+ &rx_rings,
+ rxfh.indir_size);
+ if (ret)
+ goto out;
+ } else if (rxfh.indir_size == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ }
+
+ if (rxfh.key_size) {
+ hkey = rss_config + indir_bytes;
+ if (copy_from_user(hkey,
+ useraddr + rss_cfg_offset + indir_bytes,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ ret = ops->set_rxfh(dev, indir, hkey);
+
+out:
+ kfree(rss_config);
+ return ret;
+}
+
static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
{
struct ethtool_regs regs;
- struct ethtool_ops *ops = dev->ethtool_ops;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
void *regbuf;
int reglen, ret;
@@ -183,8 +859,8 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = kmalloc(reglen, GFP_USER);
- if (!regbuf)
+ regbuf = vzalloc(reglen);
+ if (reglen && !regbuf)
return -ENOMEM;
ops->get_regs(dev, &regs, regbuf);
@@ -193,18 +869,38 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (copy_to_user(useraddr, &regs, sizeof(regs)))
goto out;
useraddr += offsetof(struct ethtool_regs, data);
- if (copy_to_user(useraddr, regbuf, regs.len))
+ if (regbuf && copy_to_user(useraddr, regbuf, regs.len))
goto out;
ret = 0;
out:
- kfree(regbuf);
+ vfree(regbuf);
return ret;
}
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value reset;
+ int ret;
+
+ if (!dev->ethtool_ops->reset)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&reset, useraddr, sizeof(reset)))
+ return -EFAULT;
+
+ ret = dev->ethtool_ops->reset(dev, &reset.data);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &reset, sizeof(reset)))
+ return -EFAULT;
+ return 0;
+}
+
static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_wolinfo wol = { ETHTOOL_GWOL };
+ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
if (!dev->ethtool_ops->get_wol)
return -EOPNOTSUPP;
@@ -229,32 +925,38 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
return dev->ethtool_ops->set_wol(dev, &wol);
}
-static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GMSGLVL };
+ struct ethtool_eee edata;
+ int rc;
- if (!dev->ethtool_ops->get_msglevel)
+ if (!dev->ethtool_ops->get_eee)
return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_msglevel(dev);
+ memset(&edata, 0, sizeof(struct ethtool_eee));
+ edata.cmd = ETHTOOL_GEEE;
+ rc = dev->ethtool_ops->get_eee(dev, &edata);
+
+ if (rc)
+ return rc;
if (copy_to_user(useraddr, &edata, sizeof(edata)))
return -EFAULT;
+
return 0;
}
-static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr)
+static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_value edata;
+ struct ethtool_eee edata;
- if (!dev->ethtool_ops->set_msglevel)
+ if (!dev->ethtool_ops->set_eee)
return -EOPNOTSUPP;
if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
- dev->ethtool_ops->set_msglevel(dev, edata.data);
- return 0;
+ return dev->ethtool_ops->set_eee(dev, &edata);
}
static int ethtool_nway_reset(struct net_device *dev)
@@ -265,29 +967,30 @@ static int ethtool_nway_reset(struct net_device *dev)
return dev->ethtool_ops->nway_reset(dev);
}
-static int ethtool_get_link(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GLINK };
+ struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
if (!dev->ethtool_ops->get_link)
return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_link(dev);
+ edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
if (copy_to_user(useraddr, &edata, sizeof(edata)))
return -EFAULT;
return 0;
}
-static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
+ int (*getter)(struct net_device *,
+ struct ethtool_eeprom *, u8 *),
+ u32 total_len)
{
struct ethtool_eeprom eeprom;
- struct ethtool_ops *ops = dev->ethtool_ops;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
u8 *data;
- int ret;
-
- if (!ops->get_eeprom || !ops->get_eeprom_len)
- return -EOPNOTSUPP;
+ int ret = 0;
if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
return -EFAULT;
@@ -297,39 +1000,57 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
return -EINVAL;
/* Check for exceeding total eeprom len */
- if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+ if (eeprom.offset + eeprom.len > total_len)
return -EINVAL;
- data = kmalloc(eeprom.len, GFP_USER);
+ data = kmalloc(PAGE_SIZE, GFP_USER);
if (!data)
return -ENOMEM;
- ret = -EFAULT;
- if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len))
- goto out;
-
- ret = ops->get_eeprom(dev, &eeprom, data);
- if (ret)
- goto out;
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ ret = getter(dev, &eeprom, data);
+ if (ret)
+ break;
+ if (copy_to_user(userbuf, data, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
- ret = -EFAULT;
+ eeprom.len = userbuf - (useraddr + sizeof(eeprom));
+ eeprom.offset -= eeprom.len;
if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
- goto out;
- if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len))
- goto out;
- ret = 0;
+ ret = -EFAULT;
- out:
kfree(data);
return ret;
}
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_eeprom || !ops->get_eeprom_len)
+ return -EOPNOTSUPP;
+
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom,
+ ops->get_eeprom_len(dev));
+}
+
static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
{
struct ethtool_eeprom eeprom;
- struct ethtool_ops *ops = dev->ethtool_ops;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
u8 *data;
- int ret;
+ int ret = 0;
if (!ops->set_eeprom || !ops->get_eeprom_len)
return -EOPNOTSUPP;
@@ -345,29 +1066,34 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
return -EINVAL;
- data = kmalloc(eeprom.len, GFP_USER);
+ data = kmalloc(PAGE_SIZE, GFP_USER);
if (!data)
return -ENOMEM;
- ret = -EFAULT;
- if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len))
- goto out;
-
- ret = ops->set_eeprom(dev, &eeprom, data);
- if (ret)
- goto out;
-
- if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len))
- ret = -EFAULT;
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ if (copy_from_user(data, userbuf, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = ops->set_eeprom(dev, &eeprom, data);
+ if (ret)
+ break;
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
- out:
kfree(data);
return ret;
}
-static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE };
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
if (!dev->ethtool_ops->get_coalesce)
return -EOPNOTSUPP;
@@ -379,7 +1105,8 @@ static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
return 0;
}
-static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
+static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
+ void __user *useraddr)
{
struct ethtool_coalesce coalesce;
@@ -394,7 +1121,7 @@ static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM };
+ struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
if (!dev->ethtool_ops->get_ringparam)
return -EOPNOTSUPP;
@@ -419,6 +1146,35 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
return dev->ethtool_ops->set_ringparam(dev, &ringparam);
}
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+
+ if (copy_to_user(useraddr, &channels, sizeof(channels)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels;
+
+ if (!dev->ethtool_ops->set_channels)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_channels(dev, &channels);
+}
+
static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
@@ -437,7 +1193,7 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_pauseparam pauseparam;
- if (!dev->ethtool_ops->get_pauseparam)
+ if (!dev->ethtool_ops->set_pauseparam)
return -EOPNOTSUPP;
if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
@@ -446,364 +1202,485 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
}
-static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr)
+static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GRXCSUM };
+ struct ethtool_test test;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, test_len;
- if (!dev->ethtool_ops->get_rx_csum)
+ if (!ops->self_test || !ops->get_sset_count)
return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_rx_csum(dev);
+ test_len = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (test_len < 0)
+ return test_len;
+ WARN_ON(test_len == 0);
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ if (copy_from_user(&test, useraddr, sizeof(test)))
return -EFAULT;
- return 0;
+
+ test.len = test_len;
+ data = kmalloc(test_len * sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ops->self_test(dev, &test, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &test, sizeof(test)))
+ goto out;
+ useraddr += sizeof(test);
+ if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
}
-static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_rx_csum)
- return -EOPNOTSUPP;
+ struct ethtool_gstrings gstrings;
+ u8 *data;
+ int ret;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
return -EFAULT;
- dev->ethtool_ops->set_rx_csum(dev, edata.data);
- return 0;
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
+ if (ret < 0)
+ return ret;
+
+ gstrings.len = ret;
+
+ data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ __ethtool_get_strings(dev, gstrings.string_set, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+ goto out;
+ useraddr += sizeof(gstrings);
+ if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
}
-static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr)
+static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GTXCSUM };
+ struct ethtool_value id;
+ static bool busy;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int rc;
- if (!dev->ethtool_ops->get_tx_csum)
+ if (!ops->set_phys_id)
return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_tx_csum(dev);
+ if (busy)
+ return -EBUSY;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ if (copy_from_user(&id, useraddr, sizeof(id)))
return -EFAULT;
- return 0;
-}
-static int __ethtool_set_sg(struct net_device *dev, u32 data)
-{
- int err;
+ rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
+ if (rc < 0)
+ return rc;
- if (!data && dev->ethtool_ops->set_tso) {
- err = dev->ethtool_ops->set_tso(dev, 0);
- if (err)
- return err;
+ /* Drop the RTNL lock while waiting, but prevent reentry or
+ * removal of the device.
+ */
+ busy = true;
+ dev_hold(dev);
+ rtnl_unlock();
+
+ if (rc == 0) {
+ /* Driver will handle this itself */
+ schedule_timeout_interruptible(
+ id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
+ } else {
+ /* Driver expects to be called at twice the frequency in rc */
+ int n = rc * 2, i, interval = HZ / n;
+
+ /* Count down seconds */
+ do {
+ /* Count down iterations per second */
+ i = n;
+ do {
+ rtnl_lock();
+ rc = ops->set_phys_id(dev,
+ (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && --i != 0);
+ } while (!signal_pending(current) &&
+ (id.data == 0 || --id.data != 0));
}
- if (!data && dev->ethtool_ops->set_ufo) {
- err = dev->ethtool_ops->set_ufo(dev, 0);
- if (err)
- return err;
- }
- return dev->ethtool_ops->set_sg(dev, data);
+ rtnl_lock();
+ dev_put(dev);
+ busy = false;
+
+ (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
+ return rc;
}
-static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value edata;
- int err;
+ struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, n_stats;
- if (!dev->ethtool_ops->set_tx_csum)
+ if (!ops->get_ethtool_stats || !ops->get_sset_count)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (n_stats < 0)
+ return n_stats;
+ WARN_ON(n_stats == 0);
+
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
return -EFAULT;
- if (!edata.data && dev->ethtool_ops->set_sg) {
- err = __ethtool_set_sg(dev, 0);
- if (err)
- return err;
- }
+ stats.n_stats = n_stats;
+ data = kmalloc(n_stats * sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ops->get_ethtool_stats(dev, &stats, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ goto out;
+ ret = 0;
- return dev->ethtool_ops->set_tx_csum(dev, edata.data);
+ out:
+ kfree(data);
+ return ret;
}
-static int ethtool_get_sg(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GSG };
+ struct ethtool_perm_addr epaddr;
- if (!dev->ethtool_ops->get_sg)
- return -EOPNOTSUPP;
+ if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
+ return -EFAULT;
- edata.data = dev->ethtool_ops->get_sg(dev);
+ if (epaddr.size < dev->addr_len)
+ return -ETOOSMALL;
+ epaddr.size = dev->addr_len;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+ return -EFAULT;
+ useraddr += sizeof(epaddr);
+ if (copy_to_user(useraddr, dev->perm_addr, epaddr.size))
return -EFAULT;
return 0;
}
-static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
+ u32 cmd, u32 (*actor)(struct net_device *))
{
- struct ethtool_value edata;
+ struct ethtool_value edata = { .cmd = cmd };
- if (!dev->ethtool_ops->set_sg)
+ if (!actor)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data &&
- !(dev->features & (NETIF_F_IP_CSUM |
- NETIF_F_NO_CSUM |
- NETIF_F_HW_CSUM)))
- return -EINVAL;
+ edata.data = actor(dev);
- return __ethtool_set_sg(dev, edata.data);
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
}
-static int ethtool_get_tso(struct net_device *dev, char __user *useraddr)
+static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr,
+ void (*actor)(struct net_device *, u32))
{
- struct ethtool_value edata = { ETHTOOL_GTSO };
+ struct ethtool_value edata;
- if (!dev->ethtool_ops->get_tso)
+ if (!actor)
return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_tso(dev);
-
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
+
+ actor(dev, edata.data);
return 0;
}
-static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
+static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
+ int (*actor)(struct net_device *, u32))
{
struct ethtool_value edata;
- if (!dev->ethtool_ops->set_tso)
+ if (!actor)
return -EOPNOTSUPP;
if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
-
- return dev->ethtool_ops->set_tso(dev, edata.data);
+ return actor(dev, edata.data);
}
-static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
+static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
+ char __user *useraddr)
{
- struct ethtool_value edata = { ETHTOOL_GTSO };
+ struct ethtool_flash efl;
+
+ if (copy_from_user(&efl, useraddr, sizeof(efl)))
+ return -EFAULT;
- if (!dev->ethtool_ops->get_ufo)
+ if (!dev->ethtool_ops->flash_device)
return -EOPNOTSUPP;
- edata.data = dev->ethtool_ops->get_ufo(dev);
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
+
+ efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+
+ return dev->ethtool_ops->flash_device(dev, &efl);
}
-static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
+
+static int ethtool_set_dump(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_value edata;
+ struct ethtool_dump dump;
- if (!dev->ethtool_ops->set_ufo)
+ if (!dev->ethtool_ops->set_dump)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
- if (edata.data && !(dev->features & NETIF_F_HW_CSUM))
- return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, edata.data);
+
+ return dev->ethtool_ops->set_dump(dev, &dump);
}
-static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
+static int ethtool_get_dump_flag(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_test test;
- struct ethtool_ops *ops = dev->ethtool_ops;
- u64 *data;
int ret;
+ struct ethtool_dump dump;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
- if (!ops->self_test || !ops->self_test_count)
+ if (!ops->get_dump_flag)
return -EOPNOTSUPP;
- if (copy_from_user(&test, useraddr, sizeof(test)))
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
return -EFAULT;
- test.len = ops->self_test_count(dev);
- data = kmalloc(test.len * sizeof(u64), GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ops->self_test(dev, &test, data);
-
- ret = -EFAULT;
- if (copy_to_user(useraddr, &test, sizeof(test)))
- goto out;
- useraddr += sizeof(test);
- if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
- goto out;
- ret = 0;
+ ret = ops->get_dump_flag(dev, &dump);
+ if (ret)
+ return ret;
- out:
- kfree(data);
- return ret;
+ if (copy_to_user(useraddr, &dump, sizeof(dump)))
+ return -EFAULT;
+ return 0;
}
-static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_dump_data(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_gstrings gstrings;
- struct ethtool_ops *ops = dev->ethtool_ops;
- u8 *data;
int ret;
+ __u32 len;
+ struct ethtool_dump dump, tmp;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data = NULL;
- if (!ops->get_strings)
+ if (!ops->get_dump_data || !ops->get_dump_flag)
return -EOPNOTSUPP;
- if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
return -EFAULT;
- switch (gstrings.string_set) {
- case ETH_SS_TEST:
- if (!ops->self_test_count)
- return -EOPNOTSUPP;
- gstrings.len = ops->self_test_count(dev);
- break;
- case ETH_SS_STATS:
- if (!ops->get_stats_count)
- return -EOPNOTSUPP;
- gstrings.len = ops->get_stats_count(dev);
- break;
- default:
- return -EINVAL;
- }
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
+ ret = ops->get_dump_flag(dev, &tmp);
+ if (ret)
+ return ret;
- data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+ len = min(tmp.len, dump.len);
+ if (!len)
+ return -EFAULT;
+
+ /* Don't ever let the driver think there's more space available
+ * than it requested with .get_dump_flag().
+ */
+ dump.len = len;
+
+ /* Always allocate enough space to hold the whole thing so that the
+ * driver does not need to check the length and bother with partial
+ * dumping.
+ */
+ data = vzalloc(tmp.len);
if (!data)
return -ENOMEM;
+ ret = ops->get_dump_data(dev, &dump, data);
+ if (ret)
+ goto out;
- ops->get_strings(dev, gstrings.string_set, data);
+ /* There are two sane possibilities:
+ * 1. The driver's .get_dump_data() does not touch dump.len.
+ * 2. Or it may set dump.len to how much it really writes, which
+ * should be tmp.len (or len if it can do a partial dump).
+ * In any case respond to userspace with the actual length of data
+ * it's receiving.
+ */
+ WARN_ON(dump.len != len && dump.len != tmp.len);
+ dump.len = len;
- ret = -EFAULT;
- if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
- goto out;
- useraddr += sizeof(gstrings);
- if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ if (copy_to_user(useraddr, &dump, sizeof(dump))) {
+ ret = -EFAULT;
goto out;
- ret = 0;
-
- out:
- kfree(data);
+ }
+ useraddr += offsetof(struct ethtool_dump, data);
+ if (copy_to_user(useraddr, data, len))
+ ret = -EFAULT;
+out:
+ vfree(data);
return ret;
}
-static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_value id;
+ int err = 0;
+ struct ethtool_ts_info info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
- if (!dev->ethtool_ops->phys_id)
- return -EOPNOTSUPP;
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GET_TS_INFO;
+
+ if (phydev && phydev->drv && phydev->drv->ts_info) {
+ err = phydev->drv->ts_info(phydev, &info);
+ } else if (ops->get_ts_info) {
+ err = ops->get_ts_info(dev, &info);
+ } else {
+ info.so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info.phc_index = -1;
+ }
- if (copy_from_user(&id, useraddr, sizeof(id)))
- return -EFAULT;
+ if (err)
+ return err;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ err = -EFAULT;
- return dev->ethtool_ops->phys_id(dev, id.data);
+ return err;
}
-static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_module_info(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_stats stats;
- struct ethtool_ops *ops = dev->ethtool_ops;
- u64 *data;
int ret;
+ struct ethtool_modinfo modinfo;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
- if (!ops->get_ethtool_stats || !ops->get_stats_count)
+ if (!ops->get_module_info)
return -EOPNOTSUPP;
- if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ if (copy_from_user(&modinfo, useraddr, sizeof(modinfo)))
return -EFAULT;
- stats.n_stats = ops->get_stats_count(dev);
- data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ops->get_ethtool_stats(dev, &stats, data);
+ ret = ops->get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
- ret = -EFAULT;
- if (copy_to_user(useraddr, &stats, sizeof(stats)))
- goto out;
- useraddr += sizeof(stats);
- if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
- goto out;
- ret = 0;
+ if (copy_to_user(useraddr, &modinfo, sizeof(modinfo)))
+ return -EFAULT;
- out:
- kfree(data);
- return ret;
+ return 0;
}
-static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_module_eeprom(struct net_device *dev,
+ void __user *useraddr)
{
- struct ethtool_perm_addr epaddr;
- u8 *data;
int ret;
+ struct ethtool_modinfo modinfo;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
- if (!dev->ethtool_ops->get_perm_addr)
+ if (!ops->get_module_info || !ops->get_module_eeprom)
return -EOPNOTSUPP;
- if (copy_from_user(&epaddr,useraddr,sizeof(epaddr)))
- return -EFAULT;
-
- data = kmalloc(epaddr.size, GFP_USER);
- if (!data)
- return -ENOMEM;
-
- ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data);
+ ret = ops->get_module_info(dev, &modinfo);
if (ret)
return ret;
- ret = -EFAULT;
- if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
- goto out;
- useraddr += sizeof(epaddr);
- if (copy_to_user(useraddr, data, epaddr.size))
- goto out;
- ret = 0;
-
- out:
- kfree(data);
- return ret;
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_module_eeprom,
+ modinfo.eeprom_len);
}
-/* The main entry point in this file. Called from net/core/dev.c */
+/* The main entry point in this file. Called from net/core/dev_ioctl.c */
-int dev_ethtool(struct ifreq *ifr)
+int dev_ethtool(struct net *net, struct ifreq *ifr)
{
- struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
void __user *useraddr = ifr->ifr_data;
u32 ethcmd;
int rc;
- unsigned long old_features;
-
- /*
- * XXX: This can be pushed down into the ethtool_* handlers that
- * need it. Keep existing behaviour for the moment.
- */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
+ netdev_features_t old_features;
if (!dev || !netif_device_present(dev))
return -ENODEV;
- if (!dev->ethtool_ops)
- goto ioctl;
-
- if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd)))
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
- if(dev->ethtool_ops->begin)
- if ((rc = dev->ethtool_ops->begin(dev)) < 0)
- return rc;
+ /* Allow some commands to be done by anyone */
+ switch (ethcmd) {
+ case ETHTOOL_GSET:
+ case ETHTOOL_GDRVINFO:
+ case ETHTOOL_GMSGLVL:
+ case ETHTOOL_GLINK:
+ case ETHTOOL_GCOALESCE:
+ case ETHTOOL_GRINGPARAM:
+ case ETHTOOL_GPAUSEPARAM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GSSET_INFO:
+ case ETHTOOL_GSTRINGS:
+ case ETHTOOL_GSTATS:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GPERMADDR:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GPFLAGS:
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
+ case ETHTOOL_GFEATURES:
+ case ETHTOOL_GCHANNELS:
+ case ETHTOOL_GET_TS_INFO:
+ case ETHTOOL_GEEE:
+ break;
+ default:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ }
+ if (dev->ethtool_ops->begin) {
+ rc = dev->ethtool_ops->begin(dev);
+ if (rc < 0)
+ return rc;
+ }
old_features = dev->features;
switch (ethcmd) {
@@ -826,10 +1703,18 @@ int dev_ethtool(struct ifreq *ifr)
rc = ethtool_set_wol(dev, useraddr);
break;
case ETHTOOL_GMSGLVL:
- rc = ethtool_get_msglevel(dev, useraddr);
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_msglevel);
break;
case ETHTOOL_SMSGLVL:
- rc = ethtool_set_msglevel(dev, useraddr);
+ rc = ethtool_set_value_void(dev, useraddr,
+ dev->ethtool_ops->set_msglevel);
+ break;
+ case ETHTOOL_GEEE:
+ rc = ethtool_get_eee(dev, useraddr);
+ break;
+ case ETHTOOL_SEEE:
+ rc = ethtool_set_eee(dev, useraddr);
break;
case ETHTOOL_NWAY_RST:
rc = ethtool_nway_reset(dev);
@@ -861,30 +1746,6 @@ int dev_ethtool(struct ifreq *ifr)
case ETHTOOL_SPAUSEPARAM:
rc = ethtool_set_pauseparam(dev, useraddr);
break;
- case ETHTOOL_GRXCSUM:
- rc = ethtool_get_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_SRXCSUM:
- rc = ethtool_set_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_GTXCSUM:
- rc = ethtool_get_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_STXCSUM:
- rc = ethtool_set_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_GSG:
- rc = ethtool_get_sg(dev, useraddr);
- break;
- case ETHTOOL_SSG:
- rc = ethtool_set_sg(dev, useraddr);
- break;
- case ETHTOOL_GTSO:
- rc = ethtool_get_tso(dev, useraddr);
- break;
- case ETHTOOL_STSO:
- rc = ethtool_set_tso(dev, useraddr);
- break;
case ETHTOOL_TEST:
rc = ethtool_self_test(dev, useraddr);
break;
@@ -900,39 +1761,111 @@ int dev_ethtool(struct ifreq *ifr)
case ETHTOOL_GPERMADDR:
rc = ethtool_get_perm_addr(dev, useraddr);
break;
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ __ethtool_get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+ break;
+ case ETHTOOL_GPFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_priv_flags);
+ break;
+ case ETHTOOL_SPFLAGS:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_priv_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_FLASHDEV:
+ rc = ethtool_flash_device(dev, useraddr);
+ break;
+ case ETHTOOL_RESET:
+ rc = ethtool_reset(dev, useraddr);
+ break;
+ case ETHTOOL_GSSET_INFO:
+ rc = ethtool_get_sset_info(dev, useraddr);
+ break;
+ case ETHTOOL_GRXFHINDIR:
+ rc = ethtool_get_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_SRXFHINDIR:
+ rc = ethtool_set_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
case ETHTOOL_GUFO:
- rc = ethtool_get_ufo(dev, useraddr);
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
case ETHTOOL_SUFO:
- rc = ethtool_set_ufo(dev, useraddr);
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SCHANNELS:
+ rc = ethtool_set_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SET_DUMP:
+ rc = ethtool_set_dump(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_FLAG:
+ rc = ethtool_get_dump_flag(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_DATA:
+ rc = ethtool_get_dump_data(dev, useraddr);
+ break;
+ case ETHTOOL_GET_TS_INFO:
+ rc = ethtool_get_ts_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEINFO:
+ rc = ethtool_get_module_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEEEPROM:
+ rc = ethtool_get_module_eeprom(dev, useraddr);
break;
default:
- rc = -EOPNOTSUPP;
+ rc = -EOPNOTSUPP;
}
-
- if(dev->ethtool_ops->complete)
+
+ if (dev->ethtool_ops->complete)
dev->ethtool_ops->complete(dev);
if (old_features != dev->features)
netdev_features_change(dev);
return rc;
-
- ioctl:
- if (dev->do_ioctl)
- return dev->do_ioctl(dev, ifr, SIOCETHTOOL);
- return -EOPNOTSUPP;
}
-
-EXPORT_SYMBOL(dev_ethtool);
-EXPORT_SYMBOL(ethtool_op_get_link);
-EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
-EXPORT_SYMBOL(ethtool_op_get_sg);
-EXPORT_SYMBOL(ethtool_op_get_tso);
-EXPORT_SYMBOL(ethtool_op_get_tx_csum);
-EXPORT_SYMBOL(ethtool_op_set_sg);
-EXPORT_SYMBOL(ethtool_op_set_tso);
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-EXPORT_SYMBOL(ethtool_op_set_ufo);
-EXPORT_SYMBOL(ethtool_op_get_ufo);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
new file mode 100644
index 00000000000..185c341fafb
--- /dev/null
+++ b/net/core/fib_rules.c
@@ -0,0 +1,805 @@
+/*
+ * net/core/fib_rules.c Generic Routing Rules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/fib_rules.h>
+
+int fib_default_rule_add(struct fib_rules_ops *ops,
+ u32 pref, u32 table, u32 flags)
+{
+ struct fib_rule *r;
+
+ r = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+
+ atomic_set(&r->refcnt, 1);
+ r->action = FR_ACT_TO_TBL;
+ r->pref = pref;
+ r->table = table;
+ r->flags = flags;
+ r->fr_net = hold_net(ops->fro_net);
+
+ r->suppress_prefixlen = -1;
+ r->suppress_ifgroup = -1;
+
+ /* The lock is not required here, the list in unreacheable
+ * at the moment this function is called */
+ list_add_tail(&r->list, &ops->rules_list);
+ return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
+u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+{
+ struct list_head *pos;
+ struct fib_rule *rule;
+
+ if (!list_empty(&ops->rules_list)) {
+ pos = ops->rules_list.next;
+ if (pos->next != &ops->rules_list) {
+ rule = list_entry(pos->next, struct fib_rule, list);
+ if (rule->pref)
+ return rule->pref - 1;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_pref);
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid);
+
+static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+{
+ struct fib_rules_ops *ops;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (ops->family == family) {
+ if (!try_module_get(ops->owner))
+ ops = NULL;
+ rcu_read_unlock();
+ return ops;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void rules_ops_put(struct fib_rules_ops *ops)
+{
+ if (ops)
+ module_put(ops->owner);
+}
+
+static void flush_route_cache(struct fib_rules_ops *ops)
+{
+ if (ops->flush_cache)
+ ops->flush_cache(ops);
+}
+
+static int __fib_rules_register(struct fib_rules_ops *ops)
+{
+ int err = -EEXIST;
+ struct fib_rules_ops *o;
+ struct net *net;
+
+ net = ops->fro_net;
+
+ if (ops->rule_size < sizeof(struct fib_rule))
+ return -EINVAL;
+
+ if (ops->match == NULL || ops->configure == NULL ||
+ ops->compare == NULL || ops->fill == NULL ||
+ ops->action == NULL)
+ return -EINVAL;
+
+ spin_lock(&net->rules_mod_lock);
+ list_for_each_entry(o, &net->rules_ops, list)
+ if (ops->family == o->family)
+ goto errout;
+
+ hold_net(net);
+ list_add_tail_rcu(&ops->list, &net->rules_ops);
+ err = 0;
+errout:
+ spin_unlock(&net->rules_mod_lock);
+
+ return err;
+}
+
+struct fib_rules_ops *
+fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
+{
+ struct fib_rules_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (ops == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ops->rules_list);
+ ops->fro_net = net;
+
+ err = __fib_rules_register(ops);
+ if (err) {
+ kfree(ops);
+ ops = ERR_PTR(err);
+ }
+
+ return ops;
+}
+EXPORT_SYMBOL_GPL(fib_rules_register);
+
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+{
+ struct fib_rule *rule, *tmp;
+
+ list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
+ list_del_rcu(&rule->list);
+ if (ops->delete)
+ ops->delete(rule);
+ fib_rule_put(rule);
+ }
+}
+
+static void fib_rules_put_rcu(struct rcu_head *head)
+{
+ struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
+ struct net *net = ops->fro_net;
+
+ release_net(net);
+ kfree(ops);
+}
+
+void fib_rules_unregister(struct fib_rules_ops *ops)
+{
+ struct net *net = ops->fro_net;
+
+ spin_lock(&net->rules_mod_lock);
+ list_del_rcu(&ops->list);
+ fib_rules_cleanup_ops(ops);
+ spin_unlock(&net->rules_mod_lock);
+
+ call_rcu(&ops->rcu, fib_rules_put_rcu);
+}
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
+
+static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
+ struct flowi *fl, int flags)
+{
+ int ret = 0;
+
+ if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ goto out;
+
+ if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ goto out;
+
+ if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
+ goto out;
+
+ ret = ops->match(rule, fl, flags);
+out:
+ return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
+}
+
+int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct fib_rule *rule;
+ int err;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+jumped:
+ if (!fib_rule_match(rule, ops, fl, flags))
+ continue;
+
+ if (rule->action == FR_ACT_GOTO) {
+ struct fib_rule *target;
+
+ target = rcu_dereference(rule->ctarget);
+ if (target == NULL) {
+ continue;
+ } else {
+ rule = target;
+ goto jumped;
+ }
+ } else if (rule->action == FR_ACT_NOP)
+ continue;
+ else
+ err = ops->action(rule, fl, flags, arg);
+
+ if (!err && ops->suppress && ops->suppress(rule, arg))
+ continue;
+
+ if (err != -EAGAIN) {
+ if ((arg->flags & FIB_LOOKUP_NOREF) ||
+ likely(atomic_inc_not_zero(&rule->refcnt))) {
+ arg->rule = rule;
+ goto out;
+ }
+ break;
+ }
+ }
+
+ err = -ESRCH;
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_rules_lookup);
+
+static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct fib_rules_ops *ops)
+{
+ int err = -EINVAL;
+
+ if (frh->src_len)
+ if (tb[FRA_SRC] == NULL ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size)
+ goto errout;
+
+ if (frh->dst_len)
+ if (tb[FRA_DST] == NULL ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size)
+ goto errout;
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL, unresolved = 0;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ goto errout;
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+ if (err < 0)
+ goto errout;
+
+ err = validate_rulemsg(frh, tb, ops);
+ if (err < 0)
+ goto errout;
+
+ rule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (rule == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ rule->fr_net = hold_net(net);
+
+ if (tb[FRA_PRIORITY])
+ rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ rule->iifindex = -1;
+ nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, rule->iifname);
+ if (dev)
+ rule->iifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ rule->oifindex = -1;
+ nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, rule->oifname);
+ if (dev)
+ rule->oifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_FWMARK]) {
+ rule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (rule->mark)
+ /* compatibility: if the mark value is non-zero all bits
+ * are compared unless a mask is explicitly specified.
+ */
+ rule->mark_mask = 0xFFFFFFFF;
+ }
+
+ if (tb[FRA_FWMASK])
+ rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+
+ rule->action = frh->action;
+ rule->flags = frh->flags;
+ rule->table = frh_get_table(frh, tb);
+ if (tb[FRA_SUPPRESS_PREFIXLEN])
+ rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ else
+ rule->suppress_prefixlen = -1;
+
+ if (tb[FRA_SUPPRESS_IFGROUP])
+ rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ else
+ rule->suppress_ifgroup = -1;
+
+ if (!tb[FRA_PRIORITY] && ops->default_pref)
+ rule->pref = ops->default_pref(ops);
+
+ err = -EINVAL;
+ if (tb[FRA_GOTO]) {
+ if (rule->action != FR_ACT_GOTO)
+ goto errout_free;
+
+ rule->target = nla_get_u32(tb[FRA_GOTO]);
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (rule->target <= rule->pref)
+ goto errout_free;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+ } else if (rule->action == FR_ACT_GOTO)
+ goto errout_free;
+
+ err = ops->configure(rule, skb, frh, tb);
+ if (err < 0)
+ goto errout_free;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref > rule->pref)
+ break;
+ last = r;
+ }
+
+ fib_rule_get(rule);
+
+ if (last)
+ list_add_rcu(&rule->list, &last->list);
+ else
+ list_add_rcu(&rule->list, &ops->rules_list);
+
+ if (ops->unresolved_rules) {
+ /*
+ * There are unresolved goto rules in the list, check if
+ * any of them are pointing to this new rule.
+ */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action == FR_ACT_GOTO &&
+ r->target == rule->pref &&
+ rtnl_dereference(r->ctarget) == NULL) {
+ rcu_assign_pointer(r->ctarget, rule);
+ if (--ops->unresolved_rules == 0)
+ break;
+ }
+ }
+ }
+
+ if (rule->action == FR_ACT_GOTO)
+ ops->nr_goto_rules++;
+
+ if (unresolved)
+ ops->unresolved_rules++;
+
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+
+errout_free:
+ release_net(rule->fr_net);
+ kfree(rule);
+errout:
+ rules_ops_put(ops);
+ return err;
+}
+
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule, *tmp;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ goto errout;
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+ if (err < 0)
+ goto errout;
+
+ err = validate_rulemsg(frh, tb, ops);
+ if (err < 0)
+ goto errout;
+
+ list_for_each_entry(rule, &ops->rules_list, list) {
+ if (frh->action && (frh->action != rule->action))
+ continue;
+
+ if (frh_get_table(frh, tb) &&
+ (frh_get_table(frh, tb) != rule->table))
+ continue;
+
+ if (tb[FRA_PRIORITY] &&
+ (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
+ continue;
+
+ if (tb[FRA_IIFNAME] &&
+ nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
+ continue;
+
+ if (tb[FRA_OIFNAME] &&
+ nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
+ continue;
+
+ if (tb[FRA_FWMARK] &&
+ (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
+ continue;
+
+ if (tb[FRA_FWMASK] &&
+ (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
+ continue;
+
+ if (!ops->compare(rule, frh, tb))
+ continue;
+
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
+ }
+
+ list_del_rcu(&rule->list);
+
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
+
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules have
+ * actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ list_for_each_entry(tmp, &ops->rules_list, list) {
+ if (rtnl_dereference(tmp->ctarget) == rule) {
+ RCU_INIT_POINTER(tmp->ctarget, NULL);
+ ops->unresolved_rules++;
+ }
+ }
+ }
+
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ if (ops->delete)
+ ops->delete(rule);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+ }
+
+ err = -ENOENT;
+errout:
+ rules_ops_put(ops);
+ return err;
+}
+
+static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ struct fib_rule *rule)
+{
+ size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
+ + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
+ + nla_total_size(4) /* FRA_PRIORITY */
+ + nla_total_size(4) /* FRA_TABLE */
+ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ + nla_total_size(4) /* FRA_FWMARK */
+ + nla_total_size(4); /* FRA_FWMASK */
+
+ if (ops->nlmsg_payload)
+ payload += ops->nlmsg_payload(rule);
+
+ return payload;
+}
+
+static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
+ u32 pid, u32 seq, int type, int flags,
+ struct fib_rules_ops *ops)
+{
+ struct nlmsghdr *nlh;
+ struct fib_rule_hdr *frh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ frh = nlmsg_data(nlh);
+ frh->family = ops->family;
+ frh->table = rule->table;
+ if (nla_put_u32(skb, FRA_TABLE, rule->table))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
+ goto nla_put_failure;
+ frh->res1 = 0;
+ frh->res2 = 0;
+ frh->action = rule->action;
+ frh->flags = rule->flags;
+
+ if (rule->action == FR_ACT_GOTO &&
+ rcu_access_pointer(rule->ctarget) == NULL)
+ frh->flags |= FIB_RULE_UNRESOLVED;
+
+ if (rule->iifname[0]) {
+ if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
+ goto nla_put_failure;
+ if (rule->iifindex == -1)
+ frh->flags |= FIB_RULE_IIF_DETACHED;
+ }
+
+ if (rule->oifname[0]) {
+ if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
+ goto nla_put_failure;
+ if (rule->oifindex == -1)
+ frh->flags |= FIB_RULE_OIF_DETACHED;
+ }
+
+ if ((rule->pref &&
+ nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
+ (rule->mark &&
+ nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
+ ((rule->mark_mask || rule->mark) &&
+ nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
+ (rule->target &&
+ nla_put_u32(skb, FRA_GOTO, rule->target)))
+ goto nla_put_failure;
+
+ if (rule->suppress_ifgroup != -1) {
+ if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+ goto nla_put_failure;
+ }
+
+ if (ops->fill(rule, skb, frh) < 0)
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_rules_ops *ops)
+{
+ int idx = 0;
+ struct fib_rule *rule;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ if (idx < cb->args[1])
+ goto skip;
+
+ if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWRULE,
+ NLM_F_MULTI, ops) < 0)
+ break;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ rules_ops_put(ops);
+
+ return skb->len;
+}
+
+static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rules_ops *ops;
+ int idx = 0, family;
+
+ family = rtnl_msg_family(cb->nlh);
+ if (family != AF_UNSPEC) {
+ /* Protocol specific dump request */
+ ops = lookup_rules_ops(net, family);
+ if (ops == NULL)
+ return -EAFNOSUPPORT;
+
+ return dump_rules(skb, cb, ops);
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (idx < cb->args[0] || !try_module_get(ops->owner))
+ goto skip;
+
+ if (dump_rules(skb, cb, ops) < 0)
+ break;
+
+ cb->args[1] = 0;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid)
+{
+ struct net *net;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ net = ops->fro_net;
+ skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, ops->nlgroup, err);
+}
+
+static void attach_rules(struct list_head *rules, struct net_device *dev)
+{
+ struct fib_rule *rule;
+
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == -1 &&
+ strcmp(dev->name, rule->iifname) == 0)
+ rule->iifindex = dev->ifindex;
+ if (rule->oifindex == -1 &&
+ strcmp(dev->name, rule->oifname) == 0)
+ rule->oifindex = dev->ifindex;
+ }
+}
+
+static void detach_rules(struct list_head *rules, struct net_device *dev)
+{
+ struct fib_rule *rule;
+
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == dev->ifindex)
+ rule->iifindex = -1;
+ if (rule->oifindex == dev->ifindex)
+ rule->oifindex = -1;
+ }
+}
+
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct fib_rules_ops *ops;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ list_for_each_entry(ops, &net->rules_ops, list)
+ attach_rules(&ops->rules_list, dev);
+ break;
+
+ case NETDEV_CHANGENAME:
+ list_for_each_entry(ops, &net->rules_ops, list) {
+ detach_rules(&ops->rules_list, dev);
+ attach_rules(&ops->rules_list, dev);
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ list_for_each_entry(ops, &net->rules_ops, list)
+ detach_rules(&ops->rules_list, dev);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_rules_notifier = {
+ .notifier_call = fib_rules_event,
+};
+
+static int __net_init fib_rules_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->rules_ops);
+ spin_lock_init(&net->rules_mod_lock);
+ return 0;
+}
+
+static struct pernet_operations fib_rules_net_ops = {
+ .init = fib_rules_net_init,
+};
+
+static int __init fib_rules_init(void)
+{
+ int err;
+ rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
+
+ err = register_pernet_subsys(&fib_rules_net_ops);
+ if (err < 0)
+ goto fail;
+
+ err = register_netdevice_notifier(&fib_rules_notifier);
+ if (err < 0)
+ goto fail_unregister;
+
+ return 0;
+
+fail_unregister:
+ unregister_pernet_subsys(&fib_rules_net_ops);
+fail:
+ rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
+ rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
+ rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+ return err;
+}
+
+subsys_initcall(fib_rules_init);
diff --git a/net/core/filter.c b/net/core/filter.c
index 9540946a48f..1dbf6462f76 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,11 +1,16 @@
/*
* Linux Socket Filter - Kernel level socket filtering
*
- * Author:
- * Jay Schulist <jschlst@samba.org>
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
*
- * Based on the design of:
- * - The Berkeley Packet Filter
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -18,7 +23,6 @@
#include <linux/module.h>
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
@@ -26,254 +30,1187 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
+#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
+#include <asm/unaligned.h>
#include <linux/filter.h>
+#include <linux/ratelimit.h>
+#include <linux/seccomp.h>
+#include <linux/if_vlan.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
-/* No hurry in this branch */
-static void *__load_pointer(struct sk_buff *skb, int k)
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
u8 *ptr = NULL;
if (k >= SKF_NET_OFF)
- ptr = skb->nh.raw + k - SKF_NET_OFF;
+ ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
- ptr = skb->mac.raw + k - SKF_LL_OFF;
-
- if (ptr >= skb->head && ptr < skb->tail)
+ ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
+
return NULL;
}
-static inline void *load_pointer(struct sk_buff *skb, int k,
- unsigned int size, void *buffer)
+static inline void *load_pointer(const struct sk_buff *skb, int k,
+ unsigned int size, void *buffer)
{
if (k >= 0)
return skb_header_pointer(skb, k, size, buffer);
- else {
- if (k >= SKF_AD_OFF)
- return NULL;
- return __load_pointer(skb, k);
- }
+
+ return bpf_internal_load_pointer_neg_helper(skb, k, size);
}
/**
- * sk_run_filter - run a filter on a socket
- * @skb: buffer to run the filter on
- * @filter: filter to apply
- * @flen: length of filter
+ * sk_filter - run a packet through a socket filter
+ * @sk: sock associated with &sk_buff
+ * @skb: buffer to filter
+ *
+ * Run the filter code and then cut skb->data to correct size returned by
+ * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to sk_run_filter. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
*
- * Decode and apply filter instructions to the skb->data.
- * Return length to keep, 0 for none. skb is the data we are
- * filtering, filter is the array of filter instructions, and
- * len is the number of filter blocks in the array.
*/
-unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
+int sk_filter(struct sock *sk, struct sk_buff *skb)
{
- struct sock_filter *fentry; /* We walk down these */
- void *ptr;
- u32 A = 0; /* Accumulator */
- u32 X = 0; /* Index Register */
- u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
- u32 tmp;
- int k;
- int pc;
+ int err;
+ struct sk_filter *filter;
/*
- * Process array of filter instructions.
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
*/
- for (pc = 0; pc < flen; pc++) {
- fentry = &filter[pc];
-
- switch (fentry->code) {
- case BPF_ALU|BPF_ADD|BPF_X:
- A += X;
- continue;
- case BPF_ALU|BPF_ADD|BPF_K:
- A += fentry->k;
- continue;
- case BPF_ALU|BPF_SUB|BPF_X:
- A -= X;
- continue;
- case BPF_ALU|BPF_SUB|BPF_K:
- A -= fentry->k;
- continue;
- case BPF_ALU|BPF_MUL|BPF_X:
- A *= X;
- continue;
- case BPF_ALU|BPF_MUL|BPF_K:
- A *= fentry->k;
- continue;
- case BPF_ALU|BPF_DIV|BPF_X:
- if (X == 0)
- return 0;
- A /= X;
- continue;
- case BPF_ALU|BPF_DIV|BPF_K:
- A /= fentry->k;
- continue;
- case BPF_ALU|BPF_AND|BPF_X:
- A &= X;
- continue;
- case BPF_ALU|BPF_AND|BPF_K:
- A &= fentry->k;
- continue;
- case BPF_ALU|BPF_OR|BPF_X:
- A |= X;
- continue;
- case BPF_ALU|BPF_OR|BPF_K:
- A |= fentry->k;
- continue;
- case BPF_ALU|BPF_LSH|BPF_X:
- A <<= X;
- continue;
- case BPF_ALU|BPF_LSH|BPF_K:
- A <<= fentry->k;
- continue;
- case BPF_ALU|BPF_RSH|BPF_X:
- A >>= X;
- continue;
- case BPF_ALU|BPF_RSH|BPF_K:
- A >>= fentry->k;
- continue;
- case BPF_ALU|BPF_NEG:
- A = -A;
- continue;
- case BPF_JMP|BPF_JA:
- pc += fentry->k;
- continue;
- case BPF_JMP|BPF_JGT|BPF_K:
- pc += (A > fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JGE|BPF_K:
- pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JEQ|BPF_K:
- pc += (A == fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JSET|BPF_K:
- pc += (A & fentry->k) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JGT|BPF_X:
- pc += (A > X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JGE|BPF_X:
- pc += (A >= X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JEQ|BPF_X:
- pc += (A == X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_JMP|BPF_JSET|BPF_X:
- pc += (A & X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_LD|BPF_W|BPF_ABS:
- k = fentry->k;
-load_w:
- ptr = load_pointer(skb, k, 4, &tmp);
- if (ptr != NULL) {
- A = ntohl(*(u32 *)ptr);
- continue;
- }
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ return -ENOMEM;
+
+ err = security_sock_rcv_skb(sk, skb);
+ if (err)
+ return err;
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
+
+ err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(sk_filter);
+
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+/**
+ * __sk_run_filter - run a filter on a given context
+ * @ctx: buffer to run the filter on
+ * @insn: filter to apply
+ *
+ * Decode and apply filter instructions to the skb->data. Return length to
+ * keep, 0 for none. @ctx is the data we are operating on, @insn is the
+ * array of filter instructions.
+ */
+static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
+{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ };
+ void *ptr;
+ int off;
+
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
+
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = DST;
+ DST = do_div(tmp, SRC);
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ tmp = DST;
+ DST = do_div(tmp, IMM);
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ do_div(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ do_div(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
break;
- case BPF_LD|BPF_H|BPF_ABS:
- k = fentry->k;
-load_h:
- ptr = load_pointer(skb, k, 2, &tmp);
- if (ptr != NULL) {
- A = ntohs(*(u16 *)ptr);
- continue;
- }
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
break;
- case BPF_LD|BPF_B|BPF_ABS:
- k = fentry->k;
-load_b:
- ptr = load_pointer(skb, k, 1, &tmp);
- if (ptr != NULL) {
- A = *(u8 *)ptr;
- continue;
- }
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX*/
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
+ * only appearing in the programs where ctx ==
+ * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
+ * == BPF_R6, sk_convert_filter() saves it in BPF_R6,
+ * internal BPF verifier will check that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+/* Helper to find the offset of pkt_type in sk_buff structure. We want
+ * to make sure its still a 3bit field starting at a byte boundary;
+ * taken from arch/x86/net/bpf_jit_comp.c.
+ */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_TYPE_MAX (7 << 5)
+#else
+#define PKT_TYPE_MAX 7
+#endif
+static unsigned int pkt_type_offset(void)
+{
+ struct sk_buff skb_probe = { .pkt_type = ~0, };
+ u8 *ct = (u8 *) &skb_probe;
+ unsigned int off;
+
+ for (off = 0; off < sizeof(struct sk_buff); off++) {
+ if (ct[off] == PKT_TYPE_MAX)
+ return off;
+ }
+
+ pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
+ return -1;
+}
+
+static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+}
+
+static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (nla->nla_len > skb->len - a)
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+/* note that this only generates 32-bit random numbers */
+static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct sock_filter_int **insnp)
+{
+ struct sock_filter_int *insn = *insnp;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
+ pkt_type_offset());
+ if (insn->off < 0)
+ return false;
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);
+#endif
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
+
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_tci));
+ if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* A >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
+ /* A &= 1 */
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ break;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(__get_random_u32);
+ break;
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ *insnp = insn;
+ return true;
+}
+
+/**
+ * sk_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: buffer where converted program will be stored
+ * @new_len: pointer to store length of converted program
+ *
+ * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
+ * Conversion workflow:
+ *
+ * 1) First pass for calculating the new program length:
+ * sk_convert_filter(old_prog, old_len, NULL, &new_len)
+ *
+ * 2) 2nd pass to remap in two passes: 1st pass finds new
+ * jump offsets, 2nd pass remapping:
+ * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
+ * sk_convert_filter(old_prog, old_len, new_prog, &new_len);
+ *
+ * User BPF's register A is mapped to our BPF register 6, user BPF
+ * register X is mapped to BPF register 7; frame pointer is always
+ * register 10; Context 'void *ctx' is stored in register 1, that is,
+ * for socket filters: ctx == 'struct sk_buff *', for seccomp:
+ * ctx == 'struct seccomp_data *'.
+ */
+int sk_convert_filter(struct sock_filter *prog, int len,
+ struct sock_filter_int *new_prog, int *new_len)
+{
+ int new_flen = 0, pass = 0, target, i;
+ struct sock_filter_int *new_insn;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
+
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = new_prog;
+ fp = prog;
+
+ if (new_insn)
+ *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ new_insn++;
+
+ for (i = 0; i < len; fp++, i++) {
+ struct sock_filter_int tmp_insns[6] = { };
+ struct sock_filter_int *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - new_prog;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
+ break;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ if (target >= len || target < 0) \
+ goto err; \
+ insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ insn->off -= insn - tmp_insns; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
break;
- case BPF_LD|BPF_W|BPF_LEN:
- A = skb->len;
- continue;
- case BPF_LDX|BPF_W|BPF_LEN:
- X = skb->len;
- continue;
- case BPF_LD|BPF_W|BPF_IND:
- k = X + fentry->k;
- goto load_w;
- case BPF_LD|BPF_H|BPF_IND:
- k = X + fentry->k;
- goto load_h;
- case BPF_LD|BPF_B|BPF_IND:
- k = X + fentry->k;
- goto load_b;
- case BPF_LDX|BPF_B|BPF_MSH:
- ptr = load_pointer(skb, fentry->k, 1, &tmp);
- if (ptr != NULL) {
- X = (*(u8 *)ptr & 0xf) << 2;
- continue;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+ /* BPF immediates are signed, zero extend
+ * immediate into tmp register and use it
+ * in compare insn.
+ */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_X;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
}
- return 0;
- case BPF_LD|BPF_IMM:
- A = fentry->k;
- continue;
- case BPF_LDX|BPF_IMM:
- X = fentry->k;
- continue;
- case BPF_LD|BPF_MEM:
- A = mem[fentry->k];
- continue;
- case BPF_LDX|BPF_MEM:
- X = mem[fentry->k];
- continue;
- case BPF_MISC|BPF_TAX:
- X = A;
- continue;
- case BPF_MISC|BPF_TXA:
- A = X;
- continue;
- case BPF_RET|BPF_K:
- return fentry->k;
- case BPF_RET|BPF_A:
- return A;
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Convert JEQ into JNE when 'jump_true' is next insn. */
+ if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+ /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B:
+ /* tmp = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+
+ /* RET_K, RET_A are remaped into 2 insns. */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
+ BPF_K : BPF_X, BPF_REG_0,
+ BPF_REG_A, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
case BPF_ST:
- mem[fentry->k] = A;
- continue;
case BPF_STX:
- mem[fentry->k] = X;
- continue;
- default:
- WARN_ON(1);
- return 0;
- }
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
- /*
- * Handle ancillary data, which are impossible
- * (or very difficult) to get parsing packet contents.
- */
- switch (k-SKF_AD_OFF) {
- case SKF_AD_PROTOCOL:
- A = htons(skb->protocol);
- continue;
- case SKF_AD_PKTTYPE:
- A = skb->pkt_type;
- continue;
- case SKF_AD_IFINDEX:
- A = skb->dev->ifindex;
- continue;
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+ /* Unkown instruction. */
default:
- return 0;
+ goto err;
}
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
}
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - new_prog;
+ return 0;
+ }
+
+ pass++;
+ if (new_flen != new_insn - new_prog) {
+ new_flen = new_insn - new_prog;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
+ }
+
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
+}
+
+/* Security:
+ *
+ * A BPF program is able to use 16 cells of memory to store intermediate
+ * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
+ *
+ * As we dont want to clear mem[] array for each packet going through
+ * sk_run_filter(), we check that filter loaded by user never try to read
+ * a cell if not previously written, and we check all branches to be sure
+ * a malicious user doesn't try to abuse us.
+ */
+static int check_load_and_stores(struct sock_filter *filter, int flen)
+{
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
+ int pc, ret = 0;
+
+ BUILD_BUG_ON(BPF_MEMWORDS > 16);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ return -ENOMEM;
+
+ memset(masks, 0xff, flen * sizeof(*masks));
+
+ for (pc = 0; pc < flen; pc++) {
+ memvalid &= masks[pc];
+
+ switch (filter[pc].code) {
+ case BPF_ST:
+ case BPF_STX:
+ memvalid |= (1 << filter[pc].k);
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ if (!(memvalid & (1 << filter[pc].k))) {
+ ret = -EINVAL;
+ goto error;
+ }
+ break;
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
+ masks[pc + 1 + filter[pc].k] &= memvalid;
+ memvalid = ~0;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
+ masks[pc + 1 + filter[pc].jt] &= memvalid;
+ masks[pc + 1 + filter[pc].jf] &= memvalid;
+ memvalid = ~0;
+ break;
+ }
+ }
+error:
+ kfree(masks);
+ return ret;
+}
+
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
}
/**
@@ -290,101 +1227,352 @@ load_b:
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int sk_chk_filter(struct sock_filter *filter, int flen)
+int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
{
- struct sock_filter *ftest;
+ bool anc_found;
int pc;
if (flen == 0 || flen > BPF_MAXINSNS)
return -EINVAL;
- /* check the filter code now */
+ /* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
- ftest = &filter[pc];
+ struct sock_filter *ftest = &filter[pc];
- /* Only allow valid instructions */
- switch (ftest->code) {
- case BPF_ALU|BPF_ADD|BPF_K:
- case BPF_ALU|BPF_ADD|BPF_X:
- case BPF_ALU|BPF_SUB|BPF_K:
- case BPF_ALU|BPF_SUB|BPF_X:
- case BPF_ALU|BPF_MUL|BPF_K:
- case BPF_ALU|BPF_MUL|BPF_X:
- case BPF_ALU|BPF_DIV|BPF_X:
- case BPF_ALU|BPF_AND|BPF_K:
- case BPF_ALU|BPF_AND|BPF_X:
- case BPF_ALU|BPF_OR|BPF_K:
- case BPF_ALU|BPF_OR|BPF_X:
- case BPF_ALU|BPF_LSH|BPF_K:
- case BPF_ALU|BPF_LSH|BPF_X:
- case BPF_ALU|BPF_RSH|BPF_K:
- case BPF_ALU|BPF_RSH|BPF_X:
- case BPF_ALU|BPF_NEG:
- case BPF_LD|BPF_W|BPF_ABS:
- case BPF_LD|BPF_H|BPF_ABS:
- case BPF_LD|BPF_B|BPF_ABS:
- case BPF_LD|BPF_W|BPF_LEN:
- case BPF_LD|BPF_W|BPF_IND:
- case BPF_LD|BPF_H|BPF_IND:
- case BPF_LD|BPF_B|BPF_IND:
- case BPF_LD|BPF_IMM:
- case BPF_LDX|BPF_W|BPF_LEN:
- case BPF_LDX|BPF_B|BPF_MSH:
- case BPF_LDX|BPF_IMM:
- case BPF_MISC|BPF_TAX:
- case BPF_MISC|BPF_TXA:
- case BPF_RET|BPF_K:
- case BPF_RET|BPF_A:
- break;
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
+ return -EINVAL;
/* Some instructions need special checks */
-
- case BPF_ALU|BPF_DIV|BPF_K:
- /* check for division by zero */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
if (ftest->k == 0)
return -EINVAL;
break;
-
- case BPF_LD|BPF_MEM:
- case BPF_LDX|BPF_MEM:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
case BPF_ST:
case BPF_STX:
- /* check for invalid memory addresses */
+ /* Check for invalid memory addresses */
if (ftest->k >= BPF_MEMWORDS)
return -EINVAL;
break;
-
- case BPF_JMP|BPF_JA:
- /*
- * Note, the large ftest->k might cause loops.
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
* Compare this with conditional jumps below,
* where offsets are limited. --ANK (981016)
*/
- if (ftest->k >= (unsigned)(flen-pc-1))
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
return -EINVAL;
break;
-
- case BPF_JMP|BPF_JEQ|BPF_K:
- case BPF_JMP|BPF_JEQ|BPF_X:
- case BPF_JMP|BPF_JGE|BPF_K:
- case BPF_JMP|BPF_JGE|BPF_X:
- case BPF_JMP|BPF_JGT|BPF_K:
- case BPF_JMP|BPF_JGT|BPF_X:
- case BPF_JMP|BPF_JSET|BPF_K:
- case BPF_JMP|BPF_JSET|BPF_X:
- /* for conditionals both must be safe */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
if (pc + ftest->jt + 1 >= flen ||
pc + ftest->jf + 1 >= flen)
return -EINVAL;
break;
-
- default:
- return -EINVAL;
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
+ anc_found = false;
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
+ if (anc_found == false && ftest->k >= SKF_AD_OFF)
+ return -EINVAL;
}
}
- return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
+ /* Last instruction must be a RET code */
+ switch (filter[flen - 1].code) {
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ return check_load_and_stores(filter, flen);
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(sk_chk_filter);
+
+static int sk_store_orig_filter(struct sk_filter *fp,
+ const struct sock_fprog *fprog)
+{
+ unsigned int fsize = sk_filter_proglen(fprog);
+ struct sock_fprog_kern *fkprog;
+
+ fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+ if (!fp->orig_prog)
+ return -ENOMEM;
+
+ fkprog = fp->orig_prog;
+ fkprog->len = fprog->len;
+ fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+ if (!fkprog->filter) {
+ kfree(fp->orig_prog);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void sk_release_orig_filter(struct sk_filter *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
+/**
+ * sk_filter_release_rcu - Release a socket filter by rcu_head
+ * @rcu: rcu_head that contains the sk_filter to free
+ */
+static void sk_filter_release_rcu(struct rcu_head *rcu)
+{
+ struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+
+ sk_release_orig_filter(fp);
+ sk_filter_free(fp);
+}
+
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+ if (atomic_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ sk_filter_release(fp);
+}
+
+void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ atomic_inc(&fp->refcnt);
+ atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+}
+
+static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
+ struct sock *sk,
+ unsigned int len)
+{
+ struct sk_filter *fp_new;
+
+ if (sk == NULL)
+ return krealloc(fp, len, GFP_KERNEL);
+
+ fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
+ if (fp_new) {
+ *fp_new = *fp;
+ /* As we're keeping orig_prog in fp_new along,
+ * we need to make sure we're not evicting it
+ * from the old fp.
+ */
+ fp->orig_prog = NULL;
+ sk_filter_uncharge(sk, fp);
+ }
+
+ return fp_new;
+}
+
+static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
+ struct sock *sk)
+{
+ struct sock_filter *old_prog;
+ struct sk_filter *old_fp;
+ int err, new_len, old_len = fp->len;
+
+ /* We are free to overwrite insns et al right here as it
+ * won't be used at this point in time anymore internally
+ * after the migration to the internal BPF instruction
+ * representation.
+ */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct sock_filter_int));
+
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
+ GFP_KERNEL);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* 1st pass: calculate the new program length. */
+ err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
+ if (err)
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
+ err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ if (err)
+ /* 2nd sk_convert_filter() can fail only if it fails
+ * to allocate memory, remapping must succeed. Note,
+ * that at this time old_fp has already been released
+ * by __sk_migrate_realloc().
+ */
+ goto out_err_free;
+
+ sk_filter_select_runtime(fp);
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ /* Rollback filter setup. */
+ if (sk != NULL)
+ sk_filter_uncharge(sk, fp);
+ else
+ kfree(fp);
+ return ERR_PTR(err);
+}
+
+void __weak bpf_int_jit_compile(struct sk_filter *prog)
+{
+}
+
+/**
+ * sk_filter_select_runtime - select execution runtime for BPF program
+ * @fp: sk_filter populated with internal BPF program
+ *
+ * try to JIT internal BPF program, if JIT is not available select interpreter
+ * BPF program will be executed via SK_RUN_FILTER() macro
+ */
+void sk_filter_select_runtime(struct sk_filter *fp)
+{
+ fp->bpf_func = (void *) __sk_run_filter;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+}
+EXPORT_SYMBOL_GPL(sk_filter_select_runtime);
+
+/* free internal BPF program */
+void sk_filter_free(struct sk_filter *fp)
+{
+ bpf_jit_free(fp);
+}
+EXPORT_SYMBOL_GPL(sk_filter_free);
+
+static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
+ struct sock *sk)
+{
+ int err;
+
+ fp->bpf_func = NULL;
+ fp->jited = 0;
+
+ err = sk_chk_filter(fp->insns, fp->len);
+ if (err) {
+ if (sk != NULL)
+ sk_filter_uncharge(sk, fp);
+ else
+ kfree(fp);
+ return ERR_PTR(err);
+ }
+
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
+ bpf_jit_compile(fp);
+
+ /* JIT compiler couldn't process this filter, so do the
+ * internal BPF translation for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = __sk_migrate_filter(fp, sk);
+
+ return fp;
+}
+
+/**
+ * sk_unattached_filter_create - create an unattached filter
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ *
+ * Create a filter independent of any socket. We first run some
+ * sanity checks on it to make sure it does not explode on us later.
+ * If an error occurs or there is insufficient memory for the filter
+ * a negative errno code is returned. On success the return is zero.
+ */
+int sk_unattached_filter_create(struct sk_filter **pfp,
+ struct sock_fprog_kern *fprog)
+{
+ unsigned int fsize = sk_filter_proglen(fprog);
+ struct sk_filter *fp;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ memcpy(fp->insns, fprog->filter, fsize);
+
+ atomic_set(&fp->refcnt, 1);
+ fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
+
+ /* __sk_prepare_filter() already takes care of uncharging
+ * memory in case something goes wrong.
+ */
+ fp = __sk_prepare_filter(fp, NULL);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
+
+void sk_unattached_filter_destroy(struct sk_filter *fp)
+{
+ sk_filter_release(fp);
}
+EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
/**
* sk_attach_filter - attach a socket filter
@@ -398,40 +1586,110 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
*/
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
- struct sk_filter *fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+ struct sk_filter *fp, *old_fp;
+ unsigned int fsize = sk_filter_proglen(fprog);
+ unsigned int sk_fsize = sk_filter_size(fprog->len);
int err;
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
return -EINVAL;
- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+ fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
if (!fp)
return -ENOMEM;
+
if (copy_from_user(fp->insns, fprog->filter, fsize)) {
- sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+ sock_kfree_s(sk, fp, sk_fsize);
return -EFAULT;
}
atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
- err = sk_chk_filter(fp->insns, fp->len);
- if (!err) {
- struct sk_filter *old_fp;
+ err = sk_store_orig_filter(fp, fprog);
+ if (err) {
+ sk_filter_uncharge(sk, fp);
+ return -ENOMEM;
+ }
- spin_lock_bh(&sk->sk_lock.slock);
- old_fp = sk->sk_filter;
- sk->sk_filter = fp;
- spin_unlock_bh(&sk->sk_lock.slock);
- fp = old_fp;
+ /* __sk_prepare_filter() already takes care of uncharging
+ * memory in case something goes wrong.
+ */
+ fp = __sk_prepare_filter(fp, sk);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
+
+int sk_detach_filter(struct sock *sk)
+{
+ int ret = -ENOENT;
+ struct sk_filter *filter;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ filter = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ if (filter) {
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ sk_filter_uncharge(sk, filter);
+ ret = 0;
}
- if (fp)
- sk_filter_release(sk, fp);
- return err;
+ return ret;
}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
-EXPORT_SYMBOL(sk_chk_filter);
-EXPORT_SYMBOL(sk_run_filter);
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ int ret = 0;
+
+ lock_sock(sk);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ if (!filter)
+ goto out;
+
+ /* We're copying the filter that has been originally attached,
+ * so no conversion/decode needed anymore.
+ */
+ fprog = filter->orig_prog;
+
+ ret = fprog->len;
+ if (!len)
+ /* User space only enquires number of filter blocks. */
+ goto out;
+
+ ret = -EINVAL;
+ if (len < fprog->len)
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
+ goto out;
+
+ /* Instead of bytes, the API requests to return the number
+ * of filter blocks.
+ */
+ ret = fprog->len;
+out:
+ release_sock(sk);
+ return ret;
+}
diff --git a/net/core/flow.c b/net/core/flow.c
index c4f25385029..a0348fde1fd 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -20,141 +20,165 @@
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
+#include <linux/mutex.h>
#include <net/flow.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
+#include <linux/atomic.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
struct flow_cache_entry {
- struct flow_cache_entry *next;
- u16 family;
- u8 dir;
- struct flowi key;
- u32 genid;
- u32 sk_sid;
- void *object;
- atomic_t *object_ref;
+ union {
+ struct hlist_node hlist;
+ struct list_head gc_list;
+ } u;
+ struct net *net;
+ u16 family;
+ u8 dir;
+ u32 genid;
+ struct flowi key;
+ struct flow_cache_object *object;
};
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size (1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static kmem_cache_t *flow_cachep __read_mostly;
-
-static int flow_lwm, flow_hwm;
-
-struct flow_percpu_info {
- int hash_rnd_recalc;
- u32 hash_rnd;
- int count;
-} ____cacheline_aligned;
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
-
-#define flow_hash_rnd_recalc(cpu) \
- (per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
- (per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
- (per_cpu(flow_hash_info, cpu).count)
-
-static struct timer_list flow_hash_rnd_timer;
-
-#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
-
struct flow_flush_info {
- atomic_t cpuleft;
- struct completion completion;
+ struct flow_cache *cache;
+ atomic_t cpuleft;
+ struct completion completion;
};
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+static struct kmem_cache *flow_cachep __read_mostly;
+
+#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
static void flow_cache_new_hashrnd(unsigned long arg)
{
+ struct flow_cache *fc = (void *) arg;
int i;
- for_each_cpu(i)
- flow_hash_rnd_recalc(i) = 1;
+ for_each_possible_cpu(i)
+ per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
- flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
- add_timer(&flow_hash_rnd_timer);
+ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+ add_timer(&fc->rnd_timer);
}
-static void __flow_cache_shrink(int cpu, int shrink_to)
+static int flow_entry_valid(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
- struct flow_cache_entry *fle, **flp;
- int i;
+ if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
+ return 0;
+ if (fle->object && !fle->object->ops->check(fle->object))
+ return 0;
+ return 1;
+}
- for (i = 0; i < flow_hash_size; i++) {
- int k = 0;
+static void flow_entry_kill(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
+{
+ if (fle->object)
+ fle->object->ops->delete(fle->object);
+ kmem_cache_free(flow_cachep, fle);
+}
- flp = &flow_table(cpu)[i];
- while ((fle = *flp) != NULL && k < shrink_to) {
- k++;
- flp = &fle->next;
- }
- while ((fle = *flp) != NULL) {
- *flp = fle->next;
- if (fle->object)
- atomic_dec(fle->object_ref);
- kmem_cache_free(flow_cachep, fle);
- flow_count(cpu)--;
- }
+static void flow_cache_gc_task(struct work_struct *work)
+{
+ struct list_head gc_list;
+ struct flow_cache_entry *fce, *n;
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
+
+ INIT_LIST_HEAD(&gc_list);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+
+ list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+ flow_entry_kill(fce, xfrm);
+}
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+ int deleted, struct list_head *gc_list,
+ struct netns_xfrm *xfrm)
+{
+ if (deleted) {
+ fcp->hash_count -= deleted;
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+ schedule_work(&xfrm->flow_cache_gc_work);
}
}
-static void flow_cache_shrink(int cpu)
+static void __flow_cache_shrink(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp,
+ int shrink_to)
{
- int shrink_to = flow_lwm / flow_hash_size;
+ struct flow_cache_entry *fle;
+ struct hlist_node *tmp;
+ LIST_HEAD(gc_list);
+ int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
+
+ for (i = 0; i < flow_cache_hash_size(fc); i++) {
+ int saved = 0;
+
+ hlist_for_each_entry_safe(fle, tmp,
+ &fcp->hash_table[i], u.hlist) {
+ if (saved < shrink_to &&
+ flow_entry_valid(fle, xfrm)) {
+ saved++;
+ } else {
+ deleted++;
+ hlist_del(&fle->u.hlist);
+ list_add_tail(&fle->u.gc_list, &gc_list);
+ }
+ }
+ }
- __flow_cache_shrink(cpu, shrink_to);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}
-static void flow_new_hash_rnd(int cpu)
+static void flow_cache_shrink(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp)
{
- get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
- flow_hash_rnd_recalc(cpu) = 0;
+ int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
- __flow_cache_shrink(cpu, 0);
+ __flow_cache_shrink(fc, fcp, shrink_to);
}
-static u32 flow_hash_code(struct flowi *key, int cpu)
+static void flow_new_hash_rnd(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp)
{
- u32 *k = (u32 *) key;
-
- return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
- (flow_hash_size - 1));
+ get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+ fcp->hash_rnd_recalc = 0;
+ __flow_cache_shrink(fc, fcp, 0);
}
-#if (BITS_PER_LONG == 64)
-typedef u64 flow_compare_t;
-#else
-typedef u32 flow_compare_t;
-#endif
+static u32 flow_hash_code(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp,
+ const struct flowi *key,
+ size_t keysize)
+{
+ const u32 *k = (const u32 *) key;
+ const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
-extern void flowi_is_missized(void);
+ return jhash2(k, length, fcp->hash_rnd)
+ & (flow_cache_hash_size(fc) - 1);
+}
/* I hear what you're saying, use memcmp. But memcmp cannot make
- * important assumptions that we can here, such as alignment and
- * constant size.
+ * important assumptions that we can here, such as alignment.
*/
-static int flow_key_compare(struct flowi *key1, struct flowi *key2)
+static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
+ size_t keysize)
{
- flow_compare_t *k1, *k1_lim, *k2;
- const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
-
- if (sizeof(struct flowi) % sizeof(flow_compare_t))
- flowi_is_missized();
+ const flow_compare_t *k1, *k1_lim, *k2;
- k1 = (flow_compare_t *) key1;
- k1_lim = k1 + n_elem;
+ k1 = (const flow_compare_t *) key1;
+ k1_lim = k1 + keysize;
- k2 = (flow_compare_t *) key2;
+ k2 = (const flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
@@ -164,212 +188,324 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
return 0;
}
-void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
- flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
+ flow_resolve_t resolver, void *ctx)
{
- struct flow_cache_entry *fle, **head;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+ struct flow_cache_percpu *fcp;
+ struct flow_cache_entry *fle, *tfle;
+ struct flow_cache_object *flo;
+ size_t keysize;
unsigned int hash;
- int cpu;
local_bh_disable();
- cpu = smp_processor_id();
+ fcp = this_cpu_ptr(fc->percpu);
fle = NULL;
+ flo = NULL;
+
+ keysize = flow_key_size(family);
+ if (!keysize)
+ goto nocache;
+
/* Packet really early in init? Making flow_cache_init a
* pre-smp initcall would solve this. --RR */
- if (!flow_table(cpu))
+ if (!fcp->hash_table)
goto nocache;
- if (flow_hash_rnd_recalc(cpu))
- flow_new_hash_rnd(cpu);
- hash = flow_hash_code(key, cpu);
-
- head = &flow_table(cpu)[hash];
- for (fle = *head; fle; fle = fle->next) {
- if (fle->family == family &&
- fle->dir == dir &&
- fle->sk_sid == sk_sid &&
- flow_key_compare(key, &fle->key) == 0) {
- if (fle->genid == atomic_read(&flow_cache_genid)) {
- void *ret = fle->object;
+ if (fcp->hash_rnd_recalc)
+ flow_new_hash_rnd(fc, fcp);
- if (ret)
- atomic_inc(fle->object_ref);
- local_bh_enable();
-
- return ret;
- }
+ hash = flow_hash_code(fc, fcp, key, keysize);
+ hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
+ if (tfle->net == net &&
+ tfle->family == family &&
+ tfle->dir == dir &&
+ flow_key_compare(key, &tfle->key, keysize) == 0) {
+ fle = tfle;
break;
}
}
- if (!fle) {
- if (flow_count(cpu) > flow_hwm)
- flow_cache_shrink(cpu);
+ if (unlikely(!fle)) {
+ if (fcp->hash_count > fc->high_watermark)
+ flow_cache_shrink(fc, fcp);
- fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
+ fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
if (fle) {
- fle->next = *head;
- *head = fle;
+ fle->net = net;
fle->family = family;
fle->dir = dir;
- fle->sk_sid = sk_sid;
- memcpy(&fle->key, key, sizeof(*key));
+ memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
fle->object = NULL;
- flow_count(cpu)++;
+ hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
+ fcp->hash_count++;
}
+ } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
+ flo = fle->object;
+ if (!flo)
+ goto ret_object;
+ flo = flo->ops->get(flo);
+ if (flo)
+ goto ret_object;
+ } else if (fle->object) {
+ flo = fle->object;
+ flo->ops->delete(flo);
+ fle->object = NULL;
}
nocache:
- {
- void *obj;
- atomic_t *obj_ref;
-
- resolver(key, sk_sid, family, dir, &obj, &obj_ref);
-
- if (fle) {
- fle->genid = atomic_read(&flow_cache_genid);
-
- if (fle->object)
- atomic_dec(fle->object_ref);
-
- fle->object = obj;
- fle->object_ref = obj_ref;
- if (obj)
- atomic_inc(fle->object_ref);
- }
- local_bh_enable();
-
- return obj;
+ flo = NULL;
+ if (fle) {
+ flo = fle->object;
+ fle->object = NULL;
+ }
+ flo = resolver(net, key, family, dir, flo, ctx);
+ if (fle) {
+ fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
+ if (!IS_ERR(flo))
+ fle->object = flo;
+ else
+ fle->genid--;
+ } else {
+ if (!IS_ERR_OR_NULL(flo))
+ flo->ops->delete(flo);
}
+ret_object:
+ local_bh_enable();
+ return flo;
}
+EXPORT_SYMBOL(flow_cache_lookup);
static void flow_cache_flush_tasklet(unsigned long data)
{
struct flow_flush_info *info = (void *)data;
- int i;
- int cpu;
-
- cpu = smp_processor_id();
- for (i = 0; i < flow_hash_size; i++) {
- struct flow_cache_entry *fle;
-
- fle = flow_table(cpu)[i];
- for (; fle; fle = fle->next) {
- unsigned genid = atomic_read(&flow_cache_genid);
-
- if (!fle->object || fle->genid == genid)
+ struct flow_cache *fc = info->cache;
+ struct flow_cache_percpu *fcp;
+ struct flow_cache_entry *fle;
+ struct hlist_node *tmp;
+ LIST_HEAD(gc_list);
+ int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
+
+ fcp = this_cpu_ptr(fc->percpu);
+ for (i = 0; i < flow_cache_hash_size(fc); i++) {
+ hlist_for_each_entry_safe(fle, tmp,
+ &fcp->hash_table[i], u.hlist) {
+ if (flow_entry_valid(fle, xfrm))
continue;
- fle->object = NULL;
- atomic_dec(fle->object_ref);
+ deleted++;
+ hlist_del(&fle->u.hlist);
+ list_add_tail(&fle->u.gc_list, &gc_list);
}
}
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
+
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
}
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
+/*
+ * Return whether a cpu needs flushing. Conservatively, we assume
+ * the presence of any entries means the core may require flushing,
+ * since the flow_cache_ops.check() function may assume it's running
+ * on the same core as the per-cpu cache component.
+ */
+static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
+{
+ struct flow_cache_percpu *fcp;
+ int i;
+
+ fcp = per_cpu_ptr(fc->percpu, cpu);
+ for (i = 0; i < flow_cache_hash_size(fc); i++)
+ if (!hlist_empty(&fcp->hash_table[i]))
+ return 0;
+ return 1;
+}
+
static void flow_cache_flush_per_cpu(void *data)
{
struct flow_flush_info *info = data;
- int cpu;
struct tasklet_struct *tasklet;
- cpu = smp_processor_id();
-
- tasklet = flow_flush_tasklet(cpu);
+ tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
tasklet->data = (unsigned long)info;
tasklet_schedule(tasklet);
}
-void flow_cache_flush(void)
+void flow_cache_flush(struct net *net)
{
struct flow_flush_info info;
- static DECLARE_MUTEX(flow_flush_sem);
+ cpumask_var_t mask;
+ int i, self;
+
+ /* Track which cpus need flushing to avoid disturbing all cores. */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return;
+ cpumask_clear(mask);
/* Don't want cpus going down or up during this. */
- lock_cpu_hotplug();
- down(&flow_flush_sem);
- atomic_set(&info.cpuleft, num_online_cpus());
+ get_online_cpus();
+ mutex_lock(&net->xfrm.flow_flush_sem);
+ info.cache = &net->xfrm.flow_cache_global;
+ for_each_online_cpu(i)
+ if (!flow_cache_percpu_empty(info.cache, i))
+ cpumask_set_cpu(i, mask);
+ atomic_set(&info.cpuleft, cpumask_weight(mask));
+ if (atomic_read(&info.cpuleft) == 0)
+ goto done;
+
init_completion(&info.completion);
local_bh_disable();
- smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0);
- flow_cache_flush_tasklet((unsigned long)&info);
+ self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
+ on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
+ if (self)
+ flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
- up(&flow_flush_sem);
- unlock_cpu_hotplug();
+
+done:
+ mutex_unlock(&net->xfrm.flow_flush_sem);
+ put_online_cpus();
+ free_cpumask_var(mask);
}
-static void __devinit flow_cache_cpu_prepare(int cpu)
+static void flow_cache_flush_task(struct work_struct *work)
{
- struct tasklet_struct *tasklet;
- unsigned long order;
-
- for (order = 0;
- (PAGE_SIZE << order) <
- (sizeof(struct flow_cache_entry *)*flow_hash_size);
- order++)
- /* NOTHING */;
-
- flow_table(cpu) = (struct flow_cache_entry **)
- __get_free_pages(GFP_KERNEL, order);
- if (!flow_table(cpu))
- panic("NET: failed to allocate flow cache order %lu\n", order);
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
+ struct net *net = container_of(xfrm, struct net, xfrm);
- memset(flow_table(cpu), 0, PAGE_SIZE << order);
+ flow_cache_flush(net);
+}
- flow_hash_rnd_recalc(cpu) = 1;
- flow_count(cpu) = 0;
+void flow_cache_flush_deferred(struct net *net)
+{
+ schedule_work(&net->xfrm.flow_cache_flush_work);
+}
- tasklet = flow_flush_tasklet(cpu);
- tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
+static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
+{
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+ size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
+
+ if (!fcp->hash_table) {
+ fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+ if (!fcp->hash_table) {
+ pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
+ return -ENOMEM;
+ }
+ fcp->hash_rnd_recalc = 1;
+ fcp->hash_count = 0;
+ tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+ }
+ return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
static int flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- if (action == CPU_DEAD)
- __flow_cache_shrink((unsigned long)hcpu, 0);
+ struct flow_cache *fc = container_of(nfb, struct flow_cache,
+ hotcpu_notifier);
+ int res, cpu = (unsigned long) hcpu;
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ res = flow_cache_cpu_prepare(fc, cpu);
+ if (res)
+ return notifier_from_errno(res);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ __flow_cache_shrink(fc, fcp, 0);
+ break;
+ }
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
-static int __init flow_cache_init(void)
+int flow_cache_init(struct net *net)
{
int i;
-
- flow_cachep = kmem_cache_create("flow_cache",
- sizeof(struct flow_cache_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
if (!flow_cachep)
- panic("NET: failed to allocate flow cache slab\n");
-
- flow_hash_shift = 10;
- flow_lwm = 2 * flow_hash_size;
- flow_hwm = 4 * flow_hash_size;
+ flow_cachep = kmem_cache_create("flow_cache",
+ sizeof(struct flow_cache_entry),
+ 0, SLAB_PANIC, NULL);
+ spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+ INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+ INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+ INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+ mutex_init(&net->xfrm.flow_flush_sem);
+
+ fc->hash_shift = 10;
+ fc->low_watermark = 2 * flow_cache_hash_size(fc);
+ fc->high_watermark = 4 * flow_cache_hash_size(fc);
+
+ fc->percpu = alloc_percpu(struct flow_cache_percpu);
+ if (!fc->percpu)
+ return -ENOMEM;
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(i) {
+ if (flow_cache_cpu_prepare(fc, i))
+ goto err;
+ }
+ fc->hotcpu_notifier = (struct notifier_block){
+ .notifier_call = flow_cache_cpu,
+ };
+ __register_hotcpu_notifier(&fc->hotcpu_notifier);
- init_timer(&flow_hash_rnd_timer);
- flow_hash_rnd_timer.function = flow_cache_new_hashrnd;
- flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
- add_timer(&flow_hash_rnd_timer);
+ cpu_notifier_register_done();
- for_each_cpu(i)
- flow_cache_cpu_prepare(i);
+ setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+ (unsigned long) fc);
+ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+ add_timer(&fc->rnd_timer);
- hotcpu_notifier(flow_cache_cpu, 0);
return 0;
+
+err:
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ cpu_notifier_register_done();
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+
+ return -ENOMEM;
}
+EXPORT_SYMBOL(flow_cache_init);
+
+void flow_cache_fini(struct net *net)
+{
+ int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
-module_init(flow_cache_init);
+ del_timer_sync(&fc->rnd_timer);
+ unregister_hotcpu_notifier(&fc->hotcpu_notifier);
-EXPORT_SYMBOL(flow_cache_genid);
-EXPORT_SYMBOL(flow_cache_lookup);
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+}
+EXPORT_SYMBOL(flow_cache_fini);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 00000000000..107ed12a532
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,405 @@
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/igmp.h>
+#include <linux/icmp.h>
+#include <linux/sctp.h>
+#include <linux/dccp.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <net/flow_keys.h>
+
+/* copy saddr & daddr, possibly using 64bit load/store
+ * Equivalent to : flow->src = iph->saddr;
+ * flow->dst = iph->daddr;
+ */
+static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+{
+ BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
+ offsetof(typeof(*flow), src) + sizeof(flow->src));
+ memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+}
+
+/**
+ * skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: buffer to extract the ports from
+ * @thoff: transport header offset
+ * @ip_proto: protocol for which to get port offset
+ *
+ * The function will try to retrieve the ports at offset thoff + poff where poff
+ * is the protocol port offset returned from proto_ports_offset
+ */
+__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
+{
+ int poff = proto_ports_offset(ip_proto);
+
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ ports = skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), &_ports);
+ if (ports)
+ return *ports;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_flow_get_ports);
+
+bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
+{
+ int nhoff = skb_network_offset(skb);
+ u8 ip_proto;
+ __be16 proto = skb->protocol;
+
+ memset(flow, 0, sizeof(*flow));
+
+again:
+ switch (proto) {
+ case htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+ip:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph || iph->ihl < 5)
+ return false;
+ nhoff += iph->ihl * 4;
+
+ ip_proto = iph->protocol;
+ if (ip_is_fragment(iph))
+ ip_proto = 0;
+
+ iph_to_flow_copy_addrs(flow, iph);
+ break;
+ }
+ case htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+ipv6:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ ip_proto = iph->nexthdr;
+ flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+ flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+ nhoff += sizeof(struct ipv6hdr);
+ break;
+ }
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan;
+ struct vlan_hdr _vlan;
+
+ vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
+ if (!vlan)
+ return false;
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ goto again;
+ }
+ case htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ proto = hdr->proto;
+ nhoff += PPPOE_SES_HLEN;
+ switch (proto) {
+ case htons(PPP_IP):
+ goto ip;
+ case htons(PPP_IPV6):
+ goto ipv6;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ } *hdr, _hdr;
+
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ /*
+ * Only look inside GRE if version zero and no
+ * routing
+ */
+ if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
+ nhoff += 4;
+ if (hdr->flags & GRE_KEY)
+ nhoff += 4;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = skb_header_pointer(skb, nhoff,
+ sizeof(_eth), &_eth);
+ if (!eth)
+ return false;
+ proto = eth->h_proto;
+ nhoff += sizeof(*eth);
+ }
+ goto again;
+ }
+ break;
+ }
+ case IPPROTO_IPIP:
+ proto = htons(ETH_P_IP);
+ goto ip;
+ case IPPROTO_IPV6:
+ proto = htons(ETH_P_IPV6);
+ goto ipv6;
+ default:
+ break;
+ }
+
+ flow->ip_proto = ip_proto;
+ flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
+ flow->thoff = (u16) nhoff;
+
+ return true;
+}
+EXPORT_SYMBOL(skb_flow_dissect);
+
+static u32 hashrnd __read_mostly;
+static __always_inline void __flow_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
+{
+ __flow_hash_secret_init();
+ return jhash_3words(a, b, c, hashrnd);
+}
+
+static __always_inline u32 __flow_hash_1word(u32 a)
+{
+ __flow_hash_secret_init();
+ return jhash_1word(a, hashrnd);
+}
+
+/*
+ * __skb_get_hash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_hash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
+ */
+void __skb_get_hash(struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 hash;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return;
+
+ if (keys.ports)
+ skb->l4_hash = 1;
+
+ /* get a consistent hash (same value on both flow directions) */
+ if (((__force u32)keys.dst < (__force u32)keys.src) ||
+ (((__force u32)keys.dst == (__force u32)keys.src) &&
+ ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
+ swap(keys.dst, keys.src);
+ swap(keys.port16[0], keys.port16[1]);
+ }
+
+ hash = __flow_hash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports);
+ if (!hash)
+ hash = 1;
+
+ skb->hash = hash;
+}
+EXPORT_SYMBOL(__skb_get_hash);
+
+/*
+ * Returns a Tx hash based on the given packet descriptor a Tx queues' number
+ * to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+ unsigned int num_tx_queues)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
+ return hash;
+ }
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16) skb->protocol;
+ hash = __flow_hash_1word(hash);
+
+ return (u16) (((u64) hash * qcount) >> 32) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
+/* __skb_get_poff() returns the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push actual payload to the user
+ * space and can analyze headers only, instead.
+ */
+u32 __skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 poff = 0;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return 0;
+
+ poff += keys.thoff;
+ switch (keys.ip_proto) {
+ case IPPROTO_TCP: {
+ const struct tcphdr *tcph;
+ struct tcphdr _tcph;
+
+ tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return poff;
+
+ poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4);
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ poff += sizeof(struct udphdr);
+ break;
+ /* For the rest, we do not really care about header
+ * extensions at this point for now.
+ */
+ case IPPROTO_ICMP:
+ poff += sizeof(struct icmphdr);
+ break;
+ case IPPROTO_ICMPV6:
+ poff += sizeof(struct icmp6hdr);
+ break;
+ case IPPROTO_IGMP:
+ poff += sizeof(struct igmphdr);
+ break;
+ case IPPROTO_DCCP:
+ poff += sizeof(struct dccp_hdr);
+ break;
+ case IPPROTO_SCTP:
+ poff += sizeof(struct sctphdr);
+ break;
+ }
+
+ return poff;
+}
+
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[raw_smp_processor_id()]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else {
+ u32 hash;
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16) skb->protocol ^
+ skb->hash;
+ hash = __flow_hash_1word(hash);
+ queue_index = map->queues[
+ ((u64)hash * map->len) >> 32];
+ }
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, skb);
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, skb);
+
+ if (queue_index != new_index && sk &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv)
+{
+ int queue_index = 0;
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb);
+
+ if (!accel_priv)
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index b07c029e821..6b5b6e7013c 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -14,8 +14,7 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/bitops.h>
+#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -31,6 +30,8 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/gen_stats.h>
@@ -65,9 +66,9 @@
NOTES.
- * The stored value for avbps is scaled by 2^5, so that maximal
- rate is ~1Gbit, avpps is scaled by 2^10.
-
+ * avbps is scaled by 2^5, avpps is scaled by 2^10.
+ * both values are reported as 32 bit unsigned values. bps can
+ overflow for fast links : max speed being 34360Mbit/sec
* Minimal interval is HZ/4=250msec (it is the greatest common divisor
for HZ=100 and HZ=1024 8)), maximal interval
is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
@@ -79,57 +80,110 @@
struct gen_estimator
{
- struct gen_estimator *next;
- struct gnet_stats_basic *bstats;
- struct gnet_stats_rate_est *rate_est;
+ struct list_head list;
+ struct gnet_stats_basic_packed *bstats;
+ struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
- unsigned interval;
int ewma_log;
u64 last_bytes;
+ u64 avbps;
u32 last_packets;
u32 avpps;
- u32 avbps;
+ struct rcu_head e_rcu;
+ struct rb_node node;
};
struct gen_estimator_head
{
struct timer_list timer;
- struct gen_estimator *list;
+ struct list_head list;
};
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-/* Estimator array lock */
+/* Protects against NULL dereference */
static DEFINE_RWLOCK(est_lock);
+/* Protects against soft lockup during large deletion */
+static struct rb_root est_root = RB_ROOT;
+static DEFINE_SPINLOCK(est_tree_lock);
+
static void est_timer(unsigned long arg)
{
int idx = (int)arg;
struct gen_estimator *e;
- read_lock(&est_lock);
- for (e = elist[idx].list; e; e = e->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &elist[idx].list, list) {
u64 nbytes;
+ u64 brate;
u32 npackets;
u32 rate;
spin_lock(e->stats_lock);
+ read_lock(&est_lock);
+ if (e->bstats == NULL)
+ goto skip;
+
nbytes = e->bstats->bytes;
npackets = e->bstats->packets;
- rate = (nbytes - e->last_bytes)<<(7 - idx);
+ brate = (nbytes - e->last_bytes)<<(7 - idx);
e->last_bytes = nbytes;
- e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
+ e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
rate = (npackets - e->last_packets)<<(12 - idx);
e->last_packets = npackets;
- e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
+ e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+ read_unlock(&est_lock);
spin_unlock(e->stats_lock);
}
- mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
- read_unlock(&est_lock);
+ if (!list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+ rcu_read_unlock();
+}
+
+static void gen_add_node(struct gen_estimator *est)
+{
+ struct rb_node **p = &est_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct gen_estimator *e;
+
+ parent = *p;
+ e = rb_entry(parent, struct gen_estimator, node);
+
+ if (est->bstats > e->bstats)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&est->node, parent, p);
+ rb_insert_color(&est->node, &est_root);
+}
+
+static
+struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
+ const struct gnet_stats_rate_est64 *rate_est)
+{
+ struct rb_node *p = est_root.rb_node;
+
+ while (p) {
+ struct gen_estimator *e;
+
+ e = rb_entry(p, struct gen_estimator, node);
+
+ if (bstats > e->bstats)
+ p = p->rb_right;
+ else if (bstats < e->bstats || rate_est != e->rate_est)
+ p = p->rb_left;
+ else
+ return e;
+ }
+ return NULL;
}
/**
@@ -144,27 +198,30 @@ static void est_timer(unsigned long arg)
* configuration TLV is created. Upon each interval, the latest statistics
* will be read from &bstats and the estimated rate will be stored in
* &rate_est with the statistics lock grabed during this period.
- *
+ *
* Returns 0 on success or a negative error code.
+ *
*/
-int gen_new_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_rate_est64 *rate_est,
+ spinlock_t *stats_lock,
+ struct nlattr *opt)
{
struct gen_estimator *est;
- struct gnet_estimator *parm = RTA_DATA(opt);
+ struct gnet_estimator *parm = nla_data(opt);
+ int idx;
- if (RTA_PAYLOAD(opt) < sizeof(*parm))
+ if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
- est = kmalloc(sizeof(*est), GFP_KERNEL);
+ est = kzalloc(sizeof(*est), GFP_KERNEL);
if (est == NULL)
return -ENOBUFS;
- memset(est, 0, sizeof(*est));
- est->interval = parm->interval + 2;
+ idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
@@ -174,57 +231,54 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
est->last_packets = bstats->packets;
est->avpps = rate_est->pps<<10;
- est->next = elist[est->interval].list;
- if (est->next == NULL) {
- init_timer(&elist[est->interval].timer);
- elist[est->interval].timer.data = est->interval;
- elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
- elist[est->interval].timer.function = est_timer;
- add_timer(&elist[est->interval].timer);
+ spin_lock_bh(&est_tree_lock);
+ if (!elist[idx].timer.function) {
+ INIT_LIST_HEAD(&elist[idx].list);
+ setup_timer(&elist[idx].timer, est_timer, idx);
}
- write_lock_bh(&est_lock);
- elist[est->interval].list = est;
- write_unlock_bh(&est_lock);
+
+ if (list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+
+ list_add_rcu(&est->list, &elist[idx].list);
+ gen_add_node(est);
+ spin_unlock_bh(&est_tree_lock);
+
return 0;
}
+EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
* @bstats: basic statistics
* @rate_est: rate estimator statistics
*
- * Removes the rate estimator specified by &bstats and &rate_est
- * and deletes the timer.
+ * Removes the rate estimator specified by &bstats and &rate_est.
+ *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est)
+void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_rate_est64 *rate_est)
{
- int idx;
- struct gen_estimator *est, **pest;
-
- for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
- int killed = 0;
- pest = &elist[idx].list;
- while ((est=*pest) != NULL) {
- if (est->rate_est != rate_est || est->bstats != bstats) {
- pest = &est->next;
- continue;
- }
-
- write_lock_bh(&est_lock);
- *pest = est->next;
- write_unlock_bh(&est_lock);
-
- kfree(est);
- killed++;
- }
- if (killed && elist[idx].list == NULL)
- del_timer(&elist[idx].timer);
+ struct gen_estimator *e;
+
+ spin_lock_bh(&est_tree_lock);
+ while ((e = gen_find_node(bstats, rate_est))) {
+ rb_erase(&e->node, &est_root);
+
+ write_lock(&est_lock);
+ e->bstats = NULL;
+ write_unlock(&est_lock);
+
+ list_del_rcu(&e->list);
+ kfree_rcu(e, e_rcu);
}
+ spin_unlock_bh(&est_tree_lock);
}
+EXPORT_SYMBOL(gen_kill_estimator);
/**
- * gen_replace_estimator - replace rate estimator configruation
+ * gen_replace_estimator - replace rate estimator configuration
* @bstats: basic statistics
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
@@ -232,19 +286,36 @@ void gen_kill_estimator(struct gnet_stats_basic *bstats,
*
* Replaces the configuration of a rate estimator by calling
* gen_kill_estimator() and gen_new_estimator().
- *
+ *
* Returns 0 on success or a negative error code.
*/
-int
-gen_replace_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock,
- struct rtattr *opt)
+int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_rate_est64 *rate_est,
+ spinlock_t *stats_lock, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+ gen_kill_estimator(bstats, rate_est);
+ return gen_new_estimator(bstats, rate_est, stats_lock, opt);
}
-
-
-EXPORT_SYMBOL(gen_kill_estimator);
-EXPORT_SYMBOL(gen_new_estimator);
EXPORT_SYMBOL(gen_replace_estimator);
+
+/**
+ * gen_estimator_active - test if estimator is currently in use
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ *
+ * Returns true if estimator is active, and false if not.
+ */
+bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
+ const struct gnet_stats_rate_est64 *rate_est)
+{
+ bool res;
+
+ ASSERT_RTNL();
+
+ spin_lock_bh(&est_tree_lock);
+ res = gen_find_node(bstats, rate_est) != NULL;
+ spin_unlock_bh(&est_tree_lock);
+
+ return res;
+}
+EXPORT_SYMBOL(gen_estimator_active);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 8f21490355f..9d3d9e78397 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -20,16 +20,18 @@
#include <linux/socket.h>
#include <linux/rtnetlink.h>
#include <linux/gen_stats.h>
+#include <net/netlink.h>
#include <net/gen_stats.h>
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
{
- RTA_PUT(d->skb, type, size, buf);
+ if (nla_put(d->skb, type, size, buf))
+ goto nla_put_failure;
return 0;
-rtattr_failure:
+nla_put_failure:
spin_unlock_bh(d->lock);
return -1;
}
@@ -55,13 +57,14 @@ rtattr_failure:
int
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+ __acquires(lock)
{
memset(d, 0, sizeof(*d));
-
+
spin_lock_bh(lock);
d->lock = lock;
if (type)
- d->tail = (struct rtattr *) skb->tail;
+ d->tail = (struct nlattr *)skb_tail_pointer(skb);
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
@@ -71,6 +74,7 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
return 0;
}
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
/**
* gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
@@ -91,6 +95,7 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
{
return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
}
+EXPORT_SYMBOL(gnet_stats_start_copy);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
@@ -104,22 +109,29 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b)
+gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
{
if (d->compat_tc_stats) {
d->tc_stats.bytes = b->bytes;
d->tc_stats.packets = b->packets;
}
- if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b));
+ if (d->tail) {
+ struct gnet_stats_basic sb;
+ memset(&sb, 0, sizeof(sb));
+ sb.bytes = b->bytes;
+ sb.packets = b->packets;
+ return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
+ }
return 0;
}
+EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
+ * @b: basic statistics
* @r: rate estimator statistics
*
* Appends the rate estimator statistics to the top level TLV created by
@@ -129,18 +141,36 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b)
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r)
+gnet_stats_copy_rate_est(struct gnet_dump *d,
+ const struct gnet_stats_basic_packed *b,
+ struct gnet_stats_rate_est64 *r)
{
+ struct gnet_stats_rate_est est;
+ int res;
+
+ if (b && !gen_estimator_active(b, r))
+ return 0;
+
+ est.bps = min_t(u64, UINT_MAX, r->bps);
+ /* we have some time before reaching 2^32 packets per second */
+ est.pps = r->pps;
+
if (d->compat_tc_stats) {
- d->tc_stats.bps = r->bps;
- d->tc_stats.pps = r->pps;
+ d->tc_stats.bps = est.bps;
+ d->tc_stats.pps = est.pps;
}
- if (d->tail)
- return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
+ if (d->tail) {
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
+ if (res < 0 || est.bps == r->bps)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
+ }
return 0;
}
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
@@ -168,6 +198,7 @@ gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
return 0;
}
+EXPORT_SYMBOL(gnet_stats_copy_queue);
/**
* gnet_stats_copy_app - copy application specific statistics into statistics TLV
@@ -195,6 +226,7 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
return 0;
}
+EXPORT_SYMBOL(gnet_stats_copy_app);
/**
* gnet_stats_finish_copy - finish dumping procedure
@@ -212,7 +244,7 @@ int
gnet_stats_finish_copy(struct gnet_dump *d)
{
if (d->tail)
- d->tail->rta_len = d->skb->tail - (u8 *) d->tail;
+ d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
if (d->compat_tc_stats)
if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
@@ -228,12 +260,4 @@ gnet_stats_finish_copy(struct gnet_dump *d)
spin_unlock_bh(d->lock);
return 0;
}
-
-
-EXPORT_SYMBOL(gnet_stats_start_copy);
-EXPORT_SYMBOL(gnet_stats_start_copy_compat);
-EXPORT_SYMBOL(gnet_stats_copy_basic);
-EXPORT_SYMBOL(gnet_stats_copy_rate_est);
-EXPORT_SYMBOL(gnet_stats_copy_queue);
-EXPORT_SYMBOL(gnet_stats_copy_app);
EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 65e4b56fbc7..e1ec45ab1e6 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -18,10 +18,8 @@
#include <linux/errno.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/slab.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <asm/uaccess.h>
@@ -37,13 +35,15 @@
* in any case.
*/
-int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode)
+int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
{
- int size, err, ct;
-
- if (m->msg_namelen) {
+ int size, ct, err;
+
+ if (m->msg_name && m->msg_namelen) {
if (mode == VERIFY_READ) {
- err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
+ void __user *namep;
+ namep = (void __user __force *) m->msg_name;
+ err = move_addr_to_kernel(namep, m->msg_namelen,
address);
if (err < 0)
return err;
@@ -51,102 +51,27 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode)
m->msg_name = address;
} else {
m->msg_name = NULL;
+ m->msg_namelen = 0;
}
size = m->msg_iovlen * sizeof(struct iovec);
- if (copy_from_user(iov, m->msg_iov, size))
+ if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
return -EFAULT;
m->msg_iov = iov;
err = 0;
for (ct = 0; ct < m->msg_iovlen; ct++) {
- err += iov[ct].iov_len;
- /*
- * Goal is not to verify user data, but to prevent returning
- * negative value, which is interpreted as errno.
- * Overflow is still possible, but it is harmless.
- */
- if (err < 0)
- return -EMSGSIZE;
- }
+ size_t len = iov[ct].iov_len;
- return err;
-}
-
-/*
- * Copy kernel to iovec. Returns -EFAULT on error.
- *
- * Note: this modifies the original iovec.
- */
-
-int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
-{
- while (len > 0) {
- if (iov->iov_len) {
- int copy = min_t(unsigned int, iov->iov_len, len);
- if (copy_to_user(iov->iov_base, kdata, copy))
- return -EFAULT;
- kdata += copy;
- len -= copy;
- iov->iov_len -= copy;
- iov->iov_base += copy;
+ if (len > INT_MAX - err) {
+ len = INT_MAX - err;
+ iov[ct].iov_len = len;
}
- iov++;
+ err += len;
}
- return 0;
-}
-
-/*
- * Copy iovec to kernel. Returns -EFAULT on error.
- *
- * Note: this modifies the original iovec.
- */
-
-int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
-{
- while (len > 0) {
- if (iov->iov_len) {
- int copy = min_t(unsigned int, len, iov->iov_len);
- if (copy_from_user(kdata, iov->iov_base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- iov++;
- }
-
- return 0;
-}
-
-/*
- * For use with ip_build_xmit
- */
-int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
- int len)
-{
- /* Skip over the finished iovecs */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- }
-
- while (len > 0) {
- u8 __user *base = iov->iov_base + offset;
- int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
- offset = 0;
- if (copy_from_user(kdata, base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov++;
- }
-
- return 0;
+ return err;
}
/*
@@ -158,9 +83,9 @@ int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
* call to this function will be unaligned also.
*/
int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
- int offset, unsigned int len, int *csump)
+ int offset, unsigned int len, __wsum *csump)
{
- int csum = *csump;
+ __wsum csum = *csump;
int partial_cnt = 0, err = 0;
/* Skip over the finished iovecs */
@@ -209,7 +134,7 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
if (partial_cnt) {
copy -= partial_cnt;
if (copy_from_user(kdata + copy, base + copy,
- partial_cnt))
+ partial_cnt))
goto out_fault;
}
}
@@ -224,7 +149,7 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
kdata += copy + partial_cnt;
iov++;
}
- *csump = csum;
+ *csump = csum;
out:
return err;
@@ -232,8 +157,28 @@ out_fault:
err = -EFAULT;
goto out;
}
-
EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
-EXPORT_SYMBOL(memcpy_fromiovec);
-EXPORT_SYMBOL(memcpy_fromiovecend);
-EXPORT_SYMBOL(memcpy_toiovec);
+
+unsigned long iov_pages(const struct iovec *iov, int offset,
+ unsigned long nr_segs)
+{
+ unsigned long seg, base;
+ int pages = 0, len, size;
+
+ while (nr_segs && (offset >= iov->iov_len)) {
+ offset -= iov->iov_len;
+ ++iov;
+ --nr_segs;
+ }
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ base = (unsigned long)iov[seg].iov_base + offset;
+ len = iov[seg].iov_len - offset;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ pages += size;
+ offset = 0;
+ }
+
+ return pages;
+}
+EXPORT_SYMBOL(iov_pages);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index d43d1201275..bd0767e6b2b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -11,7 +11,6 @@
*
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if.h>
@@ -20,125 +19,235 @@
#include <linux/rtnetlink.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <asm/types.h>
enum lw_bits {
- LW_RUNNING = 0,
- LW_SE_USED
+ LW_URGENT = 0,
};
static unsigned long linkwatch_flags;
static unsigned long linkwatch_nextevent;
-static void linkwatch_event(void *dummy);
-static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL);
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
-struct lw_event {
- struct list_head list;
- struct net_device *dev;
-};
+static unsigned char default_operstate(const struct net_device *dev)
+{
+ if (!netif_carrier_ok(dev))
+ return (dev->ifindex != dev->iflink ?
+ IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
-/* Avoid kmalloc() for most systems */
-static struct lw_event singleevent;
+ if (netif_dormant(dev))
+ return IF_OPER_DORMANT;
-/* Must be called with the rtnl semaphore held */
-void linkwatch_run_queue(void)
+ return IF_OPER_UP;
+}
+
+
+static void rfc2863_policy(struct net_device *dev)
{
- LIST_HEAD(head);
- struct list_head *n, *next;
+ unsigned char operstate = default_operstate(dev);
- spin_lock_irq(&lweventlist_lock);
- list_splice_init(&lweventlist, &head);
- spin_unlock_irq(&lweventlist_lock);
+ if (operstate == dev->operstate)
+ return;
- list_for_each_safe(n, next, &head) {
- struct lw_event *event = list_entry(n, struct lw_event, list);
- struct net_device *dev = event->dev;
+ write_lock_bh(&dev_base_lock);
- if (event == &singleevent) {
- clear_bit(LW_SE_USED, &linkwatch_flags);
- } else {
- kfree(event);
- }
+ switch(dev->link_mode) {
+ case IF_LINK_MODE_DORMANT:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_DORMANT;
+ break;
- /* We are about to handle this device,
- * so new events can be accepted
- */
- clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+ case IF_LINK_MODE_DEFAULT:
+ default:
+ break;
+ }
- if (dev->flags & IFF_UP) {
- if (netif_carrier_ok(dev)) {
- WARN_ON(dev->qdisc_sleeping == &noop_qdisc);
- dev_activate(dev);
- } else
- dev_deactivate(dev);
+ dev->operstate = operstate;
- netdev_state_change(dev);
- }
+ write_unlock_bh(&dev_base_lock);
+}
- dev_put(dev);
- }
-}
+void linkwatch_init_dev(struct net_device *dev)
+{
+ /* Handle pre-registration link state changes */
+ if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ rfc2863_policy(dev);
+}
-static void linkwatch_event(void *dummy)
+
+static bool linkwatch_urgent_event(struct net_device *dev)
{
- /* Limit the number of linkwatch events to one
- * per second so that a runaway driver does not
- * cause a storm of messages on the netlink
- * socket
- */
- linkwatch_nextevent = jiffies + HZ;
- clear_bit(LW_RUNNING, &linkwatch_flags);
-
- rtnl_shlock();
- linkwatch_run_queue();
- rtnl_shunlock();
+ if (!netif_running(dev))
+ return false;
+
+ if (dev->ifindex != dev->iflink)
+ return true;
+
+ if (dev->priv_flags & IFF_TEAM_PORT)
+ return true;
+
+ return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}
-void linkwatch_fire_event(struct net_device *dev)
+static void linkwatch_add_event(struct net_device *dev)
{
- if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
- unsigned long flags;
- struct lw_event *event;
-
- if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) {
- event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC);
-
- if (unlikely(event == NULL)) {
- clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
- return;
- }
- } else {
- event = &singleevent;
- }
+ unsigned long flags;
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (list_empty(&dev->link_watch_list)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
dev_hold(dev);
- event->dev = dev;
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+}
+
+
+static void linkwatch_schedule_work(int urgent)
+{
+ unsigned long delay = linkwatch_nextevent - jiffies;
+
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ return;
+
+ /* Minimise down-time: drop delay for up event. */
+ if (urgent) {
+ if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
+ return;
+ delay = 0;
+ }
+
+ /* If we wrap around we'll delay it by at most HZ. */
+ if (delay > HZ)
+ delay = 0;
+
+ /*
+ * If urgent, schedule immediate execution; otherwise, don't
+ * override the existing timer.
+ */
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ mod_delayed_work(system_wq, &linkwatch_work, 0);
+ else
+ schedule_delayed_work(&linkwatch_work, delay);
+}
+
- spin_lock_irqsave(&lweventlist_lock, flags);
- list_add_tail(&event->list, &lweventlist);
- spin_unlock_irqrestore(&lweventlist_lock, flags);
+static void linkwatch_do_dev(struct net_device *dev)
+{
+ /*
+ * Make sure the above read is complete since it can be
+ * rewritten as soon as we clear the bit below.
+ */
+ smp_mb__before_atomic();
+
+ /* We are about to handle this device,
+ * so new events can be accepted
+ */
+ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+ rfc2863_policy(dev);
+ if (dev->flags & IFF_UP) {
+ if (netif_carrier_ok(dev))
+ dev_activate(dev);
+ else
+ dev_deactivate(dev);
+
+ netdev_state_change(dev);
+ }
+ dev_put(dev);
+}
+
+static void __linkwatch_run_queue(int urgent_only)
+{
+ struct net_device *dev;
+ LIST_HEAD(wrk);
- if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) {
- unsigned long thisevent = jiffies;
+ /*
+ * Limit the number of linkwatch events to one
+ * per second so that a runaway driver does not
+ * cause a storm of messages on the netlink
+ * socket. This limit does not apply to up events
+ * while the device qdisc is down.
+ */
+ if (!urgent_only)
+ linkwatch_nextevent = jiffies + HZ;
+ /* Limit wrap-around effect on delay. */
+ else if (time_after(linkwatch_nextevent, jiffies + HZ))
+ linkwatch_nextevent = jiffies;
- if (thisevent >= linkwatch_nextevent) {
- schedule_work(&linkwatch_work);
- } else {
- schedule_delayed_work(&linkwatch_work, linkwatch_nextevent - thisevent);
- }
+ clear_bit(LW_URGENT, &linkwatch_flags);
+
+ spin_lock_irq(&lweventlist_lock);
+ list_splice_init(&lweventlist, &wrk);
+
+ while (!list_empty(&wrk)) {
+
+ dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+ list_del_init(&dev->link_watch_list);
+
+ if (urgent_only && !linkwatch_urgent_event(dev)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ continue;
}
+ spin_unlock_irq(&lweventlist_lock);
+ linkwatch_do_dev(dev);
+ spin_lock_irq(&lweventlist_lock);
}
+
+ if (!list_empty(&lweventlist))
+ linkwatch_schedule_work(0);
+ spin_unlock_irq(&lweventlist_lock);
}
+void linkwatch_forget_dev(struct net_device *dev)
+{
+ unsigned long flags;
+ int clean = 0;
+
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (!list_empty(&dev->link_watch_list)) {
+ list_del_init(&dev->link_watch_list);
+ clean = 1;
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+ if (clean)
+ linkwatch_do_dev(dev);
+}
+
+
+/* Must be called with the rtnl semaphore held */
+void linkwatch_run_queue(void)
+{
+ __linkwatch_run_queue(0);
+}
+
+
+static void linkwatch_event(struct work_struct *dummy)
+{
+ rtnl_lock();
+ __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
+ rtnl_unlock();
+}
+
+
+void linkwatch_fire_event(struct net_device *dev)
+{
+ bool urgent = linkwatch_urgent_event(dev);
+
+ if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
+ linkwatch_add_event(dev);
+ } else if (!urgent)
+ return;
+
+ linkwatch_schedule_work(urgent);
+}
EXPORT_SYMBOL(linkwatch_fire_event);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e68700f950a..ef31fef25e5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -15,54 +15,50 @@
* Harald Welte Add neighbour cache statistics like rtstat
*/
-#include <linux/config.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
-#include <linux/sched.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/times.h>
+#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/dst.h>
#include <net/sock.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
#include <linux/string.h>
+#include <linux/log2.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#define DEBUG
#define NEIGH_DEBUG 1
-
-#define NEIGH_PRINTK(x...) printk(x)
-#define NEIGH_NOPRINTK(x...) do { ; } while(0)
-#define NEIGH_PRINTK0 NEIGH_PRINTK
-#define NEIGH_PRINTK1 NEIGH_NOPRINTK
-#define NEIGH_PRINTK2 NEIGH_NOPRINTK
-
-#if NEIGH_DEBUG >= 1
-#undef NEIGH_PRINTK1
-#define NEIGH_PRINTK1 NEIGH_PRINTK
-#endif
-#if NEIGH_DEBUG >= 2
-#undef NEIGH_PRINTK2
-#define NEIGH_PRINTK2 NEIGH_PRINTK
-#endif
+#define neigh_dbg(level, fmt, ...) \
+do { \
+ if (level <= NEIGH_DEBUG) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
#define PNEIGH_HASHMASK 0xF
static void neigh_timer_handler(unsigned long arg);
-#ifdef CONFIG_ARPD
-static void neigh_app_notify(struct neighbour *n);
-#endif
+static void __neigh_notify(struct neighbour *n, int type, int flags);
+static void neigh_update_notify(struct neighbour *neigh);
static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
-void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
static struct neigh_table *neigh_tables;
#ifdef CONFIG_PROC_FS
-static struct file_operations neigh_stat_seq_fops;
+static const struct file_operations neigh_stat_seq_fops;
#endif
/*
@@ -98,12 +94,21 @@ static struct file_operations neigh_stat_seq_fops;
static DEFINE_RWLOCK(neigh_tbl_lock);
-static int neigh_blackhole(struct sk_buff *skb)
+static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
return -ENETDOWN;
}
+static void neigh_cleanup_and_release(struct neighbour *neigh)
+{
+ if (neigh->parms->neigh_cleanup)
+ neigh->parms->neigh_cleanup(neigh);
+
+ __neigh_notify(neigh, RTM_DELNEIGH, 0);
+ neigh_release(neigh);
+}
+
/*
* It is random distribution in the interval (1/2)*base...(3/2)*base.
* It corresponds to default IPv6 settings and is not overridable,
@@ -112,23 +117,29 @@ static int neigh_blackhole(struct sk_buff *skb)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return (base ? (net_random() % base) + (base >> 1) : 0);
+ return base ? (prandom_u32() % base) + (base >> 1) : 0;
}
+EXPORT_SYMBOL(neigh_rand_reach_time);
static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
int i;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
- for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np;
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
- np = &tbl->hash_buckets[i];
- while ((n = *np) != NULL) {
+ np = &nht->hash_buckets[i];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
/* Neighbour record may be discarded if:
* - nobody refers to it.
* - it is not permanent
@@ -136,11 +147,13 @@ static int neigh_forced_gc(struct neigh_table *tbl)
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
- neigh_release(n);
+ neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
@@ -155,6 +168,16 @@ static int neigh_forced_gc(struct neigh_table *tbl)
return shrunk;
}
+static void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+ neigh_hold(n);
+ if (unlikely(mod_timer(&n->timer, when))) {
+ printk("NEIGH: BUG, double timer add, state is %x\n",
+ n->nud_state);
+ dump_stack();
+ }
+}
+
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
@@ -178,16 +201,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
{
int i;
+ struct neigh_hash_table *nht;
- for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np = &tbl->hash_buckets[i];
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
- while ((n = *np) != NULL) {
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np = &nht->hash_buckets[i];
+
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
if (dev && n->dev != dev) {
np = &n->next;
continue;
}
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
write_lock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
@@ -202,16 +233,17 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
we must kill timers etc. and move
it to safe state.
*/
- skb_queue_purge(&n->arp_queue);
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
n->nud_state = NUD_NONE;
- NEIGH_PRINTK2("neigh %p is stray.\n", n);
+ neigh_dbg(2, "neigh %p is stray\n", n);
}
write_unlock(&n->lock);
- neigh_release(n);
+ neigh_cleanup_and_release(n);
}
}
}
@@ -222,6 +254,7 @@ void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
neigh_flush_dev(tbl, dev);
write_unlock_bh(&tbl->lock);
}
+EXPORT_SYMBOL(neigh_changeaddr);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
@@ -234,8 +267,9 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
pneigh_queue_purge(&tbl->proxy_queue);
return 0;
}
+EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -250,21 +284,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
goto out_entries;
}
- n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC);
+ n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
- memset(n, 0, tbl->entry_size);
-
- skb_queue_head_init(&n->arp_queue);
+ __skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
+ seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
+ seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
- init_timer(&n->timer);
- n->timer.function = neigh_timer_handler;
- n->timer.data = (unsigned long)n;
+ setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
@@ -278,67 +310,93 @@ out_entries:
goto out;
}
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static void neigh_get_hash_rnd(u32 *x)
{
- unsigned long size = entries * sizeof(struct neighbour *);
- struct neighbour **ret;
+ get_random_bytes(x, sizeof(*x));
+ *x |= 1;
+}
- if (size <= PAGE_SIZE) {
- ret = kmalloc(size, GFP_ATOMIC);
- } else {
- ret = (struct neighbour **)
- __get_free_pages(GFP_ATOMIC, get_order(size));
- }
- if (ret)
- memset(ret, 0, size);
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
+{
+ size_t size = (1 << shift) * sizeof(struct neighbour *);
+ struct neigh_hash_table *ret;
+ struct neighbour __rcu **buckets;
+ int i;
+ ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+ if (!ret)
+ return NULL;
+ if (size <= PAGE_SIZE)
+ buckets = kzalloc(size, GFP_ATOMIC);
+ else
+ buckets = (struct neighbour __rcu **)
+ __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+ get_order(size));
+ if (!buckets) {
+ kfree(ret);
+ return NULL;
+ }
+ ret->hash_buckets = buckets;
+ ret->hash_shift = shift;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free_rcu(struct rcu_head *head)
{
- unsigned long size = entries * sizeof(struct neighbour *);
+ struct neigh_hash_table *nht = container_of(head,
+ struct neigh_hash_table,
+ rcu);
+ size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
+ struct neighbour __rcu **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
- kfree(hash);
+ kfree(buckets);
else
- free_pages((unsigned long)hash, get_order(size));
+ free_pages((unsigned long)buckets, get_order(size));
+ kfree(nht);
}
-static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+ unsigned long new_shift)
{
- struct neighbour **new_hash, **old_hash;
- unsigned int i, new_hash_mask, old_entries;
+ unsigned int i, hash;
+ struct neigh_hash_table *new_nht, *old_nht;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
- BUG_ON(new_entries & (new_entries - 1));
- new_hash = neigh_hash_alloc(new_entries);
- if (!new_hash)
- return;
-
- old_entries = tbl->hash_mask + 1;
- new_hash_mask = new_entries - 1;
- old_hash = tbl->hash_buckets;
+ old_nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ new_nht = neigh_hash_alloc(new_shift);
+ if (!new_nht)
+ return old_nht;
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
- for (i = 0; i < old_entries; i++) {
+ for (i = 0; i < (1 << old_nht->hash_shift); i++) {
struct neighbour *n, *next;
- for (n = old_hash[i]; n; n = next) {
- unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
-
- hash_val &= new_hash_mask;
- next = n->next;
-
- n->next = new_hash[hash_val];
- new_hash[hash_val] = n;
+ for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
+ lockdep_is_held(&tbl->lock));
+ n != NULL;
+ n = next) {
+ hash = tbl->hash(n->primary_key, n->dev,
+ new_nht->hash_rnd);
+
+ hash >>= (32 - new_nht->hash_shift);
+ next = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(
+ new_nht->hash_buckets[hash],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(new_nht->hash_buckets[hash], n);
}
}
- tbl->hash_buckets = new_hash;
- tbl->hash_mask = new_hash_mask;
- neigh_hash_free(old_hash, old_entries);
+ rcu_assign_pointer(tbl->nht, new_nht);
+ call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+ return new_nht;
}
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -346,49 +404,70 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
{
struct neighbour *n;
int key_len = tbl->key_len;
- u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
-
+ u32 hash_val;
+ struct neigh_hash_table *nht;
+
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
- neigh_hold(n);
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
- read_unlock_bh(&tbl->lock);
+
+ rcu_read_unlock_bh();
return n;
}
+EXPORT_SYMBOL(neigh_lookup);
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
+struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
+ const void *pkey)
{
struct neighbour *n;
int key_len = tbl->key_len;
- u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask;
+ u32 hash_val;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
- if (!memcmp(n->primary_key, pkey, key_len)) {
- neigh_hold(n);
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (!memcmp(n->primary_key, pkey, key_len) &&
+ net_eq(dev_net(n->dev), net)) {
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
- read_unlock_bh(&tbl->lock);
+
+ rcu_read_unlock_bh();
return n;
}
+EXPORT_SYMBOL(neigh_lookup_nodev);
-struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev)
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
{
u32 hash_val;
int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
+ struct neigh_hash_table *nht;
if (!n) {
rc = ERR_PTR(-ENOBUFS);
@@ -405,6 +484,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
@@ -412,34 +499,44 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
- n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
+ n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
- if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
- neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+ if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
+ nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
- hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
- for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
+ for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock));
+ n1 != NULL;
+ n1 = rcu_dereference_protected(n1->next,
+ lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
- neigh_hold(n1);
+ if (want_ref)
+ neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
- n->next = tbl->hash_buckets[hash_val];
- tbl->hash_buckets[hash_val] = n;
n->dead = 0;
- neigh_hold(n);
+ if (want_ref)
+ neigh_hold(n);
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(nht->hash_buckets[hash_val], n);
write_unlock_bh(&tbl->lock);
- NEIGH_PRINTK2("neigh %p is created.\n", n);
+ neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
@@ -449,37 +546,68 @@ out_neigh_release:
neigh_release(n);
goto out;
}
+EXPORT_SYMBOL(__neigh_create);
-struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev, int creat)
+static u32 pneigh_hash(const void *pkey, int key_len)
{
- struct pneigh_entry *n;
- int key_len = tbl->key_len;
u32 hash_val = *(u32 *)(pkey + key_len - 4);
-
hash_val ^= (hash_val >> 16);
hash_val ^= hash_val >> 8;
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
+ return hash_val;
+}
- read_lock_bh(&tbl->lock);
-
- for (n = tbl->phash_buckets[hash_val]; n; n = n->next) {
+static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
+ struct net *net,
+ const void *pkey,
+ int key_len,
+ struct net_device *dev)
+{
+ while (n) {
if (!memcmp(n->key, pkey, key_len) &&
- (n->dev == dev || !n->dev)) {
- read_unlock_bh(&tbl->lock);
- goto out;
- }
+ net_eq(pneigh_net(n), net) &&
+ (n->dev == dev || !n->dev))
+ return n;
+ n = n->next;
}
+ return NULL;
+}
+
+struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey, struct net_device *dev)
+{
+ int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+ net, pkey, key_len, dev);
+}
+EXPORT_SYMBOL_GPL(__pneigh_lookup);
+
+struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey,
+ struct net_device *dev, int creat)
+{
+ struct pneigh_entry *n;
+ int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ read_lock_bh(&tbl->lock);
+ n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+ net, pkey, key_len, dev);
read_unlock_bh(&tbl->lock);
- n = NULL;
- if (!creat)
+
+ if (n || !creat)
goto out;
+ ASSERT_RTNL();
+
n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
if (!n)
goto out;
+ write_pnet(&n->net, hold_net(net));
memcpy(n->key, pkey, key_len);
n->dev = dev;
if (dev)
@@ -488,6 +616,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
if (tbl->pconstructor && tbl->pconstructor(n)) {
if (dev)
dev_put(dev);
+ release_net(net);
kfree(n);
n = NULL;
goto out;
@@ -500,30 +629,28 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
out:
return n;
}
+EXPORT_SYMBOL(pneigh_lookup);
-int pneigh_delete(struct neigh_table *tbl, const void *pkey,
+int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
struct net_device *dev)
{
struct pneigh_entry *n, **np;
int key_len = tbl->key_len;
- u32 hash_val = *(u32 *)(pkey + key_len - 4);
-
- hash_val ^= (hash_val >> 16);
- hash_val ^= hash_val >> 8;
- hash_val ^= hash_val >> 4;
- hash_val &= PNEIGH_HASHMASK;
+ u32 hash_val = pneigh_hash(pkey, key_len);
write_lock_bh(&tbl->lock);
for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
np = &n->next) {
- if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
+ if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
+ net_eq(pneigh_net(n), net)) {
*np = n->next;
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
+ release_net(pneigh_net(n));
kfree(n);
return 0;
}
@@ -546,6 +673,7 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
+ release_net(pneigh_net(n));
kfree(n);
continue;
}
@@ -555,6 +683,13 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
return -ENOENT;
}
+static void neigh_parms_destroy(struct neigh_parms *parms);
+
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+ if (atomic_dec_and_test(&parms->refcnt))
+ neigh_parms_destroy(parms);
+}
/*
* neighbour must already be out of the table;
@@ -562,43 +697,36 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
*/
void neigh_destroy(struct neighbour *neigh)
{
- struct hh_cache *hh;
+ struct net_device *dev = neigh->dev;
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
- printk(KERN_WARNING
- "Destroying alive neighbour %p\n", neigh);
+ pr_warn("Destroying alive neighbour %p\n", neigh);
dump_stack();
return;
}
if (neigh_del_timer(neigh))
- printk(KERN_WARNING "Impossible event.\n");
+ pr_warn("Impossible event\n");
- while ((hh = neigh->hh) != NULL) {
- neigh->hh = hh->hh_next;
- hh->hh_next = NULL;
- write_lock_bh(&hh->hh_lock);
- hh->hh_output = neigh_blackhole;
- write_unlock_bh(&hh->hh_lock);
- if (atomic_dec_and_test(&hh->hh_refcnt))
- kfree(hh);
- }
-
- if (neigh->ops && neigh->ops->destructor)
- (neigh->ops->destructor)(neigh);
+ write_lock_bh(&neigh->lock);
+ __skb_queue_purge(&neigh->arp_queue);
+ write_unlock_bh(&neigh->lock);
+ neigh->arp_queue_len_bytes = 0;
- skb_queue_purge(&neigh->arp_queue);
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(neigh);
- dev_put(neigh->dev);
+ dev_put(dev);
neigh_parms_put(neigh->parms);
- NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
+ neigh_dbg(2, "neigh %p is destroyed\n", neigh);
atomic_dec(&neigh->tbl->entries);
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+ kfree_rcu(neigh, rcu);
}
+EXPORT_SYMBOL(neigh_destroy);
/* Neighbour state is suspicious;
disable fast path.
@@ -607,14 +735,9 @@ void neigh_destroy(struct neighbour *neigh)
*/
static void neigh_suspect(struct neighbour *neigh)
{
- struct hh_cache *hh;
-
- NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->output = neigh->ops->output;
-
- for (hh = neigh->hh; hh; hh = hh->hh_next)
- hh->hh_output = neigh->ops->output;
}
/* Neighbour state is OK;
@@ -624,99 +747,137 @@ static void neigh_suspect(struct neighbour *neigh)
*/
static void neigh_connect(struct neighbour *neigh)
{
- struct hh_cache *hh;
-
- NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
+ neigh_dbg(2, "neigh %p is connected\n", neigh);
neigh->output = neigh->ops->connected_output;
-
- for (hh = neigh->hh; hh; hh = hh->hh_next)
- hh->hh_output = neigh->ops->hh_output;
}
-static void neigh_periodic_timer(unsigned long arg)
+static void neigh_periodic_work(struct work_struct *work)
{
- struct neigh_table *tbl = (struct neigh_table *)arg;
- struct neighbour *n, **np;
- unsigned long expire, now = jiffies;
+ struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+ unsigned int i;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
- write_lock(&tbl->lock);
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
/*
* periodically recompute ReachableTime from random function
*/
- if (time_after(now, tbl->last_rand + 300 * HZ)) {
+ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
- tbl->last_rand = now;
+ tbl->last_rand = jiffies;
for (p = &tbl->parms; p; p = p->next)
p->reachable_time =
- neigh_rand_reach_time(p->base_reachable_time);
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
- np = &tbl->hash_buckets[tbl->hash_chain_gc];
- tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
+ if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ goto out;
+
+ for (i = 0 ; i < (1 << nht->hash_shift); i++) {
+ np = &nht->hash_buckets[i];
- while ((n = *np) != NULL) {
- unsigned int state;
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ unsigned int state;
- write_lock(&n->lock);
+ write_lock(&n->lock);
- state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
- write_unlock(&n->lock);
- goto next_elt;
- }
+ state = n->nud_state;
+ if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ write_unlock(&n->lock);
+ goto next_elt;
+ }
- if (time_before(n->used, n->confirmed))
- n->used = n->confirmed;
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
- if (atomic_read(&n->refcnt) == 1 &&
- (state == NUD_FAILED ||
- time_after(now, n->used + n->parms->gc_staletime))) {
- *np = n->next;
- n->dead = 1;
+ if (atomic_read(&n->refcnt) == 1 &&
+ (state == NUD_FAILED ||
+ time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ *np = n->next;
+ n->dead = 1;
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ continue;
+ }
write_unlock(&n->lock);
- neigh_release(n);
- continue;
- }
- write_unlock(&n->lock);
next_elt:
- np = &n->next;
+ np = &n->next;
+ }
+ /*
+ * It's fine to release lock here, even if hash table
+ * grows while we are preempted.
+ */
+ write_unlock_bh(&tbl->lock);
+ cond_resched();
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
}
-
- /* Cycle through all hash buckets every base_reachable_time/2 ticks.
- * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
- * base_reachable_time.
+out:
+ /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
+ * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
+ * BASE_REACHABLE_TIME.
*/
- expire = tbl->parms.base_reachable_time >> 1;
- expire /= (tbl->hash_mask + 1);
- if (!expire)
- expire = 1;
-
- mod_timer(&tbl->gc_timer, now + expire);
-
- write_unlock(&tbl->lock);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
+ write_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- return (n->nud_state & NUD_PROBE ?
- p->ucast_probes :
- p->ucast_probes + p->app_probes + p->mcast_probes);
+ int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES);
+ if (!(n->nud_state & NUD_PROBE))
+ max_probes += NEIGH_VAR(p, MCAST_PROBES);
+ return max_probes;
}
-static inline void neigh_add_timer(struct neighbour *n, unsigned long when)
+static void neigh_invalidate(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
{
- if (unlikely(mod_timer(&n->timer, when))) {
- printk("NEIGH: BUG, double timer add, state is %x\n",
- n->nud_state);
- dump_stack();
+ struct sk_buff *skb;
+
+ NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
+ neigh_dbg(2, "neigh %p is failed\n", neigh);
+ neigh->updated = jiffies;
+
+ /* It is very thin place. report_unreachable is very complicated
+ routine. Particularly, it can hit the same neighbour entry!
+
+ So that, we try to be accurate and avoid dead loop. --ANK
+ */
+ while (neigh->nud_state == NUD_FAILED &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ write_unlock(&neigh->lock);
+ neigh->ops->error_report(neigh, skb);
+ write_lock(&neigh->lock);
}
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
+static void neigh_probe(struct neighbour *neigh)
+ __releases(neigh->lock)
+{
+ struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_copy(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ neigh->ops->solicit(neigh, skb);
+ atomic_inc(&neigh->probes);
+ kfree_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
@@ -725,7 +886,7 @@ static void neigh_timer_handler(unsigned long arg)
{
unsigned long now, next;
struct neighbour *neigh = (struct neighbour *)arg;
- unsigned state;
+ unsigned int state;
int notify = 0;
write_lock(&neigh->lock);
@@ -734,68 +895,57 @@ static void neigh_timer_handler(unsigned long arg)
now = jiffies;
next = now + HZ;
- if (!(state & NUD_IN_TIMER)) {
-#ifndef CONFIG_SMP
- printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
-#endif
+ if (!(state & NUD_IN_TIMER))
goto out;
- }
if (state & NUD_REACHABLE) {
- if (time_before_eq(now,
+ if (time_before_eq(now,
neigh->confirmed + neigh->parms->reachable_time)) {
- NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
+ neigh_dbg(2, "neigh %p is still alive\n", neigh);
next = neigh->confirmed + neigh->parms->reachable_time;
} else if (time_before_eq(now,
- neigh->used + neigh->parms->delay_probe_time)) {
- NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+ neigh->used +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
+ neigh->updated = jiffies;
neigh_suspect(neigh);
- next = now + neigh->parms->delay_probe_time;
+ next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
} else {
- NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->nud_state = NUD_STALE;
+ neigh->updated = jiffies;
neigh_suspect(neigh);
+ notify = 1;
}
} else if (state & NUD_DELAY) {
- if (time_before_eq(now,
- neigh->confirmed + neigh->parms->delay_probe_time)) {
- NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh);
+ if (time_before_eq(now,
+ neigh->confirmed +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is now reachable\n", neigh);
neigh->nud_state = NUD_REACHABLE;
+ neigh->updated = jiffies;
neigh_connect(neigh);
+ notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
- NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
+ neigh_dbg(2, "neigh %p is probed\n", neigh);
neigh->nud_state = NUD_PROBE;
+ neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
- next = now + neigh->parms->retrans_time;
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + neigh->parms->retrans_time;
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
- struct sk_buff *skb;
-
neigh->nud_state = NUD_FAILED;
notify = 1;
- NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
- NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
-
- /* It is very thin place. report_unreachable is very complicated
- routine. Particularly, it can hit the same neighbour entry!
-
- So that, we try to be accurate and avoid dead loop. --ANK
- */
- while (neigh->nud_state == NUD_FAILED &&
- (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- write_unlock(&neigh->lock);
- neigh->ops->error_report(neigh, skb);
- write_lock(&neigh->lock);
- }
- skb_queue_purge(&neigh->arp_queue);
+ neigh_invalidate(neigh);
+ goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
@@ -805,31 +955,22 @@ static void neigh_timer_handler(unsigned long arg)
neigh_hold(neigh);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
- struct sk_buff *skb = skb_peek(&neigh->arp_queue);
- /* keep skb alive even if arp_queue overflows */
- if (skb)
- skb_get(skb);
- write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb);
- atomic_inc(&neigh->probes);
- if (skb)
- kfree_skb(skb);
+ neigh_probe(neigh);
} else {
out:
write_unlock(&neigh->lock);
}
-#ifdef CONFIG_ARPD
- if (notify && neigh->parms->app_probes)
- neigh_app_notify(neigh);
-#endif
+ if (notify)
+ neigh_update_notify(neigh);
+
neigh_release(neigh);
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
- unsigned long now;
+ bool immediate_probe = false;
write_lock_bh(&neigh->lock);
@@ -837,59 +978,79 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
- now = jiffies;
-
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
- if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
- atomic_set(&neigh->probes, neigh->parms->ucast_probes);
+ if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
+ NEIGH_VAR(neigh->parms, APP_PROBES)) {
+ unsigned long next, now = jiffies;
+
+ atomic_set(&neigh->probes,
+ NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh->nud_state = NUD_INCOMPLETE;
- neigh_hold(neigh);
- neigh_add_timer(neigh, now + 1);
+ neigh->updated = now;
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/2);
+ neigh_add_timer(neigh, next);
+ immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
+ neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
- NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
- neigh_hold(neigh);
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
- neigh_add_timer(neigh,
- jiffies + neigh->parms->delay_probe_time);
+ neigh->updated = jiffies;
+ neigh_add_timer(neigh, jiffies +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
- if (skb_queue_len(&neigh->arp_queue) >=
- neigh->parms->queue_len) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
struct sk_buff *buff;
- buff = neigh->arp_queue.next;
- __skb_unlink(buff, &neigh->arp_queue);
+
+ buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
+ NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
+ skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
- write_unlock_bh(&neigh->lock);
+ if (immediate_probe)
+ neigh_probe(neigh);
+ else
+ write_unlock(&neigh->lock);
+ local_bh_enable();
return rc;
}
+EXPORT_SYMBOL(__neigh_event_send);
-static __inline__ void neigh_update_hhs(struct neighbour *neigh)
+static void neigh_update_hhs(struct neighbour *neigh)
{
struct hh_cache *hh;
- void (*update)(struct hh_cache*, struct net_device*, unsigned char *) =
- neigh->dev->header_cache_update;
+ void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
+ = NULL;
+
+ if (neigh->dev->header_ops)
+ update = neigh->dev->header_ops->cache_update;
if (update) {
- for (hh = neigh->hh; hh; hh = hh->hh_next) {
- write_lock_bh(&hh->hh_lock);
+ hh = &neigh->hh;
+ if (hh->hh_len) {
+ write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
- write_unlock_bh(&hh->hh_lock);
+ write_sequnlock_bh(&hh->hh_lock);
}
}
}
@@ -903,13 +1064,13 @@ static __inline__ void neigh_update_hhs(struct neighbour *neigh)
NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
if it is different.
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
- lladdr instead of overriding it
+ lladdr instead of overriding it
if it is different.
It also allows to retain current state
if lladdr is unchanged.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router.
@@ -922,9 +1083,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
{
u8 old;
int err;
-#ifdef CONFIG_ARPD
int notify = 0;
-#endif
struct net_device *dev;
int update_isrouter = 0;
@@ -934,7 +1093,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
old = neigh->nud_state;
err = -EPERM;
- if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
@@ -944,9 +1103,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh_suspect(neigh);
neigh->nud_state = new;
err = 0;
-#ifdef CONFIG_ARPD
notify = old & NUD_VALID;
-#endif
+ if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ (new & NUD_FAILED)) {
+ neigh_invalidate(neigh);
+ notify = 1;
+ }
goto out;
}
@@ -960,7 +1122,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
- compare new & old
- if they are different, check override flag
*/
- if ((old & NUD_VALID) &&
+ if ((old & NUD_VALID) &&
!memcmp(lladdr, neigh->ha, dev->addr_len))
lladdr = neigh->ha;
} else {
@@ -1002,25 +1164,24 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (new != old) {
neigh_del_timer(neigh);
- if (new & NUD_IN_TIMER) {
- neigh_hold(neigh);
- neigh_add_timer(neigh, (jiffies +
- ((new & NUD_REACHABLE) ?
+ if (new & NUD_IN_TIMER)
+ neigh_add_timer(neigh, (jiffies +
+ ((new & NUD_REACHABLE) ?
neigh->parms->reachable_time :
0)));
- }
neigh->nud_state = new;
+ notify = 1;
}
if (lladdr != neigh->ha) {
+ write_seqlock(&neigh->ha_lock);
memcpy(&neigh->ha, lladdr, dev->addr_len);
+ write_sequnlock(&neigh->ha_lock);
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
- (neigh->parms->base_reachable_time << 1);
-#ifdef CONFIG_ARPD
+ (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
notify = 1;
-#endif
}
if (new == old)
goto out;
@@ -1035,15 +1196,34 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- struct neighbour *n1 = neigh;
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
write_unlock_bh(&neigh->lock);
- /* On shaper/eql skb->dst->neighbour != neigh :( */
- if (skb->dst && skb->dst->neighbour)
- n1 = skb->dst->neighbour;
- n1->output(skb);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ n1->output(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
write_lock_bh(&neigh->lock);
}
- skb_queue_purge(&neigh->arp_queue);
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
@@ -1052,12 +1232,28 @@ out:
(neigh->flags & ~NTF_ROUTER);
}
write_unlock_bh(&neigh->lock);
-#ifdef CONFIG_ARPD
- if (notify && neigh->parms->app_probes)
- neigh_app_notify(neigh);
-#endif
+
+ if (notify)
+ neigh_update_notify(neigh);
+
return err;
}
+EXPORT_SYMBOL(neigh_update);
+
+/* Update the neigh to listen temporarily for probe responses, even if it is
+ * in a NUD_FAILED state. The caller has to hold neigh->lock for writing.
+ */
+void __neigh_set_probe_once(struct neighbour *neigh)
+{
+ neigh->updated = jiffies;
+ if (!(neigh->nud_state & NUD_FAILED))
+ return;
+ neigh->nud_state = NUD_INCOMPLETE;
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
+ neigh_add_timer(neigh,
+ jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+}
+EXPORT_SYMBOL(__neigh_set_probe_once);
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
u8 *lladdr, void *saddr,
@@ -1066,158 +1262,145 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
lladdr || !dev->addr_len);
if (neigh)
- neigh_update(neigh, lladdr, NUD_STALE,
+ neigh_update(neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_OVERRIDE);
return neigh;
}
+EXPORT_SYMBOL(neigh_event_ns);
-static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
- u16 protocol)
+/* called with read_lock_bh(&n->lock); */
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
{
- struct hh_cache *hh;
struct net_device *dev = dst->dev;
+ __be16 prot = dst->ops->protocol;
+ struct hh_cache *hh = &n->hh;
- for (hh = n->hh; hh; hh = hh->hh_next)
- if (hh->hh_type == protocol)
- break;
+ write_lock_bh(&n->lock);
- if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
- memset(hh, 0, sizeof(struct hh_cache));
- rwlock_init(&hh->hh_lock);
- hh->hh_type = protocol;
- atomic_set(&hh->hh_refcnt, 0);
- hh->hh_next = NULL;
- if (dev->hard_header_cache(n, hh)) {
- kfree(hh);
- hh = NULL;
- } else {
- atomic_inc(&hh->hh_refcnt);
- hh->hh_next = n->hh;
- n->hh = hh;
- if (n->nud_state & NUD_CONNECTED)
- hh->hh_output = n->ops->hh_output;
- else
- hh->hh_output = n->ops->output;
- }
- }
- if (hh) {
- atomic_inc(&hh->hh_refcnt);
- dst->hh = hh;
- }
+ /* Only one thread can come in here and initialize the
+ * hh_cache entry.
+ */
+ if (!hh->hh_len)
+ dev->header_ops->cache(n, hh, prot);
+
+ write_unlock_bh(&n->lock);
}
/* This function can be used in contexts, where only old dev_queue_xmit
- worked, f.e. if you want to override normal output path (eql, shaper),
- but resolution is not made yet.
+ * worked, f.e. if you want to override normal output path (eql, shaper),
+ * but resolution is not made yet.
*/
-int neigh_compat_output(struct sk_buff *skb)
+int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- __skb_pull(skb, skb->nh.raw - skb->data);
+ __skb_pull(skb, skb_network_offset(skb));
- if (dev->hard_header &&
- dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
- skb->len) < 0 &&
- dev->rebuild_header(skb))
+ if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
+ skb->len) < 0 &&
+ dev_rebuild_header(skb))
return 0;
return dev_queue_xmit(skb);
}
+EXPORT_SYMBOL(neigh_compat_output);
/* Slow and careful. */
-int neigh_resolve_output(struct sk_buff *skb)
+int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh;
+ struct dst_entry *dst = skb_dst(skb);
int rc = 0;
- if (!dst || !(neigh = dst->neighbour))
+ if (!dst)
goto discard;
- __skb_pull(skb, skb->nh.raw - skb->data);
-
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
- if (dev->hard_header_cache && !dst->hh) {
- write_lock_bh(&neigh->lock);
- if (!dst->hh)
- neigh_hh_init(neigh, dst, dst->ops->protocol);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- write_unlock_bh(&neigh->lock);
- } else {
- read_lock_bh(&neigh->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
- }
+ unsigned int seq;
+
+ if (dev->header_ops->cache && !neigh->hh.hh_len)
+ neigh_hh_init(neigh, dst);
+
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
if (err >= 0)
- rc = neigh->ops->queue_xmit(skb);
+ rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
discard:
- NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
- dst, dst ? dst->neighbour : NULL);
+ neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh);
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
+EXPORT_SYMBOL(neigh_resolve_output);
/* As fast as possible without hh cache */
-int neigh_connected_output(struct sk_buff *skb)
+int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
{
- int err;
- struct dst_entry *dst = skb->dst;
- struct neighbour *neigh = dst->neighbour;
struct net_device *dev = neigh->dev;
+ unsigned int seq;
+ int err;
- __skb_pull(skb, skb->nh.raw - skb->data);
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
- read_lock_bh(&neigh->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
if (err >= 0)
- err = neigh->ops->queue_xmit(skb);
+ err = dev_queue_xmit(skb);
else {
err = -EINVAL;
kfree_skb(skb);
}
return err;
}
+EXPORT_SYMBOL(neigh_connected_output);
+
+int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_direct_output);
static void neigh_proxy_process(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table *)arg;
long sched_next = 0;
unsigned long now = jiffies;
- struct sk_buff *skb;
+ struct sk_buff *skb, *n;
spin_lock(&tbl->proxy_queue.lock);
- skb = tbl->proxy_queue.next;
-
- while (skb != (struct sk_buff *)&tbl->proxy_queue) {
- struct sk_buff *back = skb;
- long tdif = NEIGH_CB(back)->sched_next - now;
+ skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+ long tdif = NEIGH_CB(skb)->sched_next - now;
- skb = skb->next;
if (tdif <= 0) {
- struct net_device *dev = back->dev;
- __skb_unlink(back, &tbl->proxy_queue);
- if (tbl->proxy_redo && netif_running(dev))
- tbl->proxy_redo(back);
- else
- kfree_skb(back);
+ struct net_device *dev = skb->dev;
+
+ __skb_unlink(skb, &tbl->proxy_queue);
+ if (tbl->proxy_redo && netif_running(dev)) {
+ rcu_read_lock();
+ tbl->proxy_redo(skb);
+ rcu_read_unlock();
+ } else {
+ kfree_skb(skb);
+ }
dev_put(dev);
} else if (!sched_next || tdif < sched_next)
@@ -1233,9 +1416,11 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
unsigned long now = jiffies;
- unsigned long sched_next = now + (net_random() % p->proxy_delay);
- if (tbl->proxy_queue.qlen > p->proxy_qlen) {
+ unsigned long sched_next = now + (prandom_u32() %
+ NEIGH_VAR(p, PROXY_DELAY));
+
+ if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
@@ -1248,44 +1433,63 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
- dst_release(skb->dst);
- skb->dst = NULL;
+ skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
+EXPORT_SYMBOL(pneigh_enqueue);
+
+static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
+ struct net *net, int ifindex)
+{
+ struct neigh_parms *p;
+
+ for (p = &tbl->parms; p; p = p->next) {
+ if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
+ (!p->dev && !ifindex && net_eq(net, &init_net)))
+ return p;
+ }
+ return NULL;
+}
struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
struct neigh_table *tbl)
{
- struct neigh_parms *p = kmalloc(sizeof(*p), GFP_KERNEL);
+ struct neigh_parms *p;
+ struct net *net = dev_net(dev);
+ const struct net_device_ops *ops = dev->netdev_ops;
+ p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
- memcpy(p, &tbl->parms, sizeof(*p));
p->tbl = tbl;
atomic_set(&p->refcnt, 1);
- INIT_RCU_HEAD(&p->rcu_head);
p->reachable_time =
- neigh_rand_reach_time(p->base_reachable_time);
- if (dev) {
- if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
- kfree(p);
- return NULL;
- }
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ dev_hold(dev);
+ p->dev = dev;
+ write_pnet(&p->net, hold_net(net));
+ p->sysctl_table = NULL;
- dev_hold(dev);
- p->dev = dev;
+ if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+ release_net(net);
+ dev_put(dev);
+ kfree(p);
+ return NULL;
}
- p->sysctl_table = NULL;
+
write_lock_bh(&tbl->lock);
p->next = tbl->parms.next;
tbl->parms.next = p;
write_unlock_bh(&tbl->lock);
+
+ neigh_parms_data_state_cleanall(p);
}
return p;
}
+EXPORT_SYMBOL(neigh_parms_alloc);
static void neigh_rcu_free_parms(struct rcu_head *head)
{
@@ -1314,90 +1518,97 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
}
}
write_unlock_bh(&tbl->lock);
- NEIGH_PRINTK1("neigh_parms_release: not found\n");
+ neigh_dbg(1, "%s: not found\n", __func__);
}
+EXPORT_SYMBOL(neigh_parms_release);
-void neigh_parms_destroy(struct neigh_parms *parms)
+static void neigh_parms_destroy(struct neigh_parms *parms)
{
+ release_net(neigh_parms_net(parms));
kfree(parms);
}
+static struct lock_class_key neigh_table_proxy_queue_class;
-void neigh_table_init(struct neigh_table *tbl)
+static void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
+ write_pnet(&tbl->parms.net, &init_net);
atomic_set(&tbl->parms.refcnt, 1);
- INIT_RCU_HEAD(&tbl->parms.rcu_head);
tbl->parms.reachable_time =
- neigh_rand_reach_time(tbl->parms.base_reachable_time);
-
- if (!tbl->kmem_cachep)
- tbl->kmem_cachep = kmem_cache_create(tbl->id,
- tbl->entry_size,
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
-
- if (!tbl->kmem_cachep)
- panic("cannot create neighbour cache");
+ neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
-
+
#ifdef CONFIG_PROC_FS
- tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat);
- if (!tbl->pde)
+ if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
+ &neigh_stat_seq_fops, tbl))
panic("cannot create neighbour proc dir entry");
- tbl->pde->proc_fops = &neigh_stat_seq_fops;
- tbl->pde->data = tbl;
#endif
- tbl->hash_mask = 1;
- tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
- tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL);
+ tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
- if (!tbl->hash_buckets || !tbl->phash_buckets)
+ if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
- memset(tbl->phash_buckets, 0, phsize);
-
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ if (!tbl->entry_size)
+ tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
+ tbl->key_len, NEIGH_PRIV_ALIGN);
+ else
+ WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
rwlock_init(&tbl->lock);
- init_timer(&tbl->gc_timer);
- tbl->gc_timer.data = (unsigned long)tbl;
- tbl->gc_timer.function = neigh_periodic_timer;
- tbl->gc_timer.expires = now + 1;
- add_timer(&tbl->gc_timer);
-
- init_timer(&tbl->proxy_timer);
- tbl->proxy_timer.data = (unsigned long)tbl;
- tbl->proxy_timer.function = neigh_proxy_process;
- skb_queue_head_init(&tbl->proxy_queue);
+ INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ tbl->parms.reachable_time);
+ setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
+ skb_queue_head_init_class(&tbl->proxy_queue,
+ &neigh_table_proxy_queue_class);
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
+}
+
+void neigh_table_init(struct neigh_table *tbl)
+{
+ struct neigh_table *tmp;
+
+ neigh_table_init_no_netlink(tbl);
write_lock(&neigh_tbl_lock);
+ for (tmp = neigh_tables; tmp; tmp = tmp->next) {
+ if (tmp->family == tbl->family)
+ break;
+ }
tbl->next = neigh_tables;
neigh_tables = tbl;
write_unlock(&neigh_tbl_lock);
+
+ if (unlikely(tmp)) {
+ pr_err("Registering multiple tables for family %d\n",
+ tbl->family);
+ dump_stack();
+ }
}
+EXPORT_SYMBOL(neigh_table_init);
int neigh_table_clear(struct neigh_table *tbl)
{
struct neigh_table **tp;
/* It is not clean... Fix it to unload IPv6 module safely */
- del_timer_sync(&tbl->gc_timer);
+ cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
- printk(KERN_CRIT "neighbour leakage\n");
+ pr_crit("neighbour leakage\n");
write_lock(&neigh_tbl_lock);
for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
if (*tp == tbl) {
@@ -1407,216 +1618,274 @@ int neigh_table_clear(struct neigh_table *tbl)
}
write_unlock(&neigh_tbl_lock);
- neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
- tbl->hash_buckets = NULL;
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
+ tbl->nht = NULL;
kfree(tbl->phash_buckets);
tbl->phash_buckets = NULL;
+ remove_proc_entry(tbl->id, init_net.proc_net_stat);
+
+ free_percpu(tbl->stats);
+ tbl->stats = NULL;
+
return 0;
}
+EXPORT_SYMBOL(neigh_table_clear);
-int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct ndmsg *ndm = NLMSG_DATA(nlh);
- struct rtattr **nda = arg;
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *dst_attr;
struct neigh_table *tbl;
struct net_device *dev = NULL;
- int err = -ENODEV;
+ int err = -EINVAL;
- if (ndm->ndm_ifindex &&
- (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL)
+ ASSERT_RTNL();
+ if (nlmsg_len(nlh) < sizeof(*ndm))
goto out;
+ dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
+ if (dst_attr == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+ }
+
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- struct rtattr *dst_attr = nda[NDA_DST - 1];
- struct neighbour *n;
+ struct neighbour *neigh;
if (tbl->family != ndm->ndm_family)
continue;
read_unlock(&neigh_tbl_lock);
- err = -EINVAL;
- if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len)
- goto out_dev_put;
+ if (nla_len(dst_attr) < tbl->key_len)
+ goto out;
if (ndm->ndm_flags & NTF_PROXY) {
- err = pneigh_delete(tbl, RTA_DATA(dst_attr), dev);
- goto out_dev_put;
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out;
}
- if (!dev)
+ if (dev == NULL)
goto out;
- n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev);
- if (n) {
- err = neigh_update(n, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_ADMIN);
- neigh_release(n);
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
+ goto out;
}
- goto out_dev_put;
+
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ goto out;
}
read_unlock(&neigh_tbl_lock);
- err = -EADDRNOTAVAIL;
-out_dev_put:
- if (dev)
- dev_put(dev);
+ err = -EAFNOSUPPORT;
+
out:
return err;
}
-int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct ndmsg *ndm = NLMSG_DATA(nlh);
- struct rtattr **nda = arg;
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
struct neigh_table *tbl;
struct net_device *dev = NULL;
- int err = -ENODEV;
+ int err;
+
+ ASSERT_RTNL();
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ goto out;
- if (ndm->ndm_ifindex &&
- (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL)
+ err = -EINVAL;
+ if (tb[NDA_DST] == NULL)
goto out;
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+ goto out;
+ }
+
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables; tbl; tbl = tbl->next) {
- struct rtattr *lladdr_attr = nda[NDA_LLADDR - 1];
- struct rtattr *dst_attr = nda[NDA_DST - 1];
- int override = 1;
- struct neighbour *n;
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
if (tbl->family != ndm->ndm_family)
continue;
read_unlock(&neigh_tbl_lock);
- err = -EINVAL;
- if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len)
- goto out_dev_put;
+ if (nla_len(tb[NDA_DST]) < tbl->key_len)
+ goto out;
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
err = -ENOBUFS;
- if (pneigh_lookup(tbl, RTA_DATA(dst_attr), dev, 1))
+ pn = pneigh_lookup(tbl, net, dst, dev, 1);
+ if (pn) {
+ pn->flags = ndm->ndm_flags;
err = 0;
- goto out_dev_put;
+ }
+ goto out;
}
- err = -EINVAL;
- if (!dev)
+ if (dev == NULL)
goto out;
- if (lladdr_attr && RTA_PAYLOAD(lladdr_attr) < dev->addr_len)
- goto out_dev_put;
-
- n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev);
- if (n) {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
- err = -EEXIST;
- neigh_release(n);
- goto out_dev_put;
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ neigh = __neigh_lookup_errno(tbl, dst, dev);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out;
}
-
- override = nlh->nlmsg_flags & NLM_F_REPLACE;
- } else if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- err = -ENOENT;
- goto out_dev_put;
} else {
- n = __neigh_lookup_errno(tbl, RTA_DATA(dst_attr), dev);
- if (IS_ERR(n)) {
- err = PTR_ERR(n);
- goto out_dev_put;
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
+ goto out;
}
- }
- err = neigh_update(n,
- lladdr_attr ? RTA_DATA(lladdr_attr) : NULL,
- ndm->ndm_state,
- (override ? NEIGH_UPDATE_F_OVERRIDE : 0) |
- NEIGH_UPDATE_F_ADMIN);
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ }
- neigh_release(n);
- goto out_dev_put;
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ neigh_release(neigh);
+ goto out;
}
read_unlock(&neigh_tbl_lock);
- err = -EADDRNOTAVAIL;
-out_dev_put:
- if (dev)
- dev_put(dev);
+ err = -EAFNOSUPPORT;
out:
return err;
}
static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
{
- struct rtattr *nest = NULL;
-
- nest = RTA_NEST(skb, NDTA_PARMS);
+ struct nlattr *nest;
- if (parms->dev)
- RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
-
- RTA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
- RTA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
- RTA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
- RTA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
- RTA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
- RTA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
- RTA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
- RTA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
- parms->base_reachable_time);
- RTA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
- RTA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
- RTA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
- RTA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
- RTA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
- RTA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
-
- return RTA_NEST_END(skb, nest);
-
-rtattr_failure:
- return RTA_NEST_CANCEL(skb, nest);
-}
+ nest = nla_nest_start(skb, NDTA_PARMS);
+ if (nest == NULL)
+ return -ENOBUFS;
-static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
- struct netlink_callback *cb)
+ if ((parms->dev &&
+ nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
+ nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
+ /* approximative value for deprecated QUEUE_LEN (in packets) */
+ nla_put_u32(skb, NDTPA_QUEUE_LEN,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
+ nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) ||
+ nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) ||
+ nla_put_u32(skb, NDTPA_UCAST_PROBES,
+ NEIGH_VAR(parms, UCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_PROBES,
+ NEIGH_VAR(parms, MCAST_PROBES)) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
+ nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME)) ||
+ nla_put_msecs(skb, NDTPA_GC_STALETIME,
+ NEIGH_VAR(parms, GC_STALETIME)) ||
+ nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
+ NEIGH_VAR(parms, DELAY_PROBE_TIME)) ||
+ nla_put_msecs(skb, NDTPA_RETRANS_TIME,
+ NEIGH_VAR(parms, RETRANS_TIME)) ||
+ nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
+ NEIGH_VAR(parms, ANYCAST_DELAY)) ||
+ nla_put_msecs(skb, NDTPA_PROXY_DELAY,
+ NEIGH_VAR(parms, PROXY_DELAY)) ||
+ nla_put_msecs(skb, NDTPA_LOCKTIME,
+ NEIGH_VAR(parms, LOCKTIME)))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
+ u32 pid, u32 seq, int type, int flags)
{
struct nlmsghdr *nlh;
struct ndtmsg *ndtmsg;
- nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
- NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
- ndtmsg = NLMSG_DATA(nlh);
+ ndtmsg = nlmsg_data(nlh);
read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
- RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
- RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
- RTA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
- RTA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
- RTA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
-
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) ||
+ nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
+ nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
+ nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
+ goto nla_put_failure;
{
unsigned long now = jiffies;
unsigned int flush_delta = now - tbl->last_flush;
unsigned int rand_delta = now - tbl->last_rand;
-
+ struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
.ndtc_entry_size = tbl->entry_size,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
- .ndtc_hash_rnd = tbl->hash_rnd,
- .ndtc_hash_mask = tbl->hash_mask,
- .ndtc_hash_chain_gc = tbl->hash_chain_gc,
.ndtc_proxy_qlen = tbl->proxy_queue.qlen,
};
- RTA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
+ ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
+ rcu_read_unlock_bh();
+
+ if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
+ goto nla_put_failure;
}
{
@@ -1625,7 +1894,7 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
memset(&ndst, 0, sizeof(ndst));
- for_each_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct neigh_statistics *st;
st = per_cpu_ptr(tbl->stats, cpu);
@@ -1641,312 +1910,477 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
ndst.ndts_forced_gc_runs += st->forced_gc_runs;
}
- RTA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+ if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
+ goto nla_put_failure;
}
BUG_ON(tbl->parms.dev);
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
- goto rtattr_failure;
+ goto nla_put_failure;
read_unlock_bh(&tbl->lock);
- return NLMSG_END(skb, nlh);
+ return nlmsg_end(skb, nlh);
-rtattr_failure:
+nla_put_failure:
read_unlock_bh(&tbl->lock);
- return NLMSG_CANCEL(skb, nlh);
-
-nlmsg_failure:
- return -1;
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-static int neightbl_fill_param_info(struct neigh_table *tbl,
+static int neightbl_fill_param_info(struct sk_buff *skb,
+ struct neigh_table *tbl,
struct neigh_parms *parms,
- struct sk_buff *skb,
- struct netlink_callback *cb)
+ u32 pid, u32 seq, int type,
+ unsigned int flags)
{
struct ndtmsg *ndtmsg;
struct nlmsghdr *nlh;
- nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
- NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
- ndtmsg = NLMSG_DATA(nlh);
+ ndtmsg = nlmsg_data(nlh);
read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
- RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
- if (neightbl_fill_parms(skb, parms) < 0)
- goto rtattr_failure;
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
+ neightbl_fill_parms(skb, parms) < 0)
+ goto errout;
read_unlock_bh(&tbl->lock);
- return NLMSG_END(skb, nlh);
-
-rtattr_failure:
+ return nlmsg_end(skb, nlh);
+errout:
read_unlock_bh(&tbl->lock);
- return NLMSG_CANCEL(skb, nlh);
-
-nlmsg_failure:
- return -1;
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-
-static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
- int ifindex)
-{
- struct neigh_parms *p;
-
- for (p = &tbl->parms; p; p = p->next)
- if ((p->dev && p->dev->ifindex == ifindex) ||
- (!p->dev && !ifindex))
- return p;
- return NULL;
-}
+static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
+ [NDTA_NAME] = { .type = NLA_STRING },
+ [NDTA_THRESH1] = { .type = NLA_U32 },
+ [NDTA_THRESH2] = { .type = NLA_U32 },
+ [NDTA_THRESH3] = { .type = NLA_U32 },
+ [NDTA_GC_INTERVAL] = { .type = NLA_U64 },
+ [NDTA_PARMS] = { .type = NLA_NESTED },
+};
-int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
+ [NDTPA_IFINDEX] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
+ [NDTPA_APP_PROBES] = { .type = NLA_U32 },
+ [NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
+ [NDTPA_GC_STALETIME] = { .type = NLA_U64 },
+ [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
+ [NDTPA_RETRANS_TIME] = { .type = NLA_U64 },
+ [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
+ [NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
+ [NDTPA_LOCKTIME] = { .type = NLA_U64 },
+};
+
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
{
+ struct net *net = sock_net(skb->sk);
struct neigh_table *tbl;
- struct ndtmsg *ndtmsg = NLMSG_DATA(nlh);
- struct rtattr **tb = arg;
- int err = -EINVAL;
+ struct ndtmsg *ndtmsg;
+ struct nlattr *tb[NDTA_MAX+1];
+ int err;
- if (!tb[NDTA_NAME - 1] || !RTA_PAYLOAD(tb[NDTA_NAME - 1]))
- return -EINVAL;
+ err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+ nl_neightbl_policy);
+ if (err < 0)
+ goto errout;
+
+ if (tb[NDTA_NAME] == NULL) {
+ err = -EINVAL;
+ goto errout;
+ }
+ ndtmsg = nlmsg_data(nlh);
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables; tbl; tbl = tbl->next) {
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
- if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id))
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
break;
}
if (tbl == NULL) {
err = -ENOENT;
- goto errout;
+ goto errout_locked;
}
- /*
+ /*
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
write_lock_bh(&tbl->lock);
- if (tb[NDTA_THRESH1 - 1])
- tbl->gc_thresh1 = RTA_GET_U32(tb[NDTA_THRESH1 - 1]);
-
- if (tb[NDTA_THRESH2 - 1])
- tbl->gc_thresh2 = RTA_GET_U32(tb[NDTA_THRESH2 - 1]);
-
- if (tb[NDTA_THRESH3 - 1])
- tbl->gc_thresh3 = RTA_GET_U32(tb[NDTA_THRESH3 - 1]);
-
- if (tb[NDTA_GC_INTERVAL - 1])
- tbl->gc_interval = RTA_GET_MSECS(tb[NDTA_GC_INTERVAL - 1]);
-
- if (tb[NDTA_PARMS - 1]) {
- struct rtattr *tbp[NDTPA_MAX];
+ if (tb[NDTA_PARMS]) {
+ struct nlattr *tbp[NDTPA_MAX+1];
struct neigh_parms *p;
- u32 ifindex = 0;
+ int i, ifindex = 0;
- if (rtattr_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS - 1]) < 0)
- goto rtattr_failure;
+ err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
+ nl_ntbl_parm_policy);
+ if (err < 0)
+ goto errout_tbl_lock;
- if (tbp[NDTPA_IFINDEX - 1])
- ifindex = RTA_GET_U32(tbp[NDTPA_IFINDEX - 1]);
+ if (tbp[NDTPA_IFINDEX])
+ ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
- p = lookup_neigh_params(tbl, ifindex);
+ p = lookup_neigh_parms(tbl, net, ifindex);
if (p == NULL) {
err = -ENOENT;
- goto rtattr_failure;
+ goto errout_tbl_lock;
}
-
- if (tbp[NDTPA_QUEUE_LEN - 1])
- p->queue_len = RTA_GET_U32(tbp[NDTPA_QUEUE_LEN - 1]);
-
- if (tbp[NDTPA_PROXY_QLEN - 1])
- p->proxy_qlen = RTA_GET_U32(tbp[NDTPA_PROXY_QLEN - 1]);
-
- if (tbp[NDTPA_APP_PROBES - 1])
- p->app_probes = RTA_GET_U32(tbp[NDTPA_APP_PROBES - 1]);
- if (tbp[NDTPA_UCAST_PROBES - 1])
- p->ucast_probes =
- RTA_GET_U32(tbp[NDTPA_UCAST_PROBES - 1]);
-
- if (tbp[NDTPA_MCAST_PROBES - 1])
- p->mcast_probes =
- RTA_GET_U32(tbp[NDTPA_MCAST_PROBES - 1]);
-
- if (tbp[NDTPA_BASE_REACHABLE_TIME - 1])
- p->base_reachable_time =
- RTA_GET_MSECS(tbp[NDTPA_BASE_REACHABLE_TIME - 1]);
+ for (i = 1; i <= NDTPA_MAX; i++) {
+ if (tbp[i] == NULL)
+ continue;
- if (tbp[NDTPA_GC_STALETIME - 1])
- p->gc_staletime =
- RTA_GET_MSECS(tbp[NDTPA_GC_STALETIME - 1]);
+ switch (i) {
+ case NDTPA_QUEUE_LEN:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN));
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_PROXY_QLEN:
+ NEIGH_VAR_SET(p, PROXY_QLEN,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_APP_PROBES:
+ NEIGH_VAR_SET(p, APP_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_UCAST_PROBES:
+ NEIGH_VAR_SET(p, UCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_MCAST_PROBES:
+ NEIGH_VAR_SET(p, MCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_BASE_REACHABLE_TIME:
+ NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_GC_STALETIME:
+ NEIGH_VAR_SET(p, GC_STALETIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_DELAY_PROBE_TIME:
+ NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_RETRANS_TIME:
+ NEIGH_VAR_SET(p, RETRANS_TIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_ANYCAST_DELAY:
+ NEIGH_VAR_SET(p, ANYCAST_DELAY,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_PROXY_DELAY:
+ NEIGH_VAR_SET(p, PROXY_DELAY,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_LOCKTIME:
+ NEIGH_VAR_SET(p, LOCKTIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ }
+ }
+ }
- if (tbp[NDTPA_DELAY_PROBE_TIME - 1])
- p->delay_probe_time =
- RTA_GET_MSECS(tbp[NDTPA_DELAY_PROBE_TIME - 1]);
+ err = -ENOENT;
+ if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+ tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+ !net_eq(net, &init_net))
+ goto errout_tbl_lock;
- if (tbp[NDTPA_RETRANS_TIME - 1])
- p->retrans_time =
- RTA_GET_MSECS(tbp[NDTPA_RETRANS_TIME - 1]);
+ if (tb[NDTA_THRESH1])
+ tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
- if (tbp[NDTPA_ANYCAST_DELAY - 1])
- p->anycast_delay =
- RTA_GET_MSECS(tbp[NDTPA_ANYCAST_DELAY - 1]);
+ if (tb[NDTA_THRESH2])
+ tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
- if (tbp[NDTPA_PROXY_DELAY - 1])
- p->proxy_delay =
- RTA_GET_MSECS(tbp[NDTPA_PROXY_DELAY - 1]);
+ if (tb[NDTA_THRESH3])
+ tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
- if (tbp[NDTPA_LOCKTIME - 1])
- p->locktime = RTA_GET_MSECS(tbp[NDTPA_LOCKTIME - 1]);
- }
+ if (tb[NDTA_GC_INTERVAL])
+ tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
err = 0;
-rtattr_failure:
+errout_tbl_lock:
write_unlock_bh(&tbl->lock);
-errout:
+errout_locked:
read_unlock(&neigh_tbl_lock);
+errout:
return err;
}
-int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx, family;
- int s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family, tidx, nidx = 0;
+ int tbl_skip = cb->args[0];
+ int neigh_skip = cb->args[1];
struct neigh_table *tbl;
- family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
+ family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables, idx = 0; tbl; tbl = tbl->next) {
+ for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
struct neigh_parms *p;
- if (idx < s_idx || (family && tbl->family != family))
+ if (tidx < tbl_skip || (family && tbl->family != family))
continue;
- if (neightbl_fill_info(tbl, skb, cb) <= 0)
+ if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) <= 0)
break;
- for (++idx, p = tbl->parms.next; p; p = p->next, idx++) {
- if (idx < s_idx)
+ for (nidx = 0, p = tbl->parms.next; p; p = p->next) {
+ if (!net_eq(neigh_parms_net(p), net))
continue;
- if (neightbl_fill_param_info(tbl, p, skb, cb) <= 0)
+ if (nidx < neigh_skip)
+ goto next;
+
+ if (neightbl_fill_param_info(skb, tbl, p,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) <= 0)
goto out;
+ next:
+ nidx++;
}
+ neigh_skip = 0;
}
out:
read_unlock(&neigh_tbl_lock);
- cb->args[0] = idx;
+ cb->args[0] = tidx;
+ cb->args[1] = nidx;
return skb->len;
}
-static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
- u32 pid, u32 seq, int event, unsigned int flags)
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
{
unsigned long now = jiffies;
- unsigned char *b = skb->tail;
struct nda_cacheinfo ci;
- int locked = 0;
- u32 probes;
- struct nlmsghdr *nlh = NLMSG_NEW(skb, pid, seq, event,
- sizeof(struct ndmsg), flags);
- struct ndmsg *ndm = NLMSG_DATA(nlh);
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
- ndm->ndm_family = n->ops->family;
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = neigh->ops->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
- ndm->ndm_flags = n->flags;
- ndm->ndm_type = n->type;
- ndm->ndm_ifindex = n->dev->ifindex;
- RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key);
- read_lock_bh(&n->lock);
- locked = 1;
- ndm->ndm_state = n->nud_state;
- if (n->nud_state & NUD_VALID)
- RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha);
- ci.ndm_used = now - n->used;
- ci.ndm_confirmed = now - n->confirmed;
- ci.ndm_updated = now - n->updated;
- ci.ndm_refcnt = atomic_read(&n->refcnt) - 1;
- probes = atomic_read(&n->probes);
- read_unlock_bh(&n->lock);
- locked = 0;
- RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
- RTA_PUT(skb, NDA_PROBES, sizeof(probes), &probes);
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
+ ndm->ndm_flags = neigh->flags;
+ ndm->ndm_type = neigh->type;
+ ndm->ndm_ifindex = neigh->dev->ifindex;
+
+ if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
+ goto nla_put_failure;
+
+ read_lock_bh(&neigh->lock);
+ ndm->ndm_state = neigh->nud_state;
+ if (neigh->nud_state & NUD_VALID) {
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, neigh, neigh->dev);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+ read_unlock_bh(&neigh->lock);
+ goto nla_put_failure;
+ }
+ }
+
+ ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
+ ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+ ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
+ ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
+ read_unlock_bh(&neigh->lock);
+
+ if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
+ nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
+ u32 pid, u32 seq, int type, unsigned int flags,
+ struct neigh_table *tbl)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
-nlmsg_failure:
-rtattr_failure:
- if (locked)
- read_unlock_bh(&n->lock);
- skb_trim(skb, b - skb->data);
- return -1;
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = tbl->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = pn->flags | NTF_PROXY;
+ ndm->ndm_type = RTN_UNICAST;
+ ndm->ndm_ifindex = pn->dev->ifindex;
+ ndm->ndm_state = NUD_NONE;
+
+ if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
+static void neigh_update_notify(struct neighbour *neigh)
+{
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0);
+}
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
+ struct neigh_hash_table *nht;
- for (h = 0; h <= tbl->hash_mask; h++) {
- if (h < s_h)
- continue;
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
- read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) {
- if (idx < s_idx)
+ for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (!net_eq(dev_net(n->dev), net))
continue;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+ if (idx < s_idx)
+ goto next;
+ if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI) <= 0) {
- read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
}
+next:
+ idx++;
}
- read_unlock_bh(&tbl->lock);
}
rc = skb->len;
out:
+ rcu_read_unlock_bh();
cb->args[1] = h;
cb->args[2] = idx;
return rc;
}
-int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct pneigh_entry *n;
+ struct net *net = sock_net(skb->sk);
+ int rc, h, s_h = cb->args[3];
+ int idx, s_idx = idx = cb->args[4];
+
+ read_lock_bh(&tbl->lock);
+
+ for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+ if (dev_net(n->dev) != net)
+ continue;
+ if (idx < s_idx)
+ goto next;
+ if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI, tbl) <= 0) {
+ read_unlock_bh(&tbl->lock);
+ rc = -1;
+ goto out;
+ }
+ next:
+ idx++;
+ }
+ }
+
+ read_unlock_bh(&tbl->lock);
+ rc = skb->len;
+out:
+ cb->args[3] = h;
+ cb->args[4] = idx;
+ return rc;
+
+}
+
+static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct neigh_table *tbl;
int t, family, s_t;
+ int proxy = 0;
+ int err;
read_lock(&neigh_tbl_lock);
- family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
+ family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+ /* check for full ndmsg structure presence, family member is
+ * the same for both structures
+ */
+ if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+ proxy = 1;
+
s_t = cb->args[0];
- for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+ for (tbl = neigh_tables, t = 0; tbl;
+ tbl = tbl->next, t++) {
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
- if (neigh_dump_table(tbl, skb, cb) < 0)
+ if (proxy)
+ err = pneigh_dump_table(tbl, skb, cb);
+ else
+ err = neigh_dump_table(tbl, skb, cb);
+ if (err < 0)
break;
}
read_unlock(&neigh_tbl_lock);
@@ -1958,15 +2392,22 @@ int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
{
int chain;
+ struct neigh_hash_table *nht;
- read_lock_bh(&tbl->lock);
- for (chain = 0; chain <= tbl->hash_mask; chain++) {
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ read_lock(&tbl->lock); /* avoid resizes */
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
- for (n = tbl->hash_buckets[chain]; n; n = n->next)
+ for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next))
cb(n, cookie);
}
- read_unlock_bh(&tbl->lock);
+ read_unlock(&tbl->lock);
+ rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -1975,24 +2416,31 @@ void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
int chain;
+ struct neigh_hash_table *nht;
- for (chain = 0; chain <= tbl->hash_mask; chain++) {
- struct neighbour *n, **np;
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
- np = &tbl->hash_buckets[chain];
- while ((n = *np) != NULL) {
+ np = &nht->hash_buckets[chain];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
n->dead = 1;
} else
np = &n->next;
write_unlock(&n->lock);
if (release)
- neigh_release(n);
+ neigh_cleanup_and_release(n);
}
}
}
@@ -2003,15 +2451,18 @@ EXPORT_SYMBOL(__neigh_for_each_release);
static struct neighbour *neigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
+ struct net *net = seq_file_net(seq);
+ struct neigh_hash_table *nht = state->nht;
struct neighbour *n = NULL;
int bucket = state->bucket;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
- n = tbl->hash_buckets[bucket];
+ for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
+ n = rcu_dereference_bh(nht->hash_buckets[bucket]);
while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
if (state->neigh_sub_iter) {
loff_t fakep = 0;
void *v;
@@ -2024,8 +2475,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
break;
if (n->nud_state & ~NUD_NOARP)
break;
- next:
- n = n->next;
+next:
+ n = rcu_dereference_bh(n->next);
}
if (n)
@@ -2041,17 +2492,20 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
+ struct net *net = seq_file_net(seq);
+ struct neigh_hash_table *nht = state->nht;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
}
- n = n->next;
+ n = rcu_dereference_bh(n->next);
while (1) {
while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
@@ -2063,17 +2517,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
if (n->nud_state & ~NUD_NOARP)
break;
- next:
- n = n->next;
+next:
+ n = rcu_dereference_bh(n->next);
}
if (n)
break;
- if (++state->bucket > tbl->hash_mask)
+ if (++state->bucket >= (1 << nht->hash_shift))
break;
- n = tbl->hash_buckets[state->bucket];
+ n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
}
if (n && pos)
@@ -2086,6 +2540,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
struct neighbour *n = neigh_get_first(seq);
if (n) {
+ --(*pos);
while (*pos) {
n = neigh_get_next(seq, n, pos);
if (!n)
@@ -2098,6 +2553,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
int bucket = state->bucket;
@@ -2105,6 +2561,8 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
pn = tbl->phash_buckets[bucket];
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
if (pn)
break;
}
@@ -2118,13 +2576,19 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
- pn = pn->next;
+ do {
+ pn = pn->next;
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
pn = tbl->phash_buckets[state->bucket];
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
if (pn)
break;
}
@@ -2140,6 +2604,7 @@ static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
struct pneigh_entry *pn = pneigh_get_first(seq);
if (pn) {
+ --(*pos);
while (*pos) {
pn = pneigh_get_next(seq, pn, pos);
if (!pn)
@@ -2153,27 +2618,28 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
void *rc;
+ loff_t idxpos = *pos;
- rc = neigh_get_idx(seq, pos);
+ rc = neigh_get_idx(seq, &idxpos);
if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
- rc = pneigh_get_idx(seq, pos);
+ rc = pneigh_get_idx(seq, &idxpos);
return rc;
}
void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+ __acquires(rcu_bh)
{
struct neigh_seq_state *state = seq->private;
- loff_t pos_minus_one;
state->tbl = tbl;
state->bucket = 0;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock_bh();
+ state->nht = rcu_dereference_bh(tbl->nht);
- pos_minus_one = *pos - 1;
- return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
+ return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(neigh_seq_start);
@@ -2183,7 +2649,7 @@ void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
void *rc;
if (v == SEQ_START_TOKEN) {
- rc = neigh_get_idx(seq, pos);
+ rc = neigh_get_first(seq);
goto out;
}
@@ -2205,11 +2671,9 @@ out:
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu_bh)
{
- struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
-
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_seq_stop);
@@ -2217,14 +2681,13 @@ EXPORT_SYMBOL(neigh_seq_stop);
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct proc_dir_entry *pde = seq->private;
- struct neigh_table *tbl = pde->data;
+ struct neigh_table *tbl = seq->private;
int cpu;
if (*pos == 0)
return SEQ_START_TOKEN;
-
- for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
@@ -2235,11 +2698,10 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct proc_dir_entry *pde = seq->private;
- struct neigh_table *tbl = pde->data;
+ struct neigh_table *tbl = seq->private;
int cpu;
- for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
@@ -2255,17 +2717,16 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
- struct proc_dir_entry *pde = seq->private;
- struct neigh_table *tbl = pde->data;
+ struct neigh_table *tbl = seq->private;
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs\n");
+ seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n");
return 0;
}
seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx\n",
+ "%08lx %08lx %08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
@@ -2281,13 +2742,14 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
st->rcv_probes_ucast,
st->periodic_gc_runs,
- st->forced_gc_runs
+ st->forced_gc_runs,
+ st->unres_discards
);
return 0;
}
-static struct seq_operations neigh_stat_seq_ops = {
+static const struct seq_operations neigh_stat_seq_ops = {
.start = neigh_stat_seq_start,
.next = neigh_stat_seq_next,
.stop = neigh_stat_seq_stop,
@@ -2300,12 +2762,12 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- sf->private = PDE(inode);
+ sf->private = PDE_DATA(inode);
}
return ret;
};
-static struct file_operations neigh_stat_seq_fops = {
+static const struct file_operations neigh_stat_seq_fops = {
.owner = THIS_MODULE,
.open = neigh_stat_seq_open,
.read = seq_read,
@@ -2315,357 +2777,364 @@ static struct file_operations neigh_stat_seq_fops = {
#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_ARPD
-void neigh_app_ns(struct neighbour *n)
+static inline size_t neigh_nlmsg_size(void)
{
- struct nlmsghdr *nlh;
- int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256);
- struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC);
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(4); /* NDA_PROBES */
+}
- if (!skb)
- return;
+static void __neigh_notify(struct neighbour *n, int type, int flags)
+{
+ struct net *net = dev_net(n->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
- if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH, 0) < 0) {
+ err = neigh_fill_info(skb, n, 0, 0, type, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
- return;
+ goto errout;
}
- nlh = (struct nlmsghdr *)skb->data;
- nlh->nlmsg_flags = NLM_F_REQUEST;
- NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
- netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
-static void neigh_app_notify(struct neighbour *n)
+void neigh_app_ns(struct neighbour *n)
{
- struct nlmsghdr *nlh;
- int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256);
- struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC);
+ __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
+}
+EXPORT_SYMBOL(neigh_app_ns);
- if (!skb)
- return;
+#ifdef CONFIG_SYSCTL
+static int zero;
+static int int_max = INT_MAX;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
- if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH, 0) < 0) {
- kfree_skb(skb);
- return;
+static int proc_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ struct ctl_table tmp = *ctl;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &unres_qlen_max;
+ tmp.data = &size;
+
+ size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+ return ret;
+}
+
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
}
- nlh = (struct nlmsghdr *)skb->data;
- NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
- netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
+ return NULL;
}
-#endif /* CONFIG_ARPD */
+static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
+ int index)
+{
+ struct net_device *dev;
+ int family = neigh_parms_family(p);
-#ifdef CONFIG_SYSCTL
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct neigh_parms *dst_p =
+ neigh_get_dev_parms_rcu(dev, family);
+
+ if (dst_p && !test_bit(index, dst_p->data_state))
+ dst_p->data[index] = p->data[index];
+ }
+ rcu_read_unlock();
+}
+
+static void neigh_proc_update(struct ctl_table *ctl, int write)
+{
+ struct net_device *dev = ctl->extra1;
+ struct neigh_parms *p = ctl->extra2;
+ struct net *net = neigh_parms_net(p);
+ int index = (int *) ctl->data - p->data;
+
+ if (!write)
+ return;
+
+ set_bit(index, p->data_state);
+ if (!dev) /* NULL dev means this is default value */
+ neigh_copy_dflt_parms(net, p, index);
+}
+
+static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &int_max;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec);
+
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
+
+static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
+
+static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+#define NEIGH_PARMS_DATA_OFFSET(index) \
+ (&((struct neigh_parms *) 0)->data[index])
+
+#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \
+ [NEIGH_VAR_ ## attr] = { \
+ .procname = name, \
+ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ }
+
+#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax)
+
+#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies)
+
+#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- ctl_table neigh_vars[__NET_NEIGH_MAX];
- ctl_table neigh_dev[2];
- ctl_table neigh_neigh_dir[2];
- ctl_table neigh_proto_dir[2];
- ctl_table neigh_root_dir[2];
-} neigh_sysctl_template = {
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
+} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
- {
- .ctl_name = NET_NEIGH_MCAST_SOLICIT,
- .procname = "mcast_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_UCAST_SOLICIT,
- .procname = "ucast_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_APP_SOLICIT,
- .procname = "app_solicit",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_RETRANS_TIME,
- .procname = "retrans_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_REACHABLE_TIME,
- .procname = "base_reachable_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_DELAY_PROBE_TIME,
- .procname = "delay_first_probe_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_GC_STALE_TIME,
- .procname = "gc_stale_time",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_UNRES_QLEN,
- .procname = "unres_qlen",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_PROXY_QLEN,
- .procname = "proxy_qlen",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_ANYCAST_DELAY,
- .procname = "anycast_delay",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_PROXY_DELAY,
- .procname = "proxy_delay",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_LOCKTIME,
- .procname = "locktime",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_GC_INTERVAL,
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
+ NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
+ [NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
- {
- .ctl_name = NET_NEIGH_GC_THRESH1,
+ [NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
},
- {
- .ctl_name = NET_NEIGH_GC_THRESH2,
+ [NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
},
- {
- .ctl_name = NET_NEIGH_GC_THRESH3,
+ [NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_NEIGH_RETRANS_TIME_MS,
- .procname = "retrans_time_ms",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_ms_jiffies,
- .strategy = &sysctl_ms_jiffies,
- },
- {
- .ctl_name = NET_NEIGH_REACHABLE_TIME_MS,
- .procname = "base_reachable_time_ms",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_ms_jiffies,
- .strategy = &sysctl_ms_jiffies,
- },
- },
- .neigh_dev = {
- {
- .ctl_name = NET_PROTO_CONF_DEFAULT,
- .procname = "default",
- .mode = 0555,
- },
- },
- .neigh_neigh_dir = {
- {
- .procname = "neigh",
- .mode = 0555,
- },
- },
- .neigh_proto_dir = {
- {
- .mode = 0555,
- },
- },
- .neigh_root_dir = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
},
+ {},
},
};
int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
- int p_id, int pdev_id, char *p_name,
- proc_handler *handler, ctl_handler *strategy)
+ proc_handler *handler)
{
- struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
- const char *dev_name_source = NULL;
- char *dev_name = NULL;
- int err = 0;
+ int i;
+ struct neigh_sysctl_table *t;
+ const char *dev_name_source;
+ char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
+ char *p_name;
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
if (!t)
- return -ENOBUFS;
- memcpy(t, &neigh_sysctl_template, sizeof(*t));
- t->neigh_vars[0].data = &p->mcast_probes;
- t->neigh_vars[1].data = &p->ucast_probes;
- t->neigh_vars[2].data = &p->app_probes;
- t->neigh_vars[3].data = &p->retrans_time;
- t->neigh_vars[4].data = &p->base_reachable_time;
- t->neigh_vars[5].data = &p->delay_probe_time;
- t->neigh_vars[6].data = &p->gc_staletime;
- t->neigh_vars[7].data = &p->queue_len;
- t->neigh_vars[8].data = &p->proxy_qlen;
- t->neigh_vars[9].data = &p->anycast_delay;
- t->neigh_vars[10].data = &p->proxy_delay;
- t->neigh_vars[11].data = &p->locktime;
+ goto err;
+
+ for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
+ t->neigh_vars[i].data += (long) p;
+ t->neigh_vars[i].extra1 = dev;
+ t->neigh_vars[i].extra2 = p;
+ }
if (dev) {
dev_name_source = dev->name;
- t->neigh_dev[0].ctl_name = dev->ifindex;
- t->neigh_vars[12].procname = NULL;
- t->neigh_vars[13].procname = NULL;
- t->neigh_vars[14].procname = NULL;
- t->neigh_vars[15].procname = NULL;
+ /* Terminate the table early */
+ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+ sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
- dev_name_source = t->neigh_dev[0].procname;
- t->neigh_vars[12].data = (int *)(p + 1);
- t->neigh_vars[13].data = (int *)(p + 1) + 1;
- t->neigh_vars[14].data = (int *)(p + 1) + 2;
- t->neigh_vars[15].data = (int *)(p + 1) + 3;
+ struct neigh_table *tbl = p->tbl;
+ dev_name_source = "default";
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
}
- t->neigh_vars[16].data = &p->retrans_time;
- t->neigh_vars[17].data = &p->base_reachable_time;
-
- if (handler || strategy) {
+ if (handler) {
/* RetransTime */
- t->neigh_vars[3].proc_handler = handler;
- t->neigh_vars[3].strategy = strategy;
- t->neigh_vars[3].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
/* ReachableTime */
- t->neigh_vars[4].proc_handler = handler;
- t->neigh_vars[4].strategy = strategy;
- t->neigh_vars[4].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
/* RetransTime (in milliseconds)*/
- t->neigh_vars[16].proc_handler = handler;
- t->neigh_vars[16].strategy = strategy;
- t->neigh_vars[16].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
/* ReachableTime (in milliseconds) */
- t->neigh_vars[17].proc_handler = handler;
- t->neigh_vars[17].strategy = strategy;
- t->neigh_vars[17].extra1 = dev;
- }
-
- dev_name = kstrdup(dev_name_source, GFP_KERNEL);
- if (!dev_name) {
- err = -ENOBUFS;
- goto free;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
}
- t->neigh_dev[0].procname = dev_name;
-
- t->neigh_neigh_dir[0].ctl_name = pdev_id;
+ /* Don't export sysctls to unprivileged users */
+ if (neigh_parms_net(p)->user_ns != &init_user_ns)
+ t->neigh_vars[0].procname = NULL;
- t->neigh_proto_dir[0].procname = p_name;
- t->neigh_proto_dir[0].ctl_name = p_id;
+ switch (neigh_parms_family(p)) {
+ case AF_INET:
+ p_name = "ipv4";
+ break;
+ case AF_INET6:
+ p_name = "ipv6";
+ break;
+ default:
+ BUG();
+ }
- t->neigh_dev[0].child = t->neigh_vars;
- t->neigh_neigh_dir[0].child = t->neigh_dev;
- t->neigh_proto_dir[0].child = t->neigh_neigh_dir;
- t->neigh_root_dir[0].child = t->neigh_proto_dir;
+ snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
+ p_name, dev_name_source);
+ t->sysctl_header =
+ register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
+ if (!t->sysctl_header)
+ goto free;
- t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0);
- if (!t->sysctl_header) {
- err = -ENOBUFS;
- goto free_procname;
- }
p->sysctl_table = t;
return 0;
- /* error path */
- free_procname:
- kfree(dev_name);
- free:
+free:
kfree(t);
-
- return err;
+err:
+ return -ENOBUFS;
}
+EXPORT_SYMBOL(neigh_sysctl_register);
void neigh_sysctl_unregister(struct neigh_parms *p)
{
if (p->sysctl_table) {
struct neigh_sysctl_table *t = p->sysctl_table;
p->sysctl_table = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->neigh_dev[0].procname);
+ unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
}
+EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
-EXPORT_SYMBOL(__neigh_event_send);
-EXPORT_SYMBOL(neigh_add);
-EXPORT_SYMBOL(neigh_changeaddr);
-EXPORT_SYMBOL(neigh_compat_output);
-EXPORT_SYMBOL(neigh_connected_output);
-EXPORT_SYMBOL(neigh_create);
-EXPORT_SYMBOL(neigh_delete);
-EXPORT_SYMBOL(neigh_destroy);
-EXPORT_SYMBOL(neigh_dump_info);
-EXPORT_SYMBOL(neigh_event_ns);
-EXPORT_SYMBOL(neigh_ifdown);
-EXPORT_SYMBOL(neigh_lookup);
-EXPORT_SYMBOL(neigh_lookup_nodev);
-EXPORT_SYMBOL(neigh_parms_alloc);
-EXPORT_SYMBOL(neigh_parms_release);
-EXPORT_SYMBOL(neigh_rand_reach_time);
-EXPORT_SYMBOL(neigh_resolve_output);
-EXPORT_SYMBOL(neigh_table_clear);
-EXPORT_SYMBOL(neigh_table_init);
-EXPORT_SYMBOL(neigh_update);
-EXPORT_SYMBOL(neigh_update_hhs);
-EXPORT_SYMBOL(pneigh_enqueue);
-EXPORT_SYMBOL(pneigh_lookup);
-EXPORT_SYMBOL(neightbl_dump_info);
-EXPORT_SYMBOL(neightbl_set);
+static int __init neigh_init(void)
+{
+ rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
+ NULL);
+ rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);
+
+ return 0;
+}
+
+subsys_initcall(neigh_init);
-#ifdef CONFIG_ARPD
-EXPORT_SYMBOL(neigh_app_ns);
-#endif
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(neigh_sysctl_register);
-EXPORT_SYMBOL(neigh_sysctl_unregister);
-#endif
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
new file mode 100644
index 00000000000..2bf83299600
--- /dev/null
+++ b/net/core/net-procfs.c
@@ -0,0 +1,423 @@
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/wext.h>
+
+#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
+
+extern struct list_head ptype_all __read_mostly;
+extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
+static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct net_device *dev;
+ struct hlist_head *h;
+ unsigned int count = 0, offset = get_offset(*pos);
+
+ h = &net->dev_name_head[get_bucket(*pos)];
+ hlist_for_each_entry_rcu(dev, h, name_hlist) {
+ if (++count == offset)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net_device *dev;
+ unsigned int bucket;
+
+ do {
+ dev = dev_from_same_bucket(seq, pos);
+ if (dev)
+ return dev;
+
+ bucket = get_bucket(*pos) + 1;
+ *pos = set_bucket_offset(bucket, 1);
+ } while (bucket < NETDEV_HASHENTRIES);
+
+ return NULL;
+}
+
+/*
+ * This is invoked by the /proc filesystem handler to display a device
+ * in detail.
+ */
+static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
+ return NULL;
+
+ return dev_from_bucket(seq, pos);
+}
+
+static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return dev_from_bucket(seq, pos);
+}
+
+static void dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+ "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+ dev->name, stats->rx_bytes, stats->rx_packets,
+ stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors,
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors +
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes, stats->tx_packets,
+ stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors +
+ stats->tx_aborted_errors +
+ stats->tx_window_errors +
+ stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized
+ * /proc/net interface to create /proc/net/dev
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Inter-| Receive "
+ " | Transmit\n"
+ " face |bytes packets errs drop fifo frame "
+ "compressed multicast|bytes packets errs "
+ "drop fifo colls carrier compressed\n");
+ else
+ dev_seq_printf_stats(seq, v);
+ return 0;
+}
+
+static struct softnet_data *softnet_get_online(loff_t *pos)
+{
+ struct softnet_data *sd = NULL;
+
+ while (*pos < nr_cpu_ids)
+ if (cpu_online(*pos)) {
+ sd = &per_cpu(softnet_data, *pos);
+ break;
+ } else
+ ++*pos;
+ return sd;
+}
+
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return softnet_get_online(pos);
+}
+
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+ struct softnet_data *sd = v;
+ unsigned int flow_limit_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl)
+ flow_limit_count = fl->count;
+ rcu_read_unlock();
+#endif
+
+ seq_printf(seq,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ sd->processed, sd->dropped, sd->time_squeeze, 0,
+ 0, 0, 0, 0, /* was fastroute */
+ sd->cpu_collision, sd->received_rps, flow_limit_count);
+ return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_seq_show,
+};
+
+static int dev_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static const struct seq_operations softnet_seq_ops = {
+ .start = softnet_seq_start,
+ .next = softnet_seq_next,
+ .stop = softnet_seq_stop,
+ .show = softnet_seq_show,
+};
+
+static int softnet_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &softnet_seq_ops);
+}
+
+static const struct file_operations softnet_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = softnet_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *ptype_get_idx(loff_t pos)
+{
+ struct packet_type *pt = NULL;
+ loff_t i = 0;
+ int t;
+
+ list_for_each_entry_rcu(pt, &ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ for (t = 0; t < PTYPE_HASH_SIZE; t++) {
+ list_for_each_entry_rcu(pt, &ptype_base[t], list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+ return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct packet_type *pt;
+ struct list_head *nxt;
+ int hash;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ptype_get_idx(0);
+
+ pt = v;
+ nxt = pt->list.next;
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (nxt != &ptype_all)
+ goto found;
+ hash = 0;
+ nxt = ptype_base[0].next;
+ } else
+ hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+
+ while (nxt == &ptype_base[hash]) {
+ if (++hash >= PTYPE_HASH_SIZE)
+ return NULL;
+ nxt = ptype_base[hash].next;
+ }
+found:
+ return list_entry(nxt, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+ struct packet_type *pt = v;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Type Device Function\n");
+ else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
+ if (pt->type == htons(ETH_P_ALL))
+ seq_puts(seq, "ALL ");
+ else
+ seq_printf(seq, "%04x", ntohs(pt->type));
+
+ seq_printf(seq, " %-8s %pf\n",
+ pt->dev ? pt->dev->name : "", pt->func);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+ .start = ptype_seq_start,
+ .next = ptype_seq_next,
+ .stop = ptype_seq_stop,
+ .show = ptype_seq_show,
+};
+
+static int ptype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ptype_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ptype_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ptype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+static int __net_init dev_proc_net_init(struct net *net)
+{
+ int rc = -ENOMEM;
+
+ if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
+ goto out;
+ if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+ &softnet_seq_fops))
+ goto out_dev;
+ if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+ goto out_softnet;
+
+ if (wext_proc_init(net))
+ goto out_ptype;
+ rc = 0;
+out:
+ return rc;
+out_ptype:
+ remove_proc_entry("ptype", net->proc_net);
+out_softnet:
+ remove_proc_entry("softnet_stat", net->proc_net);
+out_dev:
+ remove_proc_entry("dev", net->proc_net);
+ goto out;
+}
+
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+ wext_proc_exit(net);
+
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ remove_proc_entry("dev", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_proc_ops = {
+ .init = dev_proc_net_init,
+ .exit = dev_proc_net_exit,
+};
+
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+ struct netdev_hw_addr *ha;
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ netif_addr_lock_bh(dev);
+ netdev_for_each_mc_addr(ha, dev) {
+ int i;
+
+ seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
+ dev->name, ha->refcount, ha->global_use);
+
+ for (i = 0; i < dev->addr_len; i++)
+ seq_printf(seq, "%02x", ha->addr[i]);
+
+ seq_putc(seq, '\n');
+ }
+ netif_addr_unlock_bh(dev);
+ return 0;
+}
+
+static const struct seq_operations dev_mc_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_mc_seq_show,
+};
+
+static int dev_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_mc_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init dev_mc_net_init(struct net *net)
+{
+ if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+ remove_proc_entry("dev_mcast", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+ .init = dev_mc_net_init,
+ .exit = dev_mc_net_exit,
+};
+
+int __init dev_proc_init(void)
+{
+ int ret = register_pernet_subsys(&dev_proc_ops);
+ if (!ret)
+ return register_pernet_subsys(&dev_mc_net_ops);
+ return ret;
+}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e8b2acbc8ea..1cac29ebb05 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -2,7 +2,7 @@
* net-sysfs.c - network device class and attributes
*
* Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
- *
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
@@ -10,33 +10,40 @@
*/
#include <linux/capability.h>
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
#include <net/sock.h>
+#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
-#include <linux/wireless.h>
-#include <net/iw_handler.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
-#define to_class_dev(obj) container_of(obj,struct class_device,kobj)
-#define to_net_dev(class) container_of(class, struct net_device, class_dev)
+#include "net-sysfs.h"
+#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
+static const char fmt_udec[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
-static inline int dev_isalive(const struct net_device *dev)
+static inline int dev_isalive(const struct net_device *dev)
{
- return dev->reg_state == NETREG_REGISTERED;
+ return dev->reg_state <= NETREG_REGISTERED;
}
/* use same locking rules as GIF* ioctl's */
-static ssize_t netdev_show(const struct class_device *cd, char *buf,
+static ssize_t netdev_show(const struct device *dev,
+ struct device_attribute *attr, char *buf,
ssize_t (*format)(const struct net_device *, char *))
{
- struct net_device *net = to_net_dev(cd);
+ struct net_device *net = to_net_dev(dev);
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
@@ -53,32 +60,42 @@ static ssize_t format_##field(const struct net_device *net, char *buf) \
{ \
return sprintf(buf, format_string, net->field); \
} \
-static ssize_t show_##field(struct class_device *cd, char *buf) \
+static ssize_t field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
{ \
- return netdev_show(cd, buf, format_##field); \
-}
+ return netdev_show(dev, attr, buf, format_##field); \
+} \
+
+#define NETDEVICE_SHOW_RO(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RO(field)
+#define NETDEVICE_SHOW_RW(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RW(field)
/* use same locking and permission rules as SIF* ioctl's */
-static ssize_t netdev_store(struct class_device *dev,
+static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len,
int (*set)(struct net_device *, unsigned long))
{
- struct net_device *net = to_net_dev(dev);
- char *endp;
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
unsigned long new;
int ret = -EINVAL;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- new = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
goto err;
- rtnl_lock();
- if (dev_isalive(net)) {
- if ((ret = (*set)(net, new)) == 0)
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ if ((ret = (*set)(netdev, new)) == 0)
ret = len;
}
rtnl_unlock();
@@ -86,45 +103,55 @@ static ssize_t netdev_store(struct class_device *dev,
return ret;
}
-NETDEVICE_SHOW(addr_len, fmt_dec);
-NETDEVICE_SHOW(iflink, fmt_dec);
-NETDEVICE_SHOW(ifindex, fmt_dec);
-NETDEVICE_SHOW(features, fmt_long_hex);
-NETDEVICE_SHOW(type, fmt_dec);
+NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
+NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
+NETDEVICE_SHOW_RO(addr_len, fmt_dec);
+NETDEVICE_SHOW_RO(iflink, fmt_dec);
+NETDEVICE_SHOW_RO(ifindex, fmt_dec);
+NETDEVICE_SHOW_RO(type, fmt_dec);
+NETDEVICE_SHOW_RO(link_mode, fmt_dec);
/* use same locking rules as GIFHWADDR ioctl's */
-static ssize_t format_addr(char *buf, const unsigned char *addr, int len)
-{
- int i;
- char *cp = buf;
-
- for (i = 0; i < len; i++)
- cp += sprintf(cp, "%02x%c", addr[i],
- i == (len - 1) ? '\n' : ':');
- return cp - buf;
-}
-
-static ssize_t show_address(struct class_device *dev, char *buf)
+static ssize_t address_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct net_device *net = to_net_dev(dev);
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
if (dev_isalive(net))
- ret = format_addr(buf, net->dev_addr, net->addr_len);
+ ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len);
read_unlock(&dev_base_lock);
return ret;
}
+static DEVICE_ATTR_RO(address);
-static ssize_t show_broadcast(struct class_device *dev, char *buf)
+static ssize_t broadcast_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct net_device *net = to_net_dev(dev);
if (dev_isalive(net))
- return format_addr(buf, net->broadcast, net->addr_len);
+ return sysfs_format_mac(buf, net->broadcast, net->addr_len);
return -EINVAL;
}
+static DEVICE_ATTR_RO(broadcast);
+
+static int change_carrier(struct net_device *net, unsigned long new_carrier)
+{
+ if (!netif_running(net))
+ return -EINVAL;
+ return dev_change_carrier(net, (bool) new_carrier);
+}
+
+static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_carrier);
+}
-static ssize_t show_carrier(struct class_device *dev, char *buf)
+static ssize_t carrier_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev)) {
@@ -132,33 +159,135 @@ static ssize_t show_carrier(struct class_device *dev, char *buf)
}
return -EINVAL;
}
+static DEVICE_ATTR_RW(carrier);
+
+static ssize_t speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (netif_running(netdev)) {
+ struct ethtool_cmd cmd;
+ if (!__ethtool_get_settings(netdev, &cmd))
+ ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
+ }
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(speed);
+
+static ssize_t duplex_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (netif_running(netdev)) {
+ struct ethtool_cmd cmd;
+ if (!__ethtool_get_settings(netdev, &cmd)) {
+ const char *duplex;
+ switch (cmd.duplex) {
+ case DUPLEX_HALF:
+ duplex = "half";
+ break;
+ case DUPLEX_FULL:
+ duplex = "full";
+ break;
+ default:
+ duplex = "unknown";
+ break;
+ }
+ ret = sprintf(buf, "%s\n", duplex);
+ }
+ }
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(duplex);
+
+static ssize_t dormant_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(dormant);
+
+static const char *const operstates[] = {
+ "unknown",
+ "notpresent", /* currently unused */
+ "down",
+ "lowerlayerdown",
+ "testing", /* currently unused */
+ "dormant",
+ "up"
+};
+
+static ssize_t operstate_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const struct net_device *netdev = to_net_dev(dev);
+ unsigned char operstate;
+
+ read_lock(&dev_base_lock);
+ operstate = netdev->operstate;
+ if (!netif_running(netdev))
+ operstate = IF_OPER_DOWN;
+ read_unlock(&dev_base_lock);
+
+ if (operstate >= ARRAY_SIZE(operstates))
+ return -EINVAL; /* should not happen */
+
+ return sprintf(buf, "%s\n", operstates[operstate]);
+}
+static DEVICE_ATTR_RO(operstate);
+
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ return sprintf(buf, fmt_dec,
+ atomic_read(&netdev->carrier_changes));
+}
+static DEVICE_ATTR_RO(carrier_changes);
/* read-write attributes */
-NETDEVICE_SHOW(mtu, fmt_dec);
static int change_mtu(struct net_device *net, unsigned long new_mtu)
{
return dev_set_mtu(net, (int) new_mtu);
}
-static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len)
+static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_mtu);
+ return netdev_store(dev, attr, buf, len, change_mtu);
}
-
-NETDEVICE_SHOW(flags, fmt_hex);
+NETDEVICE_SHOW_RW(mtu, fmt_dec);
static int change_flags(struct net_device *net, unsigned long new_flags)
{
- return dev_change_flags(net, (unsigned) new_flags);
+ return dev_change_flags(net, (unsigned int) new_flags);
}
-static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len)
+static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_flags);
+ return netdev_store(dev, attr, buf, len, change_flags);
}
-
-NETDEVICE_SHOW(tx_queue_len, fmt_ulong);
+NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
{
@@ -166,71 +295,149 @@ static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
return 0;
}
-static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, size_t len)
+static ssize_t tx_queue_len_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_tx_queue_len);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
+
+static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ size_t count = len;
+ ssize_t ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
-NETDEVICE_SHOW(weight, fmt_dec);
+ /* ignore trailing newline */
+ if (len > 0 && buf[len - 1] == '\n')
+ --count;
-static int change_weight(struct net_device *net, unsigned long new_weight)
+ if (!rtnl_trylock())
+ return restart_syscall();
+ ret = dev_set_alias(netdev, buf, count);
+ rtnl_unlock();
+
+ return ret < 0 ? ret : len;
+}
+
+static ssize_t ifalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = 0;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+ if (netdev->ifalias)
+ ret = sprintf(buf, "%s\n", netdev->ifalias);
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RW(ifalias);
+
+static int change_group(struct net_device *net, unsigned long new_group)
{
- net->weight = new_weight;
+ dev_set_group(net, (int) new_group);
return 0;
}
-static ssize_t store_weight(struct class_device *dev, const char *buf, size_t len)
+static ssize_t group_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
{
- return netdev_store(dev, buf, len, change_weight);
+ return netdev_store(dev, attr, buf, len, change_group);
}
+NETDEVICE_SHOW(group, fmt_dec);
+static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
-static struct class_device_attribute net_class_attributes[] = {
- __ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
- __ATTR(iflink, S_IRUGO, show_iflink, NULL),
- __ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
- __ATTR(features, S_IRUGO, show_features, NULL),
- __ATTR(type, S_IRUGO, show_type, NULL),
- __ATTR(address, S_IRUGO, show_address, NULL),
- __ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
- __ATTR(carrier, S_IRUGO, show_carrier, NULL),
- __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
- __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
- __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
- store_tx_queue_len),
- __ATTR(weight, S_IRUGO | S_IWUSR, show_weight, store_weight),
- {}
+static ssize_t phys_port_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_port_id ppid;
+
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_id);
+
+static struct attribute *net_class_attrs[] = {
+ &dev_attr_netdev_group.attr,
+ &dev_attr_type.attr,
+ &dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
+ &dev_attr_iflink.attr,
+ &dev_attr_ifindex.attr,
+ &dev_attr_addr_assign_type.attr,
+ &dev_attr_addr_len.attr,
+ &dev_attr_link_mode.attr,
+ &dev_attr_address.attr,
+ &dev_attr_broadcast.attr,
+ &dev_attr_speed.attr,
+ &dev_attr_duplex.attr,
+ &dev_attr_dormant.attr,
+ &dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
+ &dev_attr_ifalias.attr,
+ &dev_attr_carrier.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_flags.attr,
+ &dev_attr_tx_queue_len.attr,
+ &dev_attr_phys_port_id.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(net_class);
/* Show a given an attribute in the statistics group */
-static ssize_t netstat_show(const struct class_device *cd, char *buf,
+static ssize_t netstat_show(const struct device *d,
+ struct device_attribute *attr, char *buf,
unsigned long offset)
{
- struct net_device *dev = to_net_dev(cd);
- struct net_device_stats *stats;
+ struct net_device *dev = to_net_dev(d);
ssize_t ret = -EINVAL;
- if (offset > sizeof(struct net_device_stats) ||
- offset % sizeof(unsigned long) != 0)
- WARN_ON(1);
+ WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
+ offset % sizeof(u64) != 0);
read_lock(&dev_base_lock);
- if (dev_isalive(dev) && dev->get_stats &&
- (stats = (*dev->get_stats)(dev)))
- ret = sprintf(buf, fmt_ulong,
- *(unsigned long *)(((u8 *) stats) + offset));
+ if (dev_isalive(dev)) {
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+ ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
+ }
read_unlock(&dev_base_lock);
return ret;
}
/* generate a read-only statistics attribute */
#define NETSTAT_ENTRY(name) \
-static ssize_t show_##name(struct class_device *cd, char *buf) \
+static ssize_t name##_show(struct device *d, \
+ struct device_attribute *attr, char *buf) \
{ \
- return netstat_show(cd, buf, \
- offsetof(struct net_device_stats, name)); \
+ return netstat_show(d, attr, buf, \
+ offsetof(struct rtnl_link_stats64, name)); \
} \
-static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+static DEVICE_ATTR_RO(name)
NETSTAT_ENTRY(rx_packets);
NETSTAT_ENTRY(tx_packets);
@@ -257,29 +464,29 @@ NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
static struct attribute *netstat_attrs[] = {
- &class_device_attr_rx_packets.attr,
- &class_device_attr_tx_packets.attr,
- &class_device_attr_rx_bytes.attr,
- &class_device_attr_tx_bytes.attr,
- &class_device_attr_rx_errors.attr,
- &class_device_attr_tx_errors.attr,
- &class_device_attr_rx_dropped.attr,
- &class_device_attr_tx_dropped.attr,
- &class_device_attr_multicast.attr,
- &class_device_attr_collisions.attr,
- &class_device_attr_rx_length_errors.attr,
- &class_device_attr_rx_over_errors.attr,
- &class_device_attr_rx_crc_errors.attr,
- &class_device_attr_rx_frame_errors.attr,
- &class_device_attr_rx_fifo_errors.attr,
- &class_device_attr_rx_missed_errors.attr,
- &class_device_attr_tx_aborted_errors.attr,
- &class_device_attr_tx_carrier_errors.attr,
- &class_device_attr_tx_fifo_errors.attr,
- &class_device_attr_tx_heartbeat_errors.attr,
- &class_device_attr_tx_window_errors.attr,
- &class_device_attr_rx_compressed.attr,
- &class_device_attr_tx_compressed.attr,
+ &dev_attr_rx_packets.attr,
+ &dev_attr_tx_packets.attr,
+ &dev_attr_rx_bytes.attr,
+ &dev_attr_tx_bytes.attr,
+ &dev_attr_rx_errors.attr,
+ &dev_attr_tx_errors.attr,
+ &dev_attr_rx_dropped.attr,
+ &dev_attr_tx_dropped.attr,
+ &dev_attr_multicast.attr,
+ &dev_attr_collisions.attr,
+ &dev_attr_rx_length_errors.attr,
+ &dev_attr_rx_over_errors.attr,
+ &dev_attr_rx_crc_errors.attr,
+ &dev_attr_rx_frame_errors.attr,
+ &dev_attr_rx_fifo_errors.attr,
+ &dev_attr_rx_missed_errors.attr,
+ &dev_attr_tx_aborted_errors.attr,
+ &dev_attr_tx_carrier_errors.attr,
+ &dev_attr_tx_fifo_errors.attr,
+ &dev_attr_tx_heartbeat_errors.attr,
+ &dev_attr_tx_window_errors.attr,
+ &dev_attr_rx_compressed.attr,
+ &dev_attr_tx_compressed.attr,
NULL
};
@@ -289,176 +496,914 @@ static struct attribute_group netstat_group = {
.attrs = netstat_attrs,
};
-#ifdef WIRELESS_EXT
-/* helper function that does all the locking etc for wireless stats */
-static ssize_t wireless_show(struct class_device *cd, char *buf,
- ssize_t (*format)(const struct iw_statistics *,
- char *))
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+static struct attribute *wireless_attrs[] = {
+ NULL
+};
+
+static struct attribute_group wireless_group = {
+ .name = "wireless",
+ .attrs = wireless_attrs,
+};
+#endif
+
+#else /* CONFIG_SYSFS */
+#define net_class_groups NULL
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_SYSFS
+#define to_rx_queue_attr(_attr) container_of(_attr, \
+ struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
{
- struct net_device *dev = to_net_dev(cd);
- const struct iw_statistics *iw = NULL;
- ssize_t ret = -EINVAL;
-
- read_lock(&dev_base_lock);
- if (dev_isalive(dev)) {
- if(dev->wireless_handlers &&
- dev->wireless_handlers->get_wireless_stats)
- iw = dev->wireless_handlers->get_wireless_stats(dev);
- else if (dev->get_wireless_stats)
- iw = dev->get_wireless_stats(dev);
- if (iw != NULL)
- ret = (*format)(iw, buf);
+ struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops rx_queue_sysfs_ops = {
+ .show = rx_queue_attr_show,
+ .store = rx_queue_attr_store,
+};
+
+#ifdef CONFIG_RPS
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute, char *buf)
+{
+ struct rps_map *map;
+ cpumask_var_t mask;
+ size_t len = 0;
+ int i;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ if (map)
+ for (i = 0; i < map->len; i++)
+ cpumask_set_cpu(map->cpus[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ free_cpumask_var(mask);
+ return -EINVAL;
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
- return ret;
+ free_cpumask_var(mask);
+ len += sprintf(buf + len, "\n");
+ return len;
}
-/* show function template for wireless fields */
-#define WIRELESS_SHOW(name, field, format_string) \
-static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct rps_map *old_map, *map;
+ cpumask_var_t mask;
+ int err, cpu, i;
+ static DEFINE_SPINLOCK(rps_map_lock);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ map = kzalloc(max_t(unsigned int,
+ RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+ GFP_KERNEL);
+ if (!map) {
+ free_cpumask_var(mask);
+ return -ENOMEM;
+ }
+
+ i = 0;
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ map->cpus[i++] = cpu;
+
+ if (i)
+ map->len = i;
+ else {
+ kfree(map);
+ map = NULL;
+ }
+
+ spin_lock(&rps_map_lock);
+ old_map = rcu_dereference_protected(queue->rps_map,
+ lockdep_is_held(&rps_map_lock));
+ rcu_assign_pointer(queue->rps_map, map);
+ spin_unlock(&rps_map_lock);
+
+ if (map)
+ static_key_slow_inc(&rps_needed);
+ if (old_map) {
+ kfree_rcu(old_map, rcu);
+ static_key_slow_dec(&rps_needed);
+ }
+ free_cpumask_var(mask);
+ return len;
+}
+
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned long val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = (unsigned long)flow_table->mask + 1;
+ rcu_read_unlock();
+
+ return sprintf(buf, "%lu\n", val);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+ vfree(table);
+}
+
+static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned long mask, count;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
+
+ if (count) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check
+ * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
+ if (!table)
+ return -ENOMEM;
+
+ table->mask = mask;
+ for (count = 0; count <= mask; count++)
+ table->flows[count].cpu = RPS_NO_CPU;
+ } else
+ table = NULL;
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = rcu_dereference_protected(queue->rps_flow_table,
+ lockdep_is_held(&rps_dev_flow_lock));
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute =
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+ __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
+
+static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
+ &rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
+#endif
+ NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
+ struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+
+
+ map = rcu_dereference_protected(queue->rps_map, 1);
+ if (map) {
+ RCU_INIT_POINTER(queue->rps_map, NULL);
+ kfree_rcu(map, rcu);
+ }
+
+ flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
+ if (flow_table) {
+ RCU_INIT_POINTER(queue->rps_flow_table, NULL);
+ call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
+ }
+#endif
+
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
+
+static const void *rx_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static struct kobj_type rx_queue_ktype = {
+ .sysfs_ops = &rx_queue_sysfs_ops,
+ .release = rx_queue_release,
+ .default_attrs = rx_queue_default_attrs,
+ .namespace = rx_queue_namespace
+};
+
+static int rx_queue_add_kobject(struct net_device *net, int index)
+{
+ struct netdev_rx_queue *queue = net->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ kobj->kset = net->queues_kset;
+ error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+ "rx-%u", index);
+ if (error)
+ goto exit;
+
+ if (net->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+ if (error)
+ goto exit;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+ dev_hold(queue->dev);
+
+ return error;
+exit:
+ kobject_put(kobj);
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+#ifndef CONFIG_RPS
+ if (!net->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = old_num; i < new_num; i++) {
+ error = rx_queue_add_kobject(net, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ if (net->sysfs_rx_queue_group)
+ sysfs_remove_group(&net->_rx[i].kobj,
+ net->sysfs_rx_queue_group);
+ kobject_put(&net->_rx[i].kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr, char *buf);
+ ssize_t (*store)(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) container_of(_attr, \
+ struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+ .show = netdev_queue_attr_show,
+ .store = netdev_queue_attr_store,
+};
+
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ unsigned long trans_timeout;
+
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, "%lu", trans_timeout);
+}
+
+static struct netdev_queue_attribute queue_trans_timeout =
+ __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
+ value = DQL_MAX_LIMIT;
+ else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
+
+static ssize_t bql_show_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute =
+ __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
+ bql_set_hold_time);
+
+static ssize_t bql_show_inflight(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute =
+ __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ char *buf) \
{ \
- return sprintf(buf, format_string, iw->field); \
+ return bql_show(buf, queue->dql.FIELD); \
} \
-static ssize_t show_iw_##name(struct class_device *cd, char *buf) \
+ \
+static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ const char *buf, size_t len) \
{ \
- return wireless_show(cd, buf, format_iw_##name); \
+ return bql_set(buf, len, &queue->dql.FIELD); \
} \
-static CLASS_DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL)
-
-WIRELESS_SHOW(status, status, fmt_hex);
-WIRELESS_SHOW(link, qual.qual, fmt_dec);
-WIRELESS_SHOW(level, qual.level, fmt_dec);
-WIRELESS_SHOW(noise, qual.noise, fmt_dec);
-WIRELESS_SHOW(nwid, discard.nwid, fmt_dec);
-WIRELESS_SHOW(crypt, discard.code, fmt_dec);
-WIRELESS_SHOW(fragment, discard.fragment, fmt_dec);
-WIRELESS_SHOW(misc, discard.misc, fmt_dec);
-WIRELESS_SHOW(retries, discard.retries, fmt_dec);
-WIRELESS_SHOW(beacon, miss.beacon, fmt_dec);
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
+ __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
+ bql_set_ ## NAME);
+
+BQL_ATTR(limit, limit)
+BQL_ATTR(limit_max, max_limit)
+BQL_ATTR(limit_min, min_limit)
+
+static struct attribute *dql_attrs[] = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ NULL
+};
-static struct attribute *wireless_attrs[] = {
- &class_device_attr_status.attr,
- &class_device_attr_link.attr,
- &class_device_attr_level.attr,
- &class_device_attr_noise.attr,
- &class_device_attr_nwid.attr,
- &class_device_attr_crypt.attr,
- &class_device_attr_fragment.attr,
- &class_device_attr_retries.attr,
- &class_device_attr_misc.attr,
- &class_device_attr_beacon.attr,
+static struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
+};
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int i;
+
+ i = queue - dev->_tx;
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+
+static ssize_t show_xps_map(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ cpumask_var_t mask;
+ unsigned long index;
+ size_t len = 0;
+ int i;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ for_each_possible_cpu(i) {
+ struct xps_map *map =
+ rcu_dereference(dev_maps->cpu_map[i]);
+ if (map) {
+ int j;
+ for (j = 0; j < map->len; j++) {
+ if (map->queues[j] == index) {
+ cpumask_set_cpu(i, mask);
+ break;
+ }
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+ if (PAGE_SIZE - len < 3) {
+ free_cpumask_var(mask);
+ return -EINVAL;
+ }
+
+ free_cpumask_var(mask);
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
+static ssize_t store_xps_map(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ unsigned long index;
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ err = netif_set_xps_queue(dev, mask, index);
+
+ free_cpumask_var(mask);
+
+ return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute =
+ __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
+
+static struct attribute *netdev_queue_default_attrs[] = {
+ &queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
+ &xps_cpus_attribute.attr,
+#endif
NULL
};
-static struct attribute_group wireless_group = {
- .name = "wireless",
- .attrs = wireless_attrs,
+static void netdev_queue_release(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
+
+static const void *netdev_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static struct kobj_type netdev_queue_ktype = {
+ .sysfs_ops = &netdev_queue_sysfs_ops,
+ .release = netdev_queue_release,
+ .default_attrs = netdev_queue_default_attrs,
+ .namespace = netdev_queue_namespace,
};
+
+static int netdev_queue_add_kobject(struct net_device *net, int index)
+{
+ struct netdev_queue *queue = net->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ kobj->kset = net->queues_kset;
+ error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+ "tx-%u", index);
+ if (error)
+ goto exit;
+
+#ifdef CONFIG_BQL
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto exit;
#endif
-#ifdef CONFIG_HOTPLUG
-static int netdev_uevent(struct class_device *cd, char **envp,
- int num_envp, char *buf, int size)
+ kobject_uevent(kobj, KOBJ_ADD);
+ dev_hold(queue->dev);
+
+ return 0;
+exit:
+ kobject_put(kobj);
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
- struct net_device *dev = to_net_dev(cd);
- int i = 0;
- int n;
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+ for (i = old_num; i < new_num; i++) {
+ error = netdev_queue_add_kobject(net, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
- /* pass interface to uevent. */
- envp[i++] = buf;
- n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1;
- buf += n;
- size -= n;
+ while (--i >= new_num) {
+ struct netdev_queue *queue = net->_tx + i;
+
+#ifdef CONFIG_BQL
+ sysfs_remove_group(&queue->kobj, &dql_group);
+#endif
+ kobject_put(&queue->kobj);
+ }
- if ((size <= 0) || (i >= num_envp))
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
+static int register_queue_kobjects(struct net_device *net)
+{
+ int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ net->queues_kset = kset_create_and_add("queues",
+ NULL, &net->dev.kobj);
+ if (!net->queues_kset)
return -ENOMEM;
+ real_rx = net->real_num_rx_queues;
+#endif
+ real_tx = net->real_num_tx_queues;
+
+ error = net_rx_queue_update_kobjects(net, 0, real_rx);
+ if (error)
+ goto error;
+ rxq = real_rx;
+
+ error = netdev_queue_update_kobjects(net, 0, real_tx);
+ if (error)
+ goto error;
+ txq = real_tx;
- envp[i] = NULL;
return 0;
+
+error:
+ netdev_queue_update_kobjects(net, txq, 0);
+ net_rx_queue_update_kobjects(net, rxq, 0);
+ return error;
+}
+
+static void remove_queue_kobjects(struct net_device *net)
+{
+ int real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ real_rx = net->real_num_rx_queues;
+#endif
+ real_tx = net->real_num_tx_queues;
+
+ net_rx_queue_update_kobjects(net, real_rx, 0);
+ netdev_queue_update_kobjects(net, real_tx, 0);
+#ifdef CONFIG_SYSFS
+ kset_unregister(net->queues_kset);
+#endif
+}
+
+static bool net_current_may_mount(void)
+{
+ struct net *net = current->nsproxy->net_ns;
+
+ return ns_capable(net->user_ns, CAP_SYS_ADMIN);
}
+
+static void *net_grab_current_ns(void)
+{
+ struct net *ns = current->nsproxy->net_ns;
+#ifdef CONFIG_NET_NS
+ if (ns)
+ atomic_inc(&ns->passive);
#endif
+ return ns;
+}
+
+static const void *net_initial_ns(void)
+{
+ return &init_net;
+}
+
+static const void *net_netlink_ns(struct sock *sk)
+{
+ return sock_net(sk);
+}
+
+struct kobj_ns_type_operations net_ns_type_operations = {
+ .type = KOBJ_NS_TYPE_NET,
+ .current_may_mount = net_current_may_mount,
+ .grab_current_ns = net_grab_current_ns,
+ .netlink_ns = net_netlink_ns,
+ .initial_ns = net_initial_ns,
+ .drop_ns = net_drop_ns,
+};
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
+
+static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
+{
+ struct net_device *dev = to_net_dev(d);
+ int retval;
+
+ /* pass interface to uevent. */
+ retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
+ if (retval)
+ goto exit;
+
+ /* pass ifindex to uevent.
+ * ifindex is useful as it won't change (interface name may change)
+ * and is what RtNetlink uses natively. */
+ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
+
+exit:
+ return retval;
+}
/*
- * netdev_release -- destroy and free a dead device.
- * Called when last reference to class_device kobject is gone.
+ * netdev_release -- destroy and free a dead device.
+ * Called when last reference to device kobject is gone.
*/
-static void netdev_release(struct class_device *cd)
+static void netdev_release(struct device *d)
{
- struct net_device *dev
- = container_of(cd, struct net_device, class_dev);
+ struct net_device *dev = to_net_dev(d);
BUG_ON(dev->reg_state != NETREG_RELEASED);
- kfree((char *)dev - dev->padded);
+ kfree(dev->ifalias);
+ netdev_freemem(dev);
+}
+
+static const void *net_namespace(struct device *d)
+{
+ struct net_device *dev;
+ dev = container_of(d, struct net_device, dev);
+ return dev_net(dev);
}
static struct class net_class = {
.name = "net",
- .release = netdev_release,
- .class_dev_attrs = net_class_attributes,
-#ifdef CONFIG_HOTPLUG
- .uevent = netdev_uevent,
-#endif
+ .dev_release = netdev_release,
+ .dev_groups = net_class_groups,
+ .dev_uevent = netdev_uevent,
+ .ns_type = &net_ns_type_operations,
+ .namespace = net_namespace,
};
-void netdev_unregister_sysfs(struct net_device * net)
+/* Delete sysfs entries but hold kobject reference until after all
+ * netdev references are gone.
+ */
+void netdev_unregister_kobject(struct net_device * net)
{
- struct class_device * class_dev = &(net->class_dev);
+ struct device *dev = &(net->dev);
- if (net->get_stats)
- sysfs_remove_group(&class_dev->kobj, &netstat_group);
+ kobject_get(&dev->kobj);
-#ifdef WIRELESS_EXT
- if (net->get_wireless_stats || (net->wireless_handlers &&
- net->wireless_handlers->get_wireless_stats))
- sysfs_remove_group(&class_dev->kobj, &wireless_group);
-#endif
- class_device_del(class_dev);
+ remove_queue_kobjects(net);
+
+ pm_runtime_set_memalloc_noio(dev, false);
+ device_del(dev);
}
/* Create sysfs entries for network device. */
-int netdev_register_sysfs(struct net_device *net)
+int netdev_register_kobject(struct net_device *net)
{
- struct class_device *class_dev = &(net->class_dev);
- int ret;
+ struct device *dev = &(net->dev);
+ const struct attribute_group **groups = net->sysfs_groups;
+ int error = 0;
+
+ device_initialize(dev);
+ dev->class = &net_class;
+ dev->platform_data = net;
+ dev->groups = groups;
+
+ dev_set_name(dev, "%s", net->name);
+
+#ifdef CONFIG_SYSFS
+ /* Allow for a device specific group */
+ if (*groups)
+ groups++;
+
+ *groups++ = &netstat_group;
+
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+ if (net->ieee80211_ptr)
+ *groups++ = &wireless_group;
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ else if (net->wireless_handlers)
+ *groups++ = &wireless_group;
+#endif
+#endif
+#endif /* CONFIG_SYSFS */
- class_dev->class = &net_class;
- class_dev->class_data = net;
+ error = device_add(dev);
+ if (error)
+ return error;
- strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE);
- if ((ret = class_device_register(class_dev)))
- goto out;
+ error = register_queue_kobjects(net);
+ if (error) {
+ device_del(dev);
+ return error;
+ }
- if (net->get_stats &&
- (ret = sysfs_create_group(&class_dev->kobj, &netstat_group)))
- goto out_unreg;
+ pm_runtime_set_memalloc_noio(dev, true);
-#ifdef WIRELESS_EXT
- if (net->get_wireless_stats || (net->wireless_handlers &&
- net->wireless_handlers->get_wireless_stats)) {
- ret = sysfs_create_group(&class_dev->kobj, &wireless_group);
- if (ret)
- goto out_cleanup;
- }
- return 0;
-out_cleanup:
- if (net->get_stats)
- sysfs_remove_group(&class_dev->kobj, &netstat_group);
-#else
- return 0;
-#endif
+ return error;
+}
-out_unreg:
- printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n",
- net->name, ret);
- class_device_unregister(class_dev);
-out:
- return ret;
+int netdev_class_create_file_ns(struct class_attribute *class_attr,
+ const void *ns)
+{
+ return class_create_file_ns(&net_class, class_attr, ns);
+}
+EXPORT_SYMBOL(netdev_class_create_file_ns);
+
+void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+ const void *ns)
+{
+ class_remove_file_ns(&net_class, class_attr, ns);
}
+EXPORT_SYMBOL(netdev_class_remove_file_ns);
-int netdev_sysfs_init(void)
+int __init netdev_kobject_init(void)
{
+ kobj_ns_type_register(&net_ns_type_operations);
return class_register(&net_class);
}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
new file mode 100644
index 00000000000..2745a1b51e0
--- /dev/null
+++ b/net/core/net-sysfs.h
@@ -0,0 +1,11 @@
+#ifndef __NET_SYSFS_H__
+#define __NET_SYSFS_H__
+
+int __init netdev_kobject_init(void);
+int netdev_register_kobject(struct net_device *);
+void netdev_unregister_kobject(struct net_device *);
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+ int old_num, int new_num);
+
+#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 00000000000..ba3c0120786
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,37 @@
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/slab.h>
+
+#include <asm/unaligned.h>
+#include <asm/bitops.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
+#include <trace/events/net.h>
+#include <trace/events/napi.h>
+#include <trace/events/sock.h>
+#include <trace/events/udp.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
new file mode 100644
index 00000000000..85b62691f4f
--- /dev/null
+++ b/net/core/net_namespace.c
@@ -0,0 +1,676 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/rculist.h>
+#include <linux/nsproxy.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/file.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ * Our network namespace constructor/destructor lists
+ */
+
+static LIST_HEAD(pernet_list);
+static struct list_head *first_device = &pernet_list;
+DEFINE_MUTEX(net_mutex);
+
+LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
+
+struct net init_net = {
+ .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+};
+EXPORT_SYMBOL(init_net);
+
+#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
+
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+
+static struct net_generic *net_alloc_generic(void)
+{
+ struct net_generic *ng;
+ size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->len = max_gen_ptrs;
+
+ return ng;
+}
+
+static int net_assign_generic(struct net *net, int id, void *data)
+{
+ struct net_generic *ng, *old_ng;
+
+ BUG_ON(!mutex_is_locked(&net_mutex));
+ BUG_ON(id == 0);
+
+ old_ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&net_mutex));
+ ng = old_ng;
+ if (old_ng->len >= id)
+ goto assign;
+
+ ng = net_alloc_generic();
+ if (ng == NULL)
+ return -ENOMEM;
+
+ /*
+ * Some synchronisation notes:
+ *
+ * The net_generic explores the net->gen array inside rcu
+ * read section. Besides once set the net->gen->ptr[x]
+ * pointer never changes (see rules in netns/generic.h).
+ *
+ * That said, we simply duplicate this array and schedule
+ * the old copy for kfree after a grace period.
+ */
+
+ memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+
+ rcu_assign_pointer(net->gen, ng);
+ kfree_rcu(old_ng, rcu);
+assign:
+ ng->ptr[id - 1] = data;
+ return 0;
+}
+
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+ int err = -ENOMEM;
+ void *data = NULL;
+
+ if (ops->id && ops->size) {
+ data = kzalloc(ops->size, GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ err = net_assign_generic(net, *ops->id, data);
+ if (err)
+ goto cleanup;
+ }
+ err = 0;
+ if (ops->init)
+ err = ops->init(net);
+ if (!err)
+ return 0;
+
+cleanup:
+ kfree(data);
+
+out:
+ return err;
+}
+
+static void ops_free(const struct pernet_operations *ops, struct net *net)
+{
+ if (ops->id && ops->size) {
+ int id = *ops->id;
+ kfree(net_generic(net, id));
+ }
+}
+
+static void ops_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->exit(net);
+ }
+ if (ops->exit_batch)
+ ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->size && ops->id) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops_free(ops, net);
+ }
+}
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ */
+static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+{
+ /* Must be called with net_mutex held */
+ const struct pernet_operations *ops, *saved_ops;
+ int error = 0;
+ LIST_HEAD(net_exit_list);
+
+ atomic_set(&net->count, 1);
+ atomic_set(&net->passive, 1);
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+
+#ifdef NETNS_REFCNT_DEBUG
+ atomic_set(&net->use_count, 0);
+#endif
+
+ list_for_each_entry(ops, &pernet_list, list) {
+ error = ops_init(ops, net);
+ if (error < 0)
+ goto out_undo;
+ }
+out:
+ return error;
+
+out_undo:
+ /* Walk through the list backwards calling the exit functions
+ * for the pernet modules whose init functions did not fail.
+ */
+ list_add(&net->exit_list, &net_exit_list);
+ saved_ops = ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
+
+ rcu_barrier();
+ goto out;
+}
+
+
+#ifdef CONFIG_NET_NS
+static struct kmem_cache *net_cachep;
+static struct workqueue_struct *netns_wq;
+
+static struct net *net_alloc(void)
+{
+ struct net *net = NULL;
+ struct net_generic *ng;
+
+ ng = net_alloc_generic();
+ if (!ng)
+ goto out;
+
+ net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ if (!net)
+ goto out_free;
+
+ rcu_assign_pointer(net->gen, ng);
+out:
+ return net;
+
+out_free:
+ kfree(ng);
+ goto out;
+}
+
+static void net_free(struct net *net)
+{
+#ifdef NETNS_REFCNT_DEBUG
+ if (unlikely(atomic_read(&net->use_count) != 0)) {
+ pr_emerg("network namespace not free! Usage: %d\n",
+ atomic_read(&net->use_count));
+ return;
+ }
+#endif
+ kfree(net->gen);
+ kmem_cache_free(net_cachep, net);
+}
+
+void net_drop_ns(void *p)
+{
+ struct net *ns = p;
+ if (ns && atomic_dec_and_test(&ns->passive))
+ net_free(ns);
+}
+
+struct net *copy_net_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct net *old_net)
+{
+ struct net *net;
+ int rv;
+
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+
+ net = net_alloc();
+ if (!net)
+ return ERR_PTR(-ENOMEM);
+
+ get_user_ns(user_ns);
+
+ mutex_lock(&net_mutex);
+ rv = setup_net(net, user_ns);
+ if (rv == 0) {
+ rtnl_lock();
+ list_add_tail_rcu(&net->list, &net_namespace_list);
+ rtnl_unlock();
+ }
+ mutex_unlock(&net_mutex);
+ if (rv < 0) {
+ put_user_ns(user_ns);
+ net_drop_ns(net);
+ return ERR_PTR(rv);
+ }
+ return net;
+}
+
+static DEFINE_SPINLOCK(cleanup_list_lock);
+static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
+
+static void cleanup_net(struct work_struct *work)
+{
+ const struct pernet_operations *ops;
+ struct net *net, *tmp;
+ struct list_head net_kill_list;
+ LIST_HEAD(net_exit_list);
+
+ /* Atomically snapshot the list of namespaces to cleanup */
+ spin_lock_irq(&cleanup_list_lock);
+ list_replace_init(&cleanup_list, &net_kill_list);
+ spin_unlock_irq(&cleanup_list_lock);
+
+ mutex_lock(&net_mutex);
+
+ /* Don't let anyone else find us. */
+ rtnl_lock();
+ list_for_each_entry(net, &net_kill_list, cleanup_list) {
+ list_del_rcu(&net->list);
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+ rtnl_unlock();
+
+ /*
+ * Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so
+ * the rcu_barrier() below isn't sufficient alone.
+ */
+ synchronize_rcu();
+
+ /* Run all of the network namespace exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
+ /* Free the net generic variables */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
+
+ mutex_unlock(&net_mutex);
+
+ /* Ensure there are no outstanding rcu callbacks using this
+ * network namespace.
+ */
+ rcu_barrier();
+
+ /* Finally it is safe to free my network namespace structure */
+ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+ list_del_init(&net->exit_list);
+ put_user_ns(net->user_ns);
+ net_drop_ns(net);
+ }
+}
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
+
+void __put_net(struct net *net)
+{
+ /* Cleanup the network namespace in process context */
+ unsigned long flags;
+
+ spin_lock_irqsave(&cleanup_list_lock, flags);
+ list_add(&net->cleanup_list, &cleanup_list);
+ spin_unlock_irqrestore(&cleanup_list_lock, flags);
+
+ queue_work(netns_wq, &net_cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+struct net *get_net_ns_by_fd(int fd)
+{
+ struct proc_ns *ei;
+ struct file *file;
+ struct net *net;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ ei = get_proc_ns(file_inode(file));
+ if (ei->ns_ops == &netns_operations)
+ net = get_net(ei->ns);
+ else
+ net = ERR_PTR(-EINVAL);
+
+ fput(file);
+ return net;
+}
+
+#else
+struct net *get_net_ns_by_fd(int fd)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif
+
+struct net *get_net_ns_by_pid(pid_t pid)
+{
+ struct task_struct *tsk;
+ struct net *net;
+
+ /* Lookup the network namespace */
+ net = ERR_PTR(-ESRCH);
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk) {
+ struct nsproxy *nsproxy;
+ nsproxy = task_nsproxy(tsk);
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ }
+ rcu_read_unlock();
+ return net;
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
+
+static __net_init int net_ns_net_init(struct net *net)
+{
+ return proc_alloc_inum(&net->proc_inum);
+}
+
+static __net_exit void net_ns_net_exit(struct net *net)
+{
+ proc_free_inum(net->proc_inum);
+}
+
+static struct pernet_operations __net_initdata net_ns_ops = {
+ .init = net_ns_net_init,
+ .exit = net_ns_net_exit,
+};
+
+static int __init net_ns_init(void)
+{
+ struct net_generic *ng;
+
+#ifdef CONFIG_NET_NS
+ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+ SMP_CACHE_BYTES,
+ SLAB_PANIC, NULL);
+
+ /* Create workqueue for cleanup */
+ netns_wq = create_singlethread_workqueue("netns");
+ if (!netns_wq)
+ panic("Could not create netns workq");
+#endif
+
+ ng = net_alloc_generic();
+ if (!ng)
+ panic("Could not allocate generic netns");
+
+ rcu_assign_pointer(init_net.gen, ng);
+
+ mutex_lock(&net_mutex);
+ if (setup_net(&init_net, &init_user_ns))
+ panic("Could not setup the initial network namespace");
+
+ rtnl_lock();
+ list_add_tail_rcu(&init_net.list, &net_namespace_list);
+ rtnl_unlock();
+
+ mutex_unlock(&net_mutex);
+
+ register_pernet_subsys(&net_ns_ops);
+
+ return 0;
+}
+
+pure_initcall(net_ns_init);
+
+#ifdef CONFIG_NET_NS
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ struct net *net;
+ int error;
+ LIST_HEAD(net_exit_list);
+
+ list_add_tail(&ops->list, list);
+ if (ops->init || (ops->id && ops->size)) {
+ for_each_net(net) {
+ error = ops_init(ops, net);
+ if (error)
+ goto out_undo;
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+ }
+ return 0;
+
+out_undo:
+ /* If I have an error cleanup all namespaces I initialized */
+ list_del(&ops->list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+ return error;
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ struct net *net;
+ LIST_HEAD(net_exit_list);
+
+ list_del(&ops->list);
+ for_each_net(net)
+ list_add_tail(&net->exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+}
+
+#else
+
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ return ops_init(ops, &init_net);
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ LIST_HEAD(net_exit_list);
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+}
+
+#endif /* CONFIG_NET_NS */
+
+static DEFINE_IDA(net_generic_ids);
+
+static int register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ int error;
+
+ if (ops->id) {
+again:
+ error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ if (error < 0) {
+ if (error == -EAGAIN) {
+ ida_pre_get(&net_generic_ids, GFP_KERNEL);
+ goto again;
+ }
+ return error;
+ }
+ max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+ }
+ error = __register_pernet_operations(list, ops);
+ if (error) {
+ rcu_barrier();
+ if (ops->id)
+ ida_remove(&net_generic_ids, *ops->id);
+ }
+
+ return error;
+}
+
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+
+ __unregister_pernet_operations(ops);
+ rcu_barrier();
+ if (ops->id)
+ ida_remove(&net_generic_ids, *ops->id);
+}
+
+/**
+ * register_pernet_subsys - register a network namespace subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a subsystem which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. Allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+ int error;
+ mutex_lock(&net_mutex);
+ error = register_pernet_operations(first_device, ops);
+ mutex_unlock(&net_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ * unregister_pernet_subsys - unregister a network namespace subsystem
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_subsys(struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ unregister_pernet_operations(ops);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
+
+/**
+ * register_pernet_device - register a network namespace device
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a device which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. Allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_device(struct pernet_operations *ops)
+{
+ int error;
+ mutex_lock(&net_mutex);
+ error = register_pernet_operations(&pernet_list, ops);
+ if (!error && (first_device == &pernet_list))
+ first_device = &ops->list;
+ mutex_unlock(&net_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_device);
+
+/**
+ * unregister_pernet_device - unregister a network namespace netdevice
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_device(struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ if (&ops->list == first_device)
+ first_device = first_device->next;
+ unregister_pernet_operations(ops);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+static void *netns_get(struct task_struct *task)
+{
+ struct net *net = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ rcu_read_unlock();
+
+ return net;
+}
+
+static void netns_put(void *ns)
+{
+ put_net(ns);
+}
+
+static int netns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct net *net = ns;
+
+ if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ put_net(nsproxy->net_ns);
+ nsproxy->net_ns = get_net(net);
+ return 0;
+}
+
+static unsigned int netns_inum(void *ns)
+{
+ struct net *net = ns;
+ return net->proc_inum;
+}
+
+const struct proc_ns_operations netns_operations = {
+ .name = "net",
+ .type = CLONE_NEWNET,
+ .get = netns_get,
+ .put = netns_put,
+ .install = netns_install,
+ .inum = netns_inum,
+};
+#endif
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
new file mode 100644
index 00000000000..30d903b19c6
--- /dev/null
+++ b/net/core/netclassid_cgroup.c
@@ -0,0 +1,111 @@
+/*
+ * net/core/netclassid_cgroup.c Classid Cgroupfs Handling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fdtable.h>
+#include <net/cls_cgroup.h>
+#include <net/sock.h>
+
+static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
+}
+
+struct cgroup_cls_state *task_cls_state(struct task_struct *p)
+{
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
+}
+EXPORT_SYMBOL_GPL(task_cls_state);
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_cls_state *cs;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ return &cs->css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
+
+ if (parent)
+ cs->classid = parent->classid;
+
+ return 0;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_cls_state(css));
+}
+
+static int update_classid(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+
+ if (sock)
+ sock->sk->sk_classid = (u32)(unsigned long)v;
+
+ return 0;
+}
+
+static void cgrp_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ void *v = (void *)(unsigned long)cs->classid;
+ struct task_struct *p;
+
+ cgroup_taskset_for_each(p, tset) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_classid, v);
+ task_unlock(p);
+ }
+}
+
+static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css_cls_state(css)->classid;
+}
+
+static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 value)
+{
+ css_cls_state(css)->classid = (u32) value;
+
+ return 0;
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "classid",
+ .read_u64 = read_classid,
+ .write_u64 = write_classid,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_cls_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = cgrp_attach,
+ .base_cftypes = ss_files,
+};
diff --git a/net/core/netevent.c b/net/core/netevent.c
new file mode 100644
index 00000000000..f17ccd291d3
--- /dev/null
+++ b/net/core/netevent.c
@@ -0,0 +1,70 @@
+/*
+ * Network event notifiers
+ *
+ * Authors:
+ * Tom Tucker <tom@opengridcomputing.com>
+ * Steve Wise <swise@opengridcomputing.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <net/netevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
+
+/**
+ * register_netevent_notifier - register a netevent notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when a netevent occurs.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ */
+int register_netevent_notifier(struct notifier_block *nb)
+{
+ int err;
+
+ err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
+
+/**
+ * netevent_unregister_notifier - unregister a netevent notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_neigh_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ */
+
+int unregister_netevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
+
+/**
+ * call_netevent_notifiers - call all netevent notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @v: pointer passed unmodified to notifier function
+ *
+ * Call all neighbour notifier blocks. Parameters and return value
+ * are as for notifier_call_chain().
+ */
+
+int call_netevent_notifiers(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
+}
+EXPORT_SYMBOL_GPL(call_netevent_notifiers);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ea51f8d02eb..e33937fb32a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -9,7 +9,10 @@
* Copyright (C) 2002 Red Hat, Inc.
*/
-#include <linux/smp_lock.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
@@ -22,9 +25,16 @@
#include <linux/delay.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/if_vlan.h>
#include <net/tcp.h>
#include <net/udp.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#include <net/ip6_checksum.h>
#include <asm/unaligned.h>
+#include <trace/events/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -33,89 +43,95 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
-#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
-#define MAX_RETRIES 20000
-
-static DEFINE_SPINLOCK(skb_list_lock);
-static int nr_skbs;
-static struct sk_buff *skbs;
-static DEFINE_SPINLOCK(queue_lock);
-static int queue_depth;
-static struct sk_buff *queue_head, *queue_tail;
+static struct sk_buff_head skb_pool;
-static atomic_t trapped;
+DEFINE_STATIC_SRCU(netpoll_srcu);
-#define NETPOLL_RX_ENABLED 1
-#define NETPOLL_RX_DROP 2
+#define USEC_PER_POLL 50
-#define MAX_SKB_SIZE \
- (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
- sizeof(struct iphdr) + sizeof(struct ethhdr))
+#define MAX_SKB_SIZE \
+ (sizeof(struct ethhdr) + \
+ sizeof(struct iphdr) + \
+ sizeof(struct udphdr) + \
+ MAX_UDP_CHUNK)
static void zap_completion_queue(void);
+static void netpoll_async_cleanup(struct work_struct *work);
-static void queue_process(void *p)
-{
- unsigned long flags;
- struct sk_buff *skb;
-
- while (queue_head) {
- spin_lock_irqsave(&queue_lock, flags);
-
- skb = queue_head;
- queue_head = skb->next;
- if (skb == queue_tail)
- queue_head = NULL;
-
- queue_depth--;
+static unsigned int carrier_timeout = 4;
+module_param(carrier_timeout, uint, 0644);
- spin_unlock_irqrestore(&queue_lock, flags);
+#define np_info(np, fmt, ...) \
+ pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_err(np, fmt, ...) \
+ pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_notice(np, fmt, ...) \
+ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
- dev_queue_xmit(skb);
- }
-}
-
-static DECLARE_WORK(send_queue, queue_process, NULL);
-
-void netpoll_queue(struct sk_buff *skb)
+static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
{
- unsigned long flags;
-
- if (queue_depth == MAX_QUEUE_DEPTH) {
- __kfree_skb(skb);
- return;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ skb->vlan_tci = 0;
}
- spin_lock_irqsave(&queue_lock, flags);
- if (!queue_head)
- queue_head = skb;
- else
- queue_tail->next = skb;
- queue_tail = skb;
- queue_depth++;
- spin_unlock_irqrestore(&queue_lock, flags);
+ status = ops->ndo_start_xmit(skb, dev);
+ if (status == NETDEV_TX_OK)
+ txq_trans_update(txq);
- schedule_work(&send_queue);
+out:
+ return status;
}
-static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
- unsigned short ulen, u32 saddr, u32 daddr)
+static void queue_process(struct work_struct *work)
{
- unsigned int psum;
-
- if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
- return 0;
+ struct netpoll_info *npinfo =
+ container_of(work, struct netpoll_info, tx_work.work);
+ struct sk_buff *skb;
+ unsigned long flags;
- psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ while ((skb = skb_dequeue(&npinfo->txq))) {
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
- if (skb->ip_summed == CHECKSUM_HW &&
- !(u16)csum_fold(csum_add(psum, skb->csum)))
- return 0;
+ if (!netif_device_present(dev) || !netif_running(dev)) {
+ kfree_skb(skb);
+ continue;
+ }
- skb->csum = psum;
+ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- return __skb_checksum_complete(skb);
+ local_irq_save(flags);
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (netif_xmit_frozen_or_stopped(txq) ||
+ netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
+ skb_queue_head(&npinfo->txq, skb);
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
+
+ schedule_delayed_work(&npinfo->tx_work, HZ/10);
+ return;
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
+ }
}
/*
@@ -128,60 +144,114 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
* trylock here and interrupts are already disabled in the softirq
* case. Further, we test the poll_owner to avoid recursion on UP
* systems where the lock doesn't exist.
- *
- * In cases where there is bi-directional communications, reading only
- * one message at a time can lead to packets being dropped by the
- * network adapter, forcing superfluous retries and possibly timeouts.
- * Thus, we set our budget to greater than 1.
*/
-static void poll_napi(struct netpoll *np)
+static int poll_one_napi(struct napi_struct *napi, int budget)
{
- struct netpoll_info *npinfo = np->dev->npinfo;
- int budget = 16;
+ int work;
- if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
- npinfo->poll_owner != smp_processor_id() &&
- spin_trylock(&npinfo->poll_lock)) {
- npinfo->rx_flags |= NETPOLL_RX_DROP;
- atomic_inc(&trapped);
+ /* net_rx_action's ->poll() invocations and our's are
+ * synchronized by this test which is only made while
+ * holding the napi->poll_lock.
+ */
+ if (!test_bit(NAPI_STATE_SCHED, &napi->state))
+ return budget;
- np->dev->poll(np->dev, &budget);
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
- atomic_dec(&trapped);
- npinfo->rx_flags &= ~NETPOLL_RX_DROP;
- spin_unlock(&npinfo->poll_lock);
+ work = napi->poll(napi, budget);
+ WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
+ trace_napi_poll(napi);
+
+ clear_bit(NAPI_STATE_NPSVC, &napi->state);
+
+ return budget - work;
+}
+
+static void poll_napi(struct net_device *dev, int budget)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (napi->poll_owner != smp_processor_id() &&
+ spin_trylock(&napi->poll_lock)) {
+ budget = poll_one_napi(napi, budget);
+ spin_unlock(&napi->poll_lock);
+ }
}
}
-void netpoll_poll(struct netpoll *np)
+static void netpoll_poll_dev(struct net_device *dev)
{
- if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
+ const struct net_device_ops *ops;
+ struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ int budget = 0;
+
+ /* Don't do any rx activity if the dev_lock mutex is held
+ * the dev_open/close paths use this to block netpoll activity
+ * while changing device state
+ */
+ if (down_trylock(&ni->dev_lock))
+ return;
+
+ if (!netif_running(dev)) {
+ up(&ni->dev_lock);
return;
+ }
+
+ ops = dev->netdev_ops;
+ if (!ops->ndo_poll_controller) {
+ up(&ni->dev_lock);
+ return;
+ }
/* Process pending work on NIC */
- np->dev->poll_controller(np->dev);
- if (np->dev->poll)
- poll_napi(np);
+ ops->ndo_poll_controller(dev);
+
+ poll_napi(dev, budget);
+
+ up(&ni->dev_lock);
zap_completion_queue();
}
+void netpoll_poll_disable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ int idx;
+ might_sleep();
+ idx = srcu_read_lock(&netpoll_srcu);
+ ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ if (ni)
+ down(&ni->dev_lock);
+ srcu_read_unlock(&netpoll_srcu, idx);
+}
+EXPORT_SYMBOL(netpoll_poll_disable);
+
+void netpoll_poll_enable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ rcu_read_lock();
+ ni = rcu_dereference(dev->npinfo);
+ if (ni)
+ up(&ni->dev_lock);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netpoll_poll_enable);
+
static void refill_skbs(void)
{
struct sk_buff *skb;
unsigned long flags;
- spin_lock_irqsave(&skb_list_lock, flags);
- while (nr_skbs < MAX_SKBS) {
+ spin_lock_irqsave(&skb_pool.lock, flags);
+ while (skb_pool.qlen < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
- skb->next = skbs;
- skbs = skb;
- nr_skbs++;
+ __skb_queue_tail(&skb_pool, skb);
}
- spin_unlock_irqrestore(&skb_list_lock, flags);
+ spin_unlock_irqrestore(&skb_pool.lock, flags);
}
static void zap_completion_queue(void)
@@ -200,48 +270,37 @@ static void zap_completion_queue(void)
while (clist != NULL) {
struct sk_buff *skb = clist;
clist = clist->next;
- if(skb->destructor)
+ if (!skb_irq_freeable(skb)) {
+ atomic_inc(&skb->users);
dev_kfree_skb_any(skb); /* put this one back */
- else
+ } else {
__kfree_skb(skb);
+ }
}
}
put_cpu_var(softnet_data);
}
-static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve)
+static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
{
- int once = 1, count = 0;
- unsigned long flags;
- struct sk_buff *skb = NULL;
+ int count = 0;
+ struct sk_buff *skb;
zap_completion_queue();
+ refill_skbs();
repeat:
- if (nr_skbs < MAX_SKBS)
- refill_skbs();
skb = alloc_skb(len, GFP_ATOMIC);
+ if (!skb)
+ skb = skb_dequeue(&skb_pool);
if (!skb) {
- spin_lock_irqsave(&skb_list_lock, flags);
- skb = skbs;
- if (skb) {
- skbs = skb->next;
- skb->next = NULL;
- nr_skbs--;
+ if (++count < 10) {
+ netpoll_poll_dev(np->dev);
+ goto repeat;
}
- spin_unlock_irqrestore(&skb_list_lock, flags);
- }
-
- if(!skb) {
- count++;
- if (once && (count == 1000000)) {
- printk("out of netpoll skbs!\n");
- once = 0;
- }
- netpoll_poll(np);
- goto repeat;
+ return NULL;
}
atomic_set(&skb->users, 1);
@@ -249,445 +308,394 @@ repeat:
return skb;
}
-static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+static int netpoll_owner_active(struct net_device *dev)
{
- int status;
- struct netpoll_info *npinfo;
+ struct napi_struct *napi;
- if (!np || !np->dev || !netif_running(np->dev)) {
- __kfree_skb(skb);
- return;
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (napi->poll_owner == smp_processor_id())
+ return 1;
}
+ return 0;
+}
- npinfo = np->dev->npinfo;
+/* call with IRQ disabled */
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int status = NETDEV_TX_BUSY;
+ unsigned long tries;
+ /* It is up to the caller to keep npinfo alive. */
+ struct netpoll_info *npinfo;
- /* avoid recursion */
- if (npinfo->poll_owner == smp_processor_id() ||
- np->dev->xmit_lock_owner == smp_processor_id()) {
- if (np->drop)
- np->drop(skb);
- else
- __kfree_skb(skb);
+ WARN_ON_ONCE(!irqs_disabled());
+
+ npinfo = rcu_dereference_bh(np->dev->npinfo);
+ if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
+ dev_kfree_skb_irq(skb);
return;
}
- do {
- npinfo->tries--;
- spin_lock(&np->dev->xmit_lock);
- np->dev->xmit_lock_owner = smp_processor_id();
+ /* don't get messages out of order, and no recursion */
+ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
+ struct netdev_queue *txq;
- /*
- * network drivers do not expect to be called if the queue is
- * stopped.
- */
- if (netif_queue_stopped(np->dev)) {
- np->dev->xmit_lock_owner = -1;
- spin_unlock(&np->dev->xmit_lock);
- netpoll_poll(np);
- udelay(50);
- continue;
- }
+ txq = netdev_pick_tx(dev, skb, NULL);
- status = np->dev->hard_start_xmit(skb, np->dev);
- np->dev->xmit_lock_owner = -1;
- spin_unlock(&np->dev->xmit_lock);
+ /* try until next clock tick */
+ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
+ tries > 0; --tries) {
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
- /* success */
- if(!status) {
- npinfo->tries = MAX_RETRIES; /* reset */
- return;
+ HARD_TX_UNLOCK(dev, txq);
+
+ if (status == NETDEV_TX_OK)
+ break;
+
+ }
+
+ /* tickle device maybe there is some cleanup */
+ netpoll_poll_dev(np->dev);
+
+ udelay(USEC_PER_POLL);
}
- /* transmit busy */
- netpoll_poll(np);
- udelay(50);
- } while (npinfo->tries > 0);
+ WARN_ONCE(!irqs_disabled(),
+ "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
+ dev->name, dev->netdev_ops->ndo_start_xmit);
+
+ }
+
+ if (status != NETDEV_TX_OK) {
+ skb_queue_tail(&npinfo->txq, skb);
+ schedule_delayed_work(&npinfo->tx_work,0);
+ }
}
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
- int total_len, eth_len, ip_len, udp_len;
+ int total_len, ip_len, udp_len;
struct sk_buff *skb;
struct udphdr *udph;
struct iphdr *iph;
struct ethhdr *eth;
+ static atomic_t ip_ident;
+ struct ipv6hdr *ip6h;
udp_len = len + sizeof(*udph);
- ip_len = eth_len = udp_len + sizeof(*iph);
- total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
+ if (np->ipv6)
+ ip_len = udp_len + sizeof(*ip6h);
+ else
+ ip_len = udp_len + sizeof(*iph);
- skb = find_skb(np, total_len, total_len - len);
+ total_len = ip_len + LL_RESERVED_SPACE(np->dev);
+
+ skb = find_skb(np, total_len + np->dev->needed_tailroom,
+ total_len - len);
if (!skb)
return;
- memcpy(skb->data, msg, len);
- skb->len += len;
+ skb_copy_to_linear_data(skb, msg, len);
+ skb_put(skb, len);
- udph = (struct udphdr *) skb_push(skb, sizeof(*udph));
+ skb_push(skb, sizeof(*udph));
+ skb_reset_transport_header(skb);
+ udph = udp_hdr(skb);
udph->source = htons(np->local_port);
udph->dest = htons(np->remote_port);
udph->len = htons(udp_len);
- udph->check = 0;
-
- iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
- /* iph->version = 4; iph->ihl = 5; */
- put_unaligned(0x45, (unsigned char *)iph);
- iph->tos = 0;
- put_unaligned(htons(ip_len), &(iph->tot_len));
- iph->id = 0;
- iph->frag_off = 0;
- iph->ttl = 64;
- iph->protocol = IPPROTO_UDP;
- iph->check = 0;
- put_unaligned(htonl(np->local_ip), &(iph->saddr));
- put_unaligned(htonl(np->remote_ip), &(iph->daddr));
- iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-
- eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ if (np->ipv6) {
+ udph->check = 0;
+ udph->check = csum_ipv6_magic(&np->local_ip.in6,
+ &np->remote_ip.in6,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+
+ /* ip6h->version = 6; ip6h->priority = 0; */
+ put_unaligned(0x60, (unsigned char *)ip6h);
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+
+ ip6h->payload_len = htons(sizeof(struct udphdr) + len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 32;
+ ip6h->saddr = np->local_ip.in6;
+ ip6h->daddr = np->remote_ip.in6;
+
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
+ } else {
+ udph->check = 0;
+ udph->check = csum_tcpudp_magic(np->local_ip.ip,
+ np->remote_ip.ip,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ /* iph->version = 4; iph->ihl = 5; */
+ put_unaligned(0x45, (unsigned char *)iph);
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &(iph->tot_len));
+ iph->id = htons(atomic_inc_return(&ip_ident));
+ iph->frag_off = 0;
+ iph->ttl = 64;
+ iph->protocol = IPPROTO_UDP;
+ iph->check = 0;
+ put_unaligned(np->local_ip.ip, &(iph->saddr));
+ put_unaligned(np->remote_ip.ip, &(iph->daddr));
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IP);
+ }
- eth->h_proto = htons(ETH_P_IP);
- memcpy(eth->h_source, np->local_mac, 6);
- memcpy(eth->h_dest, np->remote_mac, 6);
+ ether_addr_copy(eth->h_source, np->dev->dev_addr);
+ ether_addr_copy(eth->h_dest, np->remote_mac);
skb->dev = np->dev;
netpoll_send_skb(np, skb);
}
+EXPORT_SYMBOL(netpoll_send_udp);
-static void arp_reply(struct sk_buff *skb)
+void netpoll_print_options(struct netpoll *np)
{
- struct netpoll_info *npinfo = skb->dev->npinfo;
- struct arphdr *arp;
- unsigned char *arp_ptr;
- int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
- u32 sip, tip;
- struct sk_buff *send_skb;
- struct netpoll *np = NULL;
-
- if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
- np = npinfo->rx_np;
- if (!np)
- return;
-
- /* No arp on this interface */
- if (skb->dev->flags & IFF_NOARP)
- return;
-
- if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
- (2 * skb->dev->addr_len) +
- (2 * sizeof(u32)))))
- return;
-
- skb->h.raw = skb->nh.raw = skb->data;
- arp = skb->nh.arph;
-
- if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
- arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
- arp->ar_pro != htons(ETH_P_IP) ||
- arp->ar_op != htons(ARPOP_REQUEST))
- return;
-
- arp_ptr = (unsigned char *)(arp+1) + skb->dev->addr_len;
- memcpy(&sip, arp_ptr, 4);
- arp_ptr += 4 + skb->dev->addr_len;
- memcpy(&tip, arp_ptr, 4);
-
- /* Should we ignore arp? */
- if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip))
- return;
-
- size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4);
- send_skb = find_skb(np, size + LL_RESERVED_SPACE(np->dev),
- LL_RESERVED_SPACE(np->dev));
-
- if (!send_skb)
- return;
-
- send_skb->nh.raw = send_skb->data;
- arp = (struct arphdr *) skb_put(send_skb, size);
- send_skb->dev = skb->dev;
- send_skb->protocol = htons(ETH_P_ARP);
-
- /* Fill the device header for the ARP frame */
-
- if (np->dev->hard_header &&
- np->dev->hard_header(send_skb, skb->dev, ptype,
- np->remote_mac, np->local_mac,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- return;
- }
-
- /*
- * Fill out the arp protocol part.
- *
- * we only support ethernet device type,
- * which (according to RFC 1390) should always equal 1 (Ethernet).
- */
-
- arp->ar_hrd = htons(np->dev->type);
- arp->ar_pro = htons(ETH_P_IP);
- arp->ar_hln = np->dev->addr_len;
- arp->ar_pln = 4;
- arp->ar_op = htons(type);
-
- arp_ptr=(unsigned char *)(arp + 1);
- memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &tip, 4);
- arp_ptr += 4;
- memcpy(arp_ptr, np->remote_mac, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &sip, 4);
-
- netpoll_send_skb(np, send_skb);
+ np_info(np, "local port %d\n", np->local_port);
+ if (np->ipv6)
+ np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
+ else
+ np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
+ np_info(np, "interface '%s'\n", np->dev_name);
+ np_info(np, "remote port %d\n", np->remote_port);
+ if (np->ipv6)
+ np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
+ else
+ np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
+ np_info(np, "remote ethernet address %pM\n", np->remote_mac);
}
+EXPORT_SYMBOL(netpoll_print_options);
-int __netpoll_rx(struct sk_buff *skb)
+static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
{
- int proto, len, ulen;
- struct iphdr *iph;
- struct udphdr *uh;
- struct netpoll *np = skb->dev->npinfo->rx_np;
+ const char *end;
- if (!np)
- goto out;
- if (skb->dev->type != ARPHRD_ETHER)
- goto out;
-
- /* check if netpoll clients need ARP */
- if (skb->protocol == __constant_htons(ETH_P_ARP) &&
- atomic_read(&trapped)) {
- arp_reply(skb);
- return 1;
+ if (!strchr(str, ':') &&
+ in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
+ if (!*end)
+ return 0;
}
-
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto != ETH_P_IP)
- goto out;
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto out;
- if (skb_shared(skb))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
- if (iph->ihl < 5 || iph->version != 4)
- goto out;
- if (!pskb_may_pull(skb, iph->ihl*4))
- goto out;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto out;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < iph->ihl*4)
- goto out;
-
- if (iph->protocol != IPPROTO_UDP)
- goto out;
-
- len -= iph->ihl*4;
- uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
- ulen = ntohs(uh->len);
-
- if (ulen != len)
- goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
- goto out;
- if (np->local_ip && np->local_ip != ntohl(iph->daddr))
- goto out;
- if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
- goto out;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- goto out;
-
- np->rx_hook(np, ntohs(uh->source),
- (char *)(uh+1),
- ulen - sizeof(struct udphdr));
-
- kfree_skb(skb);
- return 1;
-
-out:
- if (atomic_read(&trapped)) {
- kfree_skb(skb);
- return 1;
+ if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!*end)
+ return 1;
+#else
+ return -1;
+#endif
}
-
- return 0;
+ return -1;
}
int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
+ int ipv6;
+ bool ipversion_set = false;
- if(*cur != '@') {
+ if (*cur != '@') {
if ((delim = strchr(cur, '@')) == NULL)
goto parse_failed;
- *delim=0;
- np->local_port=simple_strtol(cur, NULL, 10);
- cur=delim;
+ *delim = 0;
+ if (kstrtou16(cur, 10, &np->local_port))
+ goto parse_failed;
+ cur = delim;
}
cur++;
- printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port);
- if(*cur != '/') {
+ if (*cur != '/') {
+ ipversion_set = true;
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
- *delim=0;
- np->local_ip=ntohl(in_aton(cur));
- cur=delim;
-
- printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
- np->name, HIPQUAD(np->local_ip));
+ *delim = 0;
+ ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
+ cur = delim;
}
cur++;
- if ( *cur != ',') {
+ if (*cur != ',') {
/* parse out dev name */
if ((delim = strchr(cur, ',')) == NULL)
goto parse_failed;
- *delim=0;
+ *delim = 0;
strlcpy(np->dev_name, cur, sizeof(np->dev_name));
- cur=delim;
+ cur = delim;
}
cur++;
- printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name);
-
- if ( *cur != '@' ) {
+ if (*cur != '@') {
/* dst port */
if ((delim = strchr(cur, '@')) == NULL)
goto parse_failed;
- *delim=0;
- np->remote_port=simple_strtol(cur, NULL, 10);
- cur=delim;
+ *delim = 0;
+ if (*cur == ' ' || *cur == '\t')
+ np_info(np, "warning: whitespace is not allowed\n");
+ if (kstrtou16(cur, 10, &np->remote_port))
+ goto parse_failed;
+ cur = delim;
}
cur++;
- printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port);
/* dst ip */
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
- *delim=0;
- np->remote_ip=ntohl(in_aton(cur));
- cur=delim+1;
-
- printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n",
- np->name, HIPQUAD(np->remote_ip));
+ *delim = 0;
+ ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else if (ipversion_set && np->ipv6 != (bool)ipv6)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
+ cur = delim + 1;
- if( *cur != 0 )
- {
+ if (*cur != 0) {
/* MAC address */
- if ((delim = strchr(cur, ':')) == NULL)
+ if (!mac_pton(cur, np->remote_mac))
goto parse_failed;
- *delim=0;
- np->remote_mac[0]=simple_strtol(cur, NULL, 16);
- cur=delim+1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim=0;
- np->remote_mac[1]=simple_strtol(cur, NULL, 16);
- cur=delim+1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim=0;
- np->remote_mac[2]=simple_strtol(cur, NULL, 16);
- cur=delim+1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim=0;
- np->remote_mac[3]=simple_strtol(cur, NULL, 16);
- cur=delim+1;
- if ((delim = strchr(cur, ':')) == NULL)
- goto parse_failed;
- *delim=0;
- np->remote_mac[4]=simple_strtol(cur, NULL, 16);
- cur=delim+1;
- np->remote_mac[5]=simple_strtol(cur, NULL, 16);
}
- printk(KERN_INFO "%s: remote ethernet address "
- "%02x:%02x:%02x:%02x:%02x:%02x\n",
- np->name,
- np->remote_mac[0],
- np->remote_mac[1],
- np->remote_mac[2],
- np->remote_mac[3],
- np->remote_mac[4],
- np->remote_mac[5]);
+ netpoll_print_options(np);
return 0;
parse_failed:
- printk(KERN_INFO "%s: couldn't parse config at %s!\n",
- np->name, cur);
+ np_info(np, "couldn't parse config at '%s'!\n", cur);
return -1;
}
+EXPORT_SYMBOL(netpoll_parse_options);
+
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
+{
+ struct netpoll_info *npinfo;
+ const struct net_device_ops *ops;
+ int err;
+
+ np->dev = ndev;
+ strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+ INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
+
+ if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+ !ndev->netdev_ops->ndo_poll_controller) {
+ np_err(np, "%s doesn't support polling, aborting\n",
+ np->dev_name);
+ err = -ENOTSUPP;
+ goto out;
+ }
+
+ if (!ndev->npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sema_init(&npinfo->dev_lock, 1);
+ skb_queue_head_init(&npinfo->txq);
+ INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+ atomic_set(&npinfo->refcnt, 1);
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_setup) {
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
+ if (err)
+ goto free_npinfo;
+ }
+ } else {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ atomic_inc(&npinfo->refcnt);
+ }
+
+ npinfo->netpoll = np;
+
+ /* last thing to do is link it to the net device structure */
+ rcu_assign_pointer(ndev->npinfo, npinfo);
+
+ return 0;
+
+free_npinfo:
+ kfree(npinfo);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
int netpoll_setup(struct netpoll *np)
{
struct net_device *ndev = NULL;
struct in_device *in_dev;
- struct netpoll_info *npinfo;
- unsigned long flags;
+ int err;
- if (np->dev_name)
- ndev = dev_get_by_name(np->dev_name);
+ rtnl_lock();
+ if (np->dev_name) {
+ struct net *net = current->nsproxy->net_ns;
+ ndev = __dev_get_by_name(net, np->dev_name);
+ }
if (!ndev) {
- printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
- np->name, np->dev_name);
- return -1;
+ np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ err = -ENODEV;
+ goto unlock;
}
+ dev_hold(ndev);
- np->dev = ndev;
- if (!ndev->npinfo) {
- npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
- if (!npinfo)
- goto release;
-
- npinfo->rx_flags = 0;
- npinfo->rx_np = NULL;
- spin_lock_init(&npinfo->poll_lock);
- npinfo->poll_owner = -1;
- npinfo->tries = MAX_RETRIES;
- spin_lock_init(&npinfo->rx_lock);
- } else
- npinfo = ndev->npinfo;
-
- if (!ndev->poll_controller) {
- printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
- np->name, np->dev_name);
- goto release;
+ if (netdev_master_upper_dev_get(ndev)) {
+ np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ err = -EBUSY;
+ goto put;
}
if (!netif_running(ndev)) {
unsigned long atmost, atleast;
- printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
- np->name, np->dev_name);
+ np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
- rtnl_shlock();
- if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) {
- printk(KERN_ERR "%s: failed to open %s\n",
- np->name, np->dev_name);
- rtnl_shunlock();
- goto release;
+ err = dev_open(ndev);
+
+ if (err) {
+ np_err(np, "failed to open %s\n", ndev->name);
+ goto put;
}
- rtnl_shunlock();
+ rtnl_unlock();
atleast = jiffies + HZ/10;
- atmost = jiffies + 4*HZ;
+ atmost = jiffies + carrier_timeout * HZ;
while (!netif_carrier_ok(ndev)) {
if (time_after(jiffies, atmost)) {
- printk(KERN_NOTICE
- "%s: timeout waiting for carrier\n",
- np->name);
+ np_notice(np, "timeout waiting for carrier\n");
break;
}
- cond_resched();
+ msleep(1);
}
/* If carrier appears to come up instantly, we don't
@@ -696,96 +704,153 @@ int netpoll_setup(struct netpoll *np)
*/
if (time_before(jiffies, atleast)) {
- printk(KERN_NOTICE "%s: carrier detect appears"
- " untrustworthy, waiting 4 seconds\n",
- np->name);
+ np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
msleep(4000);
}
+ rtnl_lock();
}
- if (is_zero_ether_addr(np->local_mac) && ndev->dev_addr)
- memcpy(np->local_mac, ndev->dev_addr, 6);
+ if (!np->local_ip.ip) {
+ if (!np->ipv6) {
+ in_dev = __in_dev_get_rtnl(ndev);
- if (!np->local_ip) {
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(ndev);
+ if (!in_dev || !in_dev->ifa_list) {
+ np_err(np, "no IP address for %s, aborting\n",
+ np->dev_name);
+ err = -EDESTADDRREQ;
+ goto put;
+ }
- if (!in_dev || !in_dev->ifa_list) {
- rcu_read_unlock();
- printk(KERN_ERR "%s: no IP address for %s, aborting\n",
- np->name, np->dev_name);
- goto release;
+ np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np_info(np, "local IP %pI4\n", &np->local_ip.ip);
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *idev;
+
+ err = -EDESTADDRREQ;
+ idev = __in6_dev_get(ndev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ np->local_ip.in6 = ifp->addr;
+ err = 0;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ if (err) {
+ np_err(np, "no IPv6 address for %s, aborting\n",
+ np->dev_name);
+ goto put;
+ } else
+ np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
+#else
+ np_err(np, "IPv6 is not supported %s, aborting\n",
+ np->dev_name);
+ err = -EINVAL;
+ goto put;
+#endif
}
-
- np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
- rcu_read_unlock();
- printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
- np->name, HIPQUAD(np->local_ip));
- }
-
- if (np->rx_hook) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_flags |= NETPOLL_RX_ENABLED;
- npinfo->rx_np = np;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
}
/* fill up the skb queue */
refill_skbs();
- /* last thing to do is link it to the net device structure */
- ndev->npinfo = npinfo;
-
- /* avoid racing with NAPI reading npinfo */
- synchronize_rcu();
+ err = __netpoll_setup(np, ndev);
+ if (err)
+ goto put;
+ rtnl_unlock();
return 0;
- release:
- if (!ndev->npinfo)
- kfree(npinfo);
- np->dev = NULL;
+put:
dev_put(ndev);
- return -1;
+unlock:
+ rtnl_unlock();
+ return err;
}
+EXPORT_SYMBOL(netpoll_setup);
-void netpoll_cleanup(struct netpoll *np)
+static int __init netpoll_init(void)
+{
+ skb_queue_head_init(&skb_pool);
+ return 0;
+}
+core_initcall(netpoll_init);
+
+static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
+{
+ struct netpoll_info *npinfo =
+ container_of(rcu_head, struct netpoll_info, rcu);
+
+ skb_queue_purge(&npinfo->txq);
+
+ /* we can't call cancel_delayed_work_sync here, as we are in softirq */
+ cancel_delayed_work(&npinfo->tx_work);
+
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ /* now cancel it again */
+ cancel_delayed_work(&npinfo->tx_work);
+ kfree(npinfo);
+}
+
+void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- unsigned long flags;
- if (np->dev) {
- npinfo = np->dev->npinfo;
- if (npinfo && npinfo->rx_np == np) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_np = NULL;
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
- dev_put(np->dev);
- }
+ /* rtnl_dereference would be preferable here but
+ * rcu_cleanup_netpoll path can put us in here safely without
+ * holding the rtnl, so plain rcu_dereference it is
+ */
+ npinfo = rtnl_dereference(np->dev->npinfo);
+ if (!npinfo)
+ return;
- np->dev = NULL;
+ synchronize_srcu(&netpoll_srcu);
+
+ if (atomic_dec_and_test(&npinfo->refcnt)) {
+ const struct net_device_ops *ops;
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
+
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
+ }
}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
-int netpoll_trap(void)
+static void netpoll_async_cleanup(struct work_struct *work)
{
- return atomic_read(&trapped);
+ struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
+
+ rtnl_lock();
+ __netpoll_cleanup(np);
+ rtnl_unlock();
+ kfree(np);
}
-void netpoll_set_trap(int trap)
+void __netpoll_free_async(struct netpoll *np)
{
- if (trap)
- atomic_inc(&trapped);
- else
- atomic_dec(&trapped);
+ schedule_work(&np->cleanup_work);
}
+EXPORT_SYMBOL_GPL(__netpoll_free_async);
-EXPORT_SYMBOL(netpoll_set_trap);
-EXPORT_SYMBOL(netpoll_trap);
-EXPORT_SYMBOL(netpoll_parse_options);
-EXPORT_SYMBOL(netpoll_setup);
+void netpoll_cleanup(struct netpoll *np)
+{
+ rtnl_lock();
+ if (!np->dev)
+ goto out;
+ __netpoll_cleanup(np);
+ dev_put(np->dev);
+ np->dev = NULL;
+out:
+ rtnl_unlock();
+}
EXPORT_SYMBOL(netpoll_cleanup);
-EXPORT_SYMBOL(netpoll_send_udp);
-EXPORT_SYMBOL(netpoll_poll);
-EXPORT_SYMBOL(netpoll_queue);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 00000000000..2f385b9bccc
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,288 @@
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+#include <linux/fdtable.h>
+
+#define PRIOMAP_MIN_SZ 128
+
+/*
+ * Extend @dev->priomap so that it's large enough to accommodate
+ * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
+ * return. Must be called under rtnl lock.
+ */
+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
+{
+ struct netprio_map *old, *new;
+ size_t new_sz, new_len;
+
+ /* is the existing priomap large enough? */
+ old = rtnl_dereference(dev->priomap);
+ if (old && old->priomap_len > target_idx)
+ return 0;
+
+ /*
+ * Determine the new size. Let's keep it power-of-two. We start
+ * from PRIOMAP_MIN_SZ and double it until it's large enough to
+ * accommodate @target_idx.
+ */
+ new_sz = PRIOMAP_MIN_SZ;
+ while (true) {
+ new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
+ sizeof(new->priomap[0]);
+ if (new_len > target_idx)
+ break;
+ new_sz *= 2;
+ /* overflowed? */
+ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
+ return -ENOSPC;
+ }
+
+ /* allocate & copy */
+ new = kzalloc(new_sz, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (old)
+ memcpy(new->priomap, old->priomap,
+ old->priomap_len * sizeof(old->priomap[0]));
+
+ new->priomap_len = new_len;
+
+ /* install the new priomap */
+ rcu_assign_pointer(dev->priomap, new);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/**
+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ *
+ * Should be called under RCU read or rtnl lock.
+ */
+static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
+{
+ struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
+ int id = css->cgroup->id;
+
+ if (map && id < map->priomap_len)
+ return map->priomap[id];
+ return 0;
+}
+
+/**
+ * netprio_set_prio - set netprio on a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ * @prio: prio to set
+ *
+ * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
+ * lock and may fail under memory pressure for non-zero @prio.
+ */
+static int netprio_set_prio(struct cgroup_subsys_state *css,
+ struct net_device *dev, u32 prio)
+{
+ struct netprio_map *map;
+ int id = css->cgroup->id;
+ int ret;
+
+ /* avoid extending priomap for zero writes */
+ map = rtnl_dereference(dev->priomap);
+ if (!prio && (!map || map->priomap_len <= id))
+ return 0;
+
+ ret = extend_netdev_table(dev, id);
+ if (ret)
+ return ret;
+
+ map = rtnl_dereference(dev->priomap);
+ map->priomap[id] = prio;
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kzalloc(sizeof(*css), GFP_KERNEL);
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *parent_css = css->parent;
+ struct net_device *dev;
+ int ret = 0;
+
+ if (!parent_css)
+ return 0;
+
+ rtnl_lock();
+ /*
+ * Inherit prios from the parent. As all prios are set during
+ * onlining, there is no need to clear them on offline.
+ */
+ for_each_netdev(&init_net, dev) {
+ u32 prio = netprio_prio(parent_css, dev);
+
+ ret = netprio_set_prio(css, dev, prio);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+ return ret;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css->cgroup->id;
+}
+
+static int read_priomap(struct seq_file *sf, void *v)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev)
+ seq_printf(sf, "%s %u\n", dev->name,
+ netprio_prio(seq_css(sf), dev));
+ rcu_read_unlock();
+ return 0;
+}
+
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char devname[IFNAMSIZ + 1];
+ struct net_device *dev;
+ u32 prio;
+ int ret;
+
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ return -EINVAL;
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev)
+ return -ENODEV;
+
+ rtnl_lock();
+
+ ret = netprio_set_prio(of_css(of), dev, prio);
+
+ rtnl_unlock();
+ dev_put(dev);
+ return ret ?: nbytes;
+}
+
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+ if (sock)
+ sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+ return 0;
+}
+
+static void net_prio_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *p;
+ void *v = (void *)(unsigned long)css->cgroup->id;
+
+ cgroup_taskset_for_each(p, tset) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_netprio, v);
+ task_unlock(p);
+ }
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ .name = "ifpriomap",
+ .seq_show = read_priomap,
+ .write = write_priomap,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_prio_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = net_prio_attach,
+ .base_cftypes = ss_files,
+};
+
+static int netprio_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netprio_map *old;
+
+ /*
+ * Note this is called with rtnl_lock held so we have update side
+ * protection on our rcu assignments
+ */
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+ register_netdevice_notifier(&netprio_device_notifier);
+ return 0;
+}
+
+subsys_initcall(init_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index da16f8fd149..fc17a9d309a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -6,7 +6,7 @@
*
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
* Ben Greear <greearb@candelatech.com>
- * Jens Låås <jens.laas@data.slu.se>
+ * Jens Låås <jens.laas@data.slu.se>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -15,7 +15,7 @@
*
*
* A tool for loading the network with preconfigurated packets.
- * The tool is implemented as a linux module. Parameters are output
+ * The tool is implemented as a linux module. Parameters are output
* device, delay (to hard_xmit), number of packets, and whether
* to use multiple SKBs or just the same one.
* pktgen uses the installed interface's output routine.
@@ -44,14 +44,14 @@
* * Add IOCTL interface to easily get counters & configuration.
* --Ben Greear <greearb@candelatech.com>
*
- * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
- * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
* as a "fastpath" with a configurable number of clones after alloc's.
- * clone_skb=0 means all packets are allocated this also means ranges time
- * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clone_skb=0 means all packets are allocated this also means ranges time
+ * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
* clones.
*
- * Also moved to /proc/net/pktgen/
+ * Also moved to /proc/net/pktgen/
* --ro
*
* Sept 10: Fixed threading/locking. Lots of bone-headed and more clever
@@ -60,28 +60,28 @@
*
* Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
*
- *
+ *
* 021124 Finished major redesign and rewrite for new functionality.
* See Documentation/networking/pktgen.txt for how to use this.
*
* The new operation:
- * For each CPU one thread/process is created at start. This process checks
- * for running devices in the if_list and sends packets until count is 0 it
- * also the thread checks the thread->control which is used for inter-process
- * communication. controlling process "posts" operations to the threads this
+ * For each CPU one thread/process is created at start. This process checks
+ * for running devices in the if_list and sends packets until count is 0 it
+ * also the thread checks the thread->control which is used for inter-process
+ * communication. controlling process "posts" operations to the threads this
* way. The if_lock should be possible to remove when add/rem_device is merged
* into this too.
*
- * By design there should only be *one* "controlling" process. In practice
- * multiple write accesses gives unpredictable result. Understood by "write"
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple write accesses gives unpredictable result. Understood by "write"
* to /proc gives result code thats should be read be the "writer".
* For practical use this should be no problem.
*
- * Note when adding devices to a specific CPU there good idea to also assign
- * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
+ * Note when adding devices to a specific CPU there good idea to also assign
+ * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
* --ro
*
- * Fix refcount off by one if first packet fails, potential null deref,
+ * Fix refcount off by one if first packet fails, potential null deref,
* memleak 030710- KJP
*
* First "ranges" functionality for ipv6 030726 --ro
@@ -89,30 +89,41 @@
* Included flow support. 030802 ANK.
*
* Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
- *
+ *
* Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
* ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
*
- * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
* <shemminger@osdl.org> 040923
*
- * Randy Dunlap fixed u64 printk compiler waring
+ * Randy Dunlap fixed u64 printk compiler waring
*
* Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
* New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
*
- * Corrections from Nikolai Malykh (nmalykh@bilim.com)
+ * Corrections from Nikolai Malykh (nmalykh@bilim.com)
* Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
*
- * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
+ * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
* 050103
+ *
+ * MPLS support by Steven Whitehouse <steve@chygwyn.com>
+ *
+ * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
+ *
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
+ *
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sys.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -123,8 +134,11 @@
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
+#include <linux/hrtimer.h>
+#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/timer.h>
+#include <linux/list.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -132,6 +146,7 @@
#include <linux/inetdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -140,47 +155,58 @@
#include <linux/seq_file.h>
#include <linux/wait.h>
#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
#include <net/addrconf.h>
+#ifdef CONFIG_XFRM
+#include <net/xfrm.h>
+#endif
+#include <net/netns/generic.h>
#include <asm/byteorder.h>
#include <linux/rcupdate.h>
-#include <asm/bitops.h>
-#include <asm/io.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/uaccess.h>
#include <asm/dma.h>
-#include <asm/uaccess.h>
-#include <asm/div64.h> /* do_div */
-#include <asm/timex.h>
-
-
-#define VERSION "pktgen v2.63: Packet Generator for packet performance testing.\n"
-
-/* #define PG_DEBUG(a) a */
-#define PG_DEBUG(a)
+#include <asm/div64.h> /* do_div */
-/* The buckets are exponential in 'width' */
-#define LAT_BUCKETS_MAX 32
+#define VERSION "2.74"
#define IP_NAME_SZ 32
+#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
+#define MPLS_STACK_BOTTOM htonl(0x00000100)
+
+#define func_enter() pr_debug("entering %s\n", __func__);
/* Device flag bits */
-#define F_IPSRC_RND (1<<0) /* IP-Src Random */
-#define F_IPDST_RND (1<<1) /* IP-Dst Random */
-#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */
-#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */
-#define F_MACSRC_RND (1<<4) /* MAC-Src Random */
-#define F_MACDST_RND (1<<5) /* MAC-Dst Random */
-#define F_TXSIZE_RND (1<<6) /* Transmit size is random */
-#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */
+#define F_IPSRC_RND (1<<0) /* IP-Src Random */
+#define F_IPDST_RND (1<<1) /* IP-Dst Random */
+#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */
+#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */
+#define F_MACSRC_RND (1<<4) /* MAC-Src Random */
+#define F_MACDST_RND (1<<5) /* MAC-Dst Random */
+#define F_TXSIZE_RND (1<<6) /* Transmit size is random */
+#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */
+#define F_MPLS_RND (1<<8) /* Random MPLS labels */
+#define F_VID_RND (1<<9) /* Random VLAN ID */
+#define F_SVID_RND (1<<10) /* Random SVLAN ID */
+#define F_FLOW_SEQ (1<<11) /* Sequential flows */
+#define F_IPSEC_ON (1<<12) /* ipsec on for flows */
+#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */
+#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
+#define F_NODE (1<<15) /* Node memory alloc*/
+#define F_UDPCSUM (1<<16) /* Include UDP checksum */
/* Thread control flag bits */
-#define T_TERMINATE (1<<0)
-#define T_STOP (1<<1) /* Stop run */
-#define T_RUN (1<<2) /* Start run */
-#define T_REMDEV (1<<3) /* Remove all devs */
-
-/* Locks */
-#define thread_lock() down(&pktgen_sem)
-#define thread_unlock() up(&pktgen_sem)
+#define T_STOP (1<<0) /* Stop run */
+#define T_RUN (1<<1) /* Start run */
+#define T_REMDEVALL (1<<2) /* Remove all devs */
+#define T_REMDEV (1<<3) /* Remove one dev */
/* If lock -- can be removed after some work */
#define if_lock(t) spin_lock(&(t->if_lock));
@@ -190,507 +216,543 @@
#define PKTGEN_MAGIC 0xbe9be955
#define PG_PROC_DIR "pktgen"
#define PGCTRL "pgctrl"
-static struct proc_dir_entry *pg_proc_dir = NULL;
#define MAX_CFLOWS 65536
-struct flow_state
-{
- __u32 cur_daddr;
- int count;
+#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
+#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+
+struct flow_state {
+ __be32 cur_daddr;
+ int count;
+#ifdef CONFIG_XFRM
+ struct xfrm_state *x;
+#endif
+ __u32 flags;
};
-struct pktgen_dev {
+/* flow flag bits */
+#define F_INIT (1<<0) /* flow has been initialized */
+struct pktgen_dev {
/*
* Try to keep frequent/infrequent used vars. separated.
*/
+ struct proc_dir_entry *entry; /* proc file */
+ struct pktgen_thread *pg_thread;/* the owner */
+ struct list_head list; /* chaining in the thread's run-queue */
+
+ int running; /* if false, the test will stop */
- char ifname[IFNAMSIZ];
- char result[512];
-
- struct pktgen_thread* pg_thread; /* the owner */
- struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */
-
- int running; /* if this changes to false, the test will stop */
-
- /* If min != max, then we will either do a linear iteration, or
- * we will do a random selection from within the range.
- */
- __u32 flags;
-
- int min_pkt_size; /* = ETH_ZLEN; */
- int max_pkt_size; /* = ETH_ZLEN; */
- int nfrags;
- __u32 delay_us; /* Default delay */
- __u32 delay_ns;
- __u64 count; /* Default No packets to send */
- __u64 sofar; /* How many pkts we've sent so far */
- __u64 tx_bytes; /* How many bytes we've transmitted */
- __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */
-
- /* runtime counters relating to clone_skb */
- __u64 next_tx_us; /* timestamp of when to tx next */
- __u32 next_tx_ns;
-
- __u64 allocated_skbs;
- __u32 clone_count;
- int last_ok; /* Was last skb sent?
- * Or a failed transmit of some sort? This will keep
- * sequence numbers in order, for example.
- */
- __u64 started_at; /* micro-seconds */
- __u64 stopped_at; /* micro-seconds */
- __u64 idle_acc; /* micro-seconds */
- __u32 seq_num;
-
- int clone_skb; /* Use multiple SKBs during packet gen. If this number
- * is greater than 1, then that many copies of the same
- * packet will be sent before a new packet is allocated.
- * For instance, if you want to send 1024 identical packets
- * before creating a new packet, set clone_skb to 1024.
- */
-
- char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
- char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
- char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
- char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
-
- struct in6_addr in6_saddr;
- struct in6_addr in6_daddr;
- struct in6_addr cur_in6_daddr;
- struct in6_addr cur_in6_saddr;
+ /* If min != max, then we will either do a linear iteration, or
+ * we will do a random selection from within the range.
+ */
+ __u32 flags;
+ int removal_mark; /* non-zero => the device is marked for
+ * removal by worker thread */
+
+ int min_pkt_size;
+ int max_pkt_size;
+ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
+ int nfrags;
+ struct page *page;
+ u64 delay; /* nano-seconds */
+
+ __u64 count; /* Default No packets to send */
+ __u64 sofar; /* How many pkts we've sent so far */
+ __u64 tx_bytes; /* How many bytes we've transmitted */
+ __u64 errors; /* Errors when trying to transmit, */
+
+ /* runtime counters relating to clone_skb */
+
+ __u64 allocated_skbs;
+ __u32 clone_count;
+ int last_ok; /* Was last skb sent?
+ * Or a failed transmit of some sort?
+ * This will keep sequence numbers in order
+ */
+ ktime_t next_tx;
+ ktime_t started_at;
+ ktime_t stopped_at;
+ u64 idle_acc; /* nano-seconds */
+
+ __u32 seq_num;
+
+ int clone_skb; /*
+ * Use multiple SKBs during packet gen.
+ * If this number is greater than 1, then
+ * that many copies of the same packet will be
+ * sent before a new packet is allocated.
+ * If you want to send 1024 identical packets
+ * before creating a new packet,
+ * set clone_skb to 1024.
+ */
+
+ char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+
+ struct in6_addr in6_saddr;
+ struct in6_addr in6_daddr;
+ struct in6_addr cur_in6_daddr;
+ struct in6_addr cur_in6_saddr;
/* For ranges */
- struct in6_addr min_in6_daddr;
- struct in6_addr max_in6_daddr;
- struct in6_addr min_in6_saddr;
- struct in6_addr max_in6_saddr;
-
- /* If we're doing ranges, random or incremental, then this
- * defines the min/max for those ranges.
- */
- __u32 saddr_min; /* inclusive, source IP address */
- __u32 saddr_max; /* exclusive, source IP address */
- __u32 daddr_min; /* inclusive, dest IP address */
- __u32 daddr_max; /* exclusive, dest IP address */
-
- __u16 udp_src_min; /* inclusive, source UDP port */
- __u16 udp_src_max; /* exclusive, source UDP port */
- __u16 udp_dst_min; /* inclusive, dest UDP port */
- __u16 udp_dst_max; /* exclusive, dest UDP port */
-
- __u32 src_mac_count; /* How many MACs to iterate through */
- __u32 dst_mac_count; /* How many MACs to iterate through */
-
- unsigned char dst_mac[ETH_ALEN];
- unsigned char src_mac[ETH_ALEN];
-
- __u32 cur_dst_mac_offset;
- __u32 cur_src_mac_offset;
- __u32 cur_saddr;
- __u32 cur_daddr;
- __u16 cur_udp_dst;
- __u16 cur_udp_src;
- __u32 cur_pkt_size;
-
- __u8 hh[14];
- /* = {
- 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
-
- We fill in SRC address later
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x08, 0x00
- };
- */
- __u16 pad; /* pad out the hh struct to an even 16 bytes */
-
- struct sk_buff* skb; /* skb we are to transmit next, mainly used for when we
- * are transmitting the same one multiple times
- */
- struct net_device* odev; /* The out-going device. Note that the device should
- * have it's pg_info pointer pointing back to this
- * device. This will be set when the user specifies
- * the out-going device name (not when the inject is
- * started as it used to do.)
- */
+ struct in6_addr min_in6_daddr;
+ struct in6_addr max_in6_daddr;
+ struct in6_addr min_in6_saddr;
+ struct in6_addr max_in6_saddr;
+
+ /* If we're doing ranges, random or incremental, then this
+ * defines the min/max for those ranges.
+ */
+ __be32 saddr_min; /* inclusive, source IP address */
+ __be32 saddr_max; /* exclusive, source IP address */
+ __be32 daddr_min; /* inclusive, dest IP address */
+ __be32 daddr_max; /* exclusive, dest IP address */
+
+ __u16 udp_src_min; /* inclusive, source UDP port */
+ __u16 udp_src_max; /* exclusive, source UDP port */
+ __u16 udp_dst_min; /* inclusive, dest UDP port */
+ __u16 udp_dst_max; /* exclusive, dest UDP port */
+
+ /* DSCP + ECN */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ are for dscp codepoint */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ (see RFC 3260, sec. 4) */
+
+ /* MPLS */
+ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
+ __be32 labels[MAX_MPLS_LABELS];
+
+ /* VLAN/SVLAN (802.1Q/Q-in-Q) */
+ __u8 vlan_p;
+ __u8 vlan_cfi;
+ __u16 vlan_id; /* 0xffff means no vlan tag */
+
+ __u8 svlan_p;
+ __u8 svlan_cfi;
+ __u16 svlan_id; /* 0xffff means no svlan tag */
+
+ __u32 src_mac_count; /* How many MACs to iterate through */
+ __u32 dst_mac_count; /* How many MACs to iterate through */
+
+ unsigned char dst_mac[ETH_ALEN];
+ unsigned char src_mac[ETH_ALEN];
+
+ __u32 cur_dst_mac_offset;
+ __u32 cur_src_mac_offset;
+ __be32 cur_saddr;
+ __be32 cur_daddr;
+ __u16 ip_id;
+ __u16 cur_udp_dst;
+ __u16 cur_udp_src;
+ __u16 cur_queue_map;
+ __u32 cur_pkt_size;
+ __u32 last_pkt_size;
+
+ __u8 hh[14];
+ /* = {
+ 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+
+ We fill in SRC address later
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x08, 0x00
+ };
+ */
+ __u16 pad; /* pad out the hh struct to an even 16 bytes */
+
+ struct sk_buff *skb; /* skb we are to transmit next, used for when we
+ * are transmitting the same one multiple times
+ */
+ struct net_device *odev; /* The out-going device.
+ * Note that the device should have it's
+ * pg_info pointer pointing back to this
+ * device.
+ * Set when the user specifies the out-going
+ * device name (not when the inject is
+ * started as it used to do.)
+ */
+ char odevname[32];
struct flow_state *flows;
- unsigned cflows; /* Concurrent flows (config) */
- unsigned lflow; /* Flow length (config) */
- unsigned nflows; /* accumulated flows (stats) */
+ unsigned int cflows; /* Concurrent flows (config) */
+ unsigned int lflow; /* Flow length (config) */
+ unsigned int nflows; /* accumulated flows (stats) */
+ unsigned int curfl; /* current sequenced flow (state)*/
+
+ u16 queue_map_min;
+ u16 queue_map_max;
+ __u32 skb_priority; /* skb priority field */
+ int node; /* Memory node */
+
+#ifdef CONFIG_XFRM
+ __u8 ipsmode; /* IPSEC mode (config) */
+ __u8 ipsproto; /* IPSEC type (config) */
+ __u32 spi;
+ struct dst_entry dst;
+ struct dst_ops dstops;
+#endif
+ char result[512];
};
struct pktgen_hdr {
- __u32 pgh_magic;
- __u32 seq_num;
- __u32 tv_sec;
- __u32 tv_usec;
+ __be32 pgh_magic;
+ __be32 seq_num;
+ __be32 tv_sec;
+ __be32 tv_usec;
};
-struct pktgen_thread {
- spinlock_t if_lock;
- struct pktgen_dev *if_list; /* All device here */
- struct pktgen_thread* next;
- char name[32];
- char result[512];
- u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */
-
- /* Field for thread to receive "posted" events terminate, stop ifs etc.*/
-
- u32 control;
- int pid;
- int cpu;
-
- wait_queue_head_t queue;
-};
-#define REMOVE 1
-#define FIND 0
+static int pg_net_id __read_mostly;
-/* This code works around the fact that do_div cannot handle two 64-bit
- numbers, and regular 64-bit division doesn't work on x86 kernels.
- --Ben
-*/
-
-#define PG_DIV 0
-
-/* This was emailed to LMKL by: Chris Caputo <ccaputo@alt.net>
- * Function copied/adapted/optimized from:
- *
- * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html
- *
- * Copyright 1994, University of Cambridge Computer Laboratory
- * All Rights Reserved.
- *
- */
-static inline s64 divremdi3(s64 x, s64 y, int type)
-{
- u64 a = (x < 0) ? -x : x;
- u64 b = (y < 0) ? -y : y;
- u64 res = 0, d = 1;
-
- if (b > 0) {
- while (b < a) {
- b <<= 1;
- d <<= 1;
- }
- }
-
- do {
- if ( a >= b ) {
- a -= b;
- res += d;
- }
- b >>= 1;
- d >>= 1;
- }
- while (d);
-
- if (PG_DIV == type) {
- return (((x ^ y) & (1ll<<63)) == 0) ? res : -(s64)res;
- }
- else {
- return ((x & (1ll<<63)) == 0) ? a : -(s64)a;
- }
-}
-
-/* End of hacks to deal with 64-bit math on x86 */
-
-/** Convert to milliseconds */
-static inline __u64 tv_to_ms(const struct timeval* tv)
-{
- __u64 ms = tv->tv_usec / 1000;
- ms += (__u64)tv->tv_sec * (__u64)1000;
- return ms;
-}
-
-
-/** Convert to micro-seconds */
-static inline __u64 tv_to_us(const struct timeval* tv)
-{
- __u64 us = tv->tv_usec;
- us += (__u64)tv->tv_sec * (__u64)1000000;
- return us;
-}
-
-static inline __u64 pg_div(__u64 n, __u32 base) {
- __u64 tmp = n;
- do_div(tmp, base);
- /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n",
- n, base, tmp); */
- return tmp;
-}
-
-static inline __u64 pg_div64(__u64 n, __u64 base)
-{
- __u64 tmp = n;
-/*
- * How do we know if the architecture we are running on
- * supports division with 64 bit base?
- *
- */
-#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__)
-
- do_div(tmp, base);
-#else
- tmp = divremdi3(n, base, PG_DIV);
-#endif
- return tmp;
-}
-
-static inline u32 pktgen_random(void)
-{
-#if 0
- __u32 n;
- get_random_bytes(&n, 4);
- return n;
-#else
- return net_random();
-#endif
-}
+struct pktgen_net {
+ struct net *net;
+ struct proc_dir_entry *proc_dir;
+ struct list_head pktgen_threads;
+ bool pktgen_exiting;
+};
-static inline __u64 getCurMs(void)
-{
- struct timeval tv;
- do_gettimeofday(&tv);
- return tv_to_ms(&tv);
-}
+struct pktgen_thread {
+ spinlock_t if_lock; /* for list of devices */
+ struct list_head if_list; /* All device here */
+ struct list_head th_list;
+ struct task_struct *tsk;
+ char result[512];
-static inline __u64 getCurUs(void)
-{
- struct timeval tv;
- do_gettimeofday(&tv);
- return tv_to_us(&tv);
-}
+ /* Field for thread to receive "posted" events terminate,
+ stop ifs etc. */
-static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b)
-{
- return tv_to_us(a) - tv_to_us(b);
-}
+ u32 control;
+ int cpu;
+ wait_queue_head_t queue;
+ struct completion start_done;
+ struct pktgen_net *net;
+};
-/* old include end */
+#define REMOVE 1
+#define FIND 0
-static char version[] __initdata = VERSION;
+static const char version[] =
+ "Packet Generator for packet performance testing. "
+ "Version: " VERSION "\n";
-static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i);
-static int pktgen_add_device(struct pktgen_thread* t, const char* ifname);
-static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread* t, const char* ifname);
+static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname, bool exact);
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
-static void pktgen_run_all_threads(void);
-static void pktgen_stop_all_threads_ifs(void);
-static int pktgen_stop_device(struct pktgen_dev *pkt_dev);
-static void pktgen_stop(struct pktgen_thread* t);
+static void pktgen_run_all_threads(struct pktgen_net *pn);
+static void pktgen_reset_all_threads(struct pktgen_net *pn);
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
+
+static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
-static struct pktgen_dev *pktgen_NN_threads(const char* dev_name, int remove);
-static unsigned int scan_ip6(const char *s,char ip[16]);
-static unsigned int fmt_ip6(char *s,const char ip[16]);
/* Module parameters, defaults. */
-static int pg_count_d = 1000; /* 1000 pkts by default */
-static int pg_delay_d;
-static int pg_clone_skb_d;
-static int debug;
+static int pg_count_d __read_mostly = 1000;
+static int pg_delay_d __read_mostly;
+static int pg_clone_skb_d __read_mostly;
+static int debug __read_mostly;
-static DECLARE_MUTEX(pktgen_sem);
-static struct pktgen_thread *pktgen_threads = NULL;
+static DEFINE_MUTEX(pktgen_thread_lock);
static struct notifier_block pktgen_notifier_block = {
.notifier_call = pktgen_device_event,
};
/*
- * /proc handling functions
+ * /proc handling functions
*
*/
static int pgctrl_show(struct seq_file *seq, void *v)
-{
- seq_puts(seq, VERSION);
+{
+ seq_puts(seq, version);
return 0;
}
-static ssize_t pgctrl_write(struct file* file,const char __user * buf,
+static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- int err = 0;
char data[128];
+ struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
- if (!capable(CAP_NET_ADMIN)){
- err = -EPERM;
- goto out;
- }
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (count == 0)
+ return -EINVAL;
if (count > sizeof(data))
count = sizeof(data);
- if (copy_from_user(data, buf, count)) {
- err = -EFAULT;
- goto out;
- }
- data[count-1] = 0; /* Make string */
+ if (copy_from_user(data, buf, count))
+ return -EFAULT;
+
+ data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
- if (!strcmp(data, "stop"))
- pktgen_stop_all_threads_ifs();
+ if (!strcmp(data, "stop"))
+ pktgen_stop_all_threads_ifs(pn);
- else if (!strcmp(data, "start"))
- pktgen_run_all_threads();
+ else if (!strcmp(data, "start"))
+ pktgen_run_all_threads(pn);
- else
- printk("pktgen: Unknown command: %s\n", data);
+ else if (!strcmp(data, "reset"))
+ pktgen_reset_all_threads(pn);
- err = count;
+ else
+ pr_warning("Unknown command: %s\n", data);
- out:
- return err;
+ return count;
}
static int pgctrl_open(struct inode *inode, struct file *file)
{
- return single_open(file, pgctrl_show, PDE(inode)->data);
+ return single_open(file, pgctrl_show, PDE_DATA(inode));
}
-static struct file_operations pktgen_fops = {
- .owner = THIS_MODULE,
- .open = pgctrl_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pgctrl_write,
- .release = single_release,
+static const struct file_operations pktgen_fops = {
+ .owner = THIS_MODULE,
+ .open = pgctrl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pgctrl_write,
+ .release = single_release,
};
static int pktgen_if_show(struct seq_file *seq, void *v)
{
- int i;
- struct pktgen_dev *pkt_dev = seq->private;
- __u64 sa;
- __u64 stopped;
- __u64 now = getCurUs();
-
- seq_printf(seq, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
- (unsigned long long) pkt_dev->count,
- pkt_dev->min_pkt_size, pkt_dev->max_pkt_size);
-
- seq_printf(seq, " frags: %d delay: %u clone_skb: %d ifname: %s\n",
- pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname);
-
- seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow);
-
-
- if(pkt_dev->flags & F_IPV6) {
- char b1[128], b2[128], b3[128];
- fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr);
- fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
- fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
- seq_printf(seq, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3);
-
- fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr);
- fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
- fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
- seq_printf(seq, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3);
-
- }
- else
- seq_printf(seq," dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n",
- pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max);
+ const struct pktgen_dev *pkt_dev = seq->private;
+ ktime_t stopped;
+ u64 idle;
+
+ seq_printf(seq,
+ "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
+ (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
+ pkt_dev->max_pkt_size);
+
+ seq_printf(seq,
+ " frags: %d delay: %llu clone_skb: %d ifname: %s\n",
+ pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
+ pkt_dev->clone_skb, pkt_dev->odevname);
+
+ seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows,
+ pkt_dev->lflow);
+
+ seq_printf(seq,
+ " queue_map_min: %u queue_map_max: %u\n",
+ pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
+
+ if (pkt_dev->skb_priority)
+ seq_printf(seq, " skb_priority: %u\n",
+ pkt_dev->skb_priority);
+
+ if (pkt_dev->flags & F_IPV6) {
+ seq_printf(seq,
+ " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n"
+ " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n",
+ &pkt_dev->in6_saddr,
+ &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
+ &pkt_dev->in6_daddr,
+ &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
+ } else {
+ seq_printf(seq,
+ " dst_min: %s dst_max: %s\n",
+ pkt_dev->dst_min, pkt_dev->dst_max);
+ seq_printf(seq,
+ " src_min: %s src_max: %s\n",
+ pkt_dev->src_min, pkt_dev->src_max);
+ }
seq_puts(seq, " src_mac: ");
- if (is_zero_ether_addr(pkt_dev->src_mac))
- for (i = 0; i < 6; i++)
- seq_printf(seq, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":");
- else
- for (i = 0; i < 6; i++)
- seq_printf(seq, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":");
+ seq_printf(seq, "%pM ",
+ is_zero_ether_addr(pkt_dev->src_mac) ?
+ pkt_dev->odev->dev_addr : pkt_dev->src_mac);
- seq_printf(seq, "dst_mac: ");
- for (i = 0; i < 6; i++)
- seq_printf(seq, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":");
+ seq_puts(seq, "dst_mac: ");
+ seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
- seq_printf(seq, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
- pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min,
- pkt_dev->udp_dst_max);
+ seq_printf(seq,
+ " udp_src_min: %d udp_src_max: %d"
+ " udp_dst_min: %d udp_dst_max: %d\n",
+ pkt_dev->udp_src_min, pkt_dev->udp_src_max,
+ pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
- seq_printf(seq, " src_mac_count: %d dst_mac_count: %d \n Flags: ",
+ seq_printf(seq,
+ " src_mac_count: %d dst_mac_count: %d\n",
pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
+ if (pkt_dev->nr_labels) {
+ unsigned int i;
+ seq_puts(seq, " mpls: ");
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
+ i == pkt_dev->nr_labels-1 ? "\n" : ", ");
+ }
+
+ if (pkt_dev->vlan_id != 0xffff)
+ seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->vlan_id, pkt_dev->vlan_p,
+ pkt_dev->vlan_cfi);
+
+ if (pkt_dev->svlan_id != 0xffff)
+ seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->svlan_id, pkt_dev->svlan_p,
+ pkt_dev->svlan_cfi);
+
+ if (pkt_dev->tos)
+ seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos);
+
+ if (pkt_dev->traffic_class)
+ seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
- if (pkt_dev->flags & F_IPV6)
- seq_printf(seq, "IPV6 ");
-
- if (pkt_dev->flags & F_IPSRC_RND)
- seq_printf(seq, "IPSRC_RND ");
-
- if (pkt_dev->flags & F_IPDST_RND)
- seq_printf(seq, "IPDST_RND ");
-
- if (pkt_dev->flags & F_TXSIZE_RND)
- seq_printf(seq, "TXSIZE_RND ");
-
- if (pkt_dev->flags & F_UDPSRC_RND)
- seq_printf(seq, "UDPSRC_RND ");
-
- if (pkt_dev->flags & F_UDPDST_RND)
- seq_printf(seq, "UDPDST_RND ");
-
- if (pkt_dev->flags & F_MACSRC_RND)
- seq_printf(seq, "MACSRC_RND ");
-
- if (pkt_dev->flags & F_MACDST_RND)
- seq_printf(seq, "MACDST_RND ");
-
-
- seq_puts(seq, "\n");
-
- sa = pkt_dev->started_at;
- stopped = pkt_dev->stopped_at;
- if (pkt_dev->running)
- stopped = now; /* not really stopped, more like last-running-at */
-
- seq_printf(seq, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n",
- (unsigned long long) pkt_dev->sofar,
- (unsigned long long) pkt_dev->errors,
- (unsigned long long) sa,
- (unsigned long long) stopped,
- (unsigned long long) pkt_dev->idle_acc);
-
- seq_printf(seq, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
+ if (pkt_dev->node >= 0)
+ seq_printf(seq, " node: %d\n", pkt_dev->node);
+
+ seq_puts(seq, " Flags: ");
+
+ if (pkt_dev->flags & F_IPV6)
+ seq_puts(seq, "IPV6 ");
+
+ if (pkt_dev->flags & F_IPSRC_RND)
+ seq_puts(seq, "IPSRC_RND ");
+
+ if (pkt_dev->flags & F_IPDST_RND)
+ seq_puts(seq, "IPDST_RND ");
+
+ if (pkt_dev->flags & F_TXSIZE_RND)
+ seq_puts(seq, "TXSIZE_RND ");
+
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ seq_puts(seq, "UDPSRC_RND ");
+
+ if (pkt_dev->flags & F_UDPDST_RND)
+ seq_puts(seq, "UDPDST_RND ");
+
+ if (pkt_dev->flags & F_UDPCSUM)
+ seq_puts(seq, "UDPCSUM ");
+
+ if (pkt_dev->flags & F_MPLS_RND)
+ seq_puts(seq, "MPLS_RND ");
+
+ if (pkt_dev->flags & F_QUEUE_MAP_RND)
+ seq_puts(seq, "QUEUE_MAP_RND ");
+
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ seq_puts(seq, "QUEUE_MAP_CPU ");
+
+ if (pkt_dev->cflows) {
+ if (pkt_dev->flags & F_FLOW_SEQ)
+ seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/
+ else
+ seq_puts(seq, "FLOW_RND ");
+ }
+
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC_ON) {
+ seq_puts(seq, "IPSEC ");
+ if (pkt_dev->spi)
+ seq_printf(seq, "spi:%u", pkt_dev->spi);
+ }
+#endif
+
+ if (pkt_dev->flags & F_MACSRC_RND)
+ seq_puts(seq, "MACSRC_RND ");
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ seq_puts(seq, "MACDST_RND ");
+
+ if (pkt_dev->flags & F_VID_RND)
+ seq_puts(seq, "VID_RND ");
+
+ if (pkt_dev->flags & F_SVID_RND)
+ seq_puts(seq, "SVID_RND ");
+
+ if (pkt_dev->flags & F_NODE)
+ seq_puts(seq, "NODE_ALLOC ");
+
+ seq_puts(seq, "\n");
+
+ /* not really stopped, more like last-running-at */
+ stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;
+ idle = pkt_dev->idle_acc;
+ do_div(idle, NSEC_PER_USEC);
+
+ seq_printf(seq,
+ "Current:\n pkts-sofar: %llu errors: %llu\n",
+ (unsigned long long)pkt_dev->sofar,
+ (unsigned long long)pkt_dev->errors);
+
+ seq_printf(seq,
+ " started: %lluus stopped: %lluus idle: %lluus\n",
+ (unsigned long long) ktime_to_us(pkt_dev->started_at),
+ (unsigned long long) ktime_to_us(stopped),
+ (unsigned long long) idle);
+
+ seq_printf(seq,
+ " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset,
pkt_dev->cur_src_mac_offset);
- if(pkt_dev->flags & F_IPV6) {
- char b1[128], b2[128];
- fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr);
- fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr);
- seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
- }
- else
- seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
- pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+ if (pkt_dev->flags & F_IPV6) {
+ seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n",
+ &pkt_dev->cur_in6_saddr,
+ &pkt_dev->cur_in6_daddr);
+ } else
+ seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n",
+ &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);
-
- seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
+ seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
- seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
+ seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map);
+
+ seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
if (pkt_dev->result[0])
- seq_printf(seq, "Result: %s\n", pkt_dev->result);
+ seq_printf(seq, "Result: %s\n", pkt_dev->result);
else
- seq_printf(seq, "Result: Idle\n");
+ seq_puts(seq, "Result: Idle\n");
return 0;
}
-static int count_trail_chars(const char __user *user_buffer, unsigned int maxlen)
+static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
+ __u32 *num)
+{
+ int i = 0;
+ *num = 0;
+
+ for (; i < maxlen; i++) {
+ int value;
+ char c;
+ *num <<= 4;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ value = hex_to_bin(c);
+ if (value >= 0)
+ *num |= value;
+ else
+ break;
+ }
+ return i;
+}
+
+static int count_trail_chars(const char __user * user_buffer,
+ unsigned int maxlen)
{
int i;
for (i = 0; i < maxlen; i++) {
- char c;
- if (get_user(c, &user_buffer[i]))
- return -EFAULT;
- switch (c) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ switch (c) {
case '\"':
case '\n':
case '\r':
@@ -700,40 +762,40 @@ static int count_trail_chars(const char __user *user_buffer, unsigned int maxlen
break;
default:
goto done;
- };
+ }
}
done:
return i;
}
-static unsigned long num_arg(const char __user *user_buffer, unsigned long maxlen,
- unsigned long *num)
+static long num_arg(const char __user *user_buffer, unsigned long maxlen,
+ unsigned long *num)
{
- int i = 0;
+ int i;
*num = 0;
-
- for(; i < maxlen; i++) {
- char c;
- if (get_user(c, &user_buffer[i]))
- return -EFAULT;
- if ((c >= '0') && (c <= '9')) {
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ if ((c >= '0') && (c <= '9')) {
*num *= 10;
- *num += c -'0';
+ *num += c - '0';
} else
break;
}
return i;
}
-static int strn_len(const char __user *user_buffer, unsigned int maxlen)
+static int strn_len(const char __user * user_buffer, unsigned int maxlen)
{
- int i = 0;
+ int i;
- for(; i < maxlen; i++) {
- char c;
- if (get_user(c, &user_buffer[i]))
- return -EFAULT;
- switch (c) {
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ switch (c) {
case '\"':
case '\n':
case '\r':
@@ -743,122 +805,166 @@ static int strn_len(const char __user *user_buffer, unsigned int maxlen)
break;
default:
break;
- };
+ }
}
done_str:
+ return i;
+}
+
+static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+{
+ unsigned int n = 0;
+ char c;
+ ssize_t i = 0;
+ int len;
+
+ pkt_dev->nr_labels = 0;
+ do {
+ __u32 tmp;
+ len = hex32_arg(&buffer[i], 8, &tmp);
+ if (len <= 0)
+ return len;
+ pkt_dev->labels[n] = htonl(tmp);
+ if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
+ pkt_dev->flags |= F_MPLS_RND;
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ i++;
+ n++;
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+ } while (c == ',');
+ pkt_dev->nr_labels = n;
return i;
}
-static ssize_t pktgen_if_write(struct file *file, const char __user *user_buffer,
- size_t count, loff_t *offset)
+static ssize_t pktgen_if_write(struct file *file,
+ const char __user * user_buffer, size_t count,
+ loff_t * offset)
{
- struct seq_file *seq = (struct seq_file *) file->private_data;
- struct pktgen_dev *pkt_dev = seq->private;
- int i = 0, max, len;
+ struct seq_file *seq = file->private_data;
+ struct pktgen_dev *pkt_dev = seq->private;
+ int i, max, len;
char name[16], valstr[32];
unsigned long value = 0;
- char* pg_result = NULL;
- int tmp = 0;
+ char *pg_result = NULL;
+ int tmp = 0;
char buf[128];
-
- pg_result = &(pkt_dev->result[0]);
-
+
+ pg_result = &(pkt_dev->result[0]);
+
if (count < 1) {
- printk("pktgen: wrong command format\n");
+ pr_warning("wrong command format\n");
return -EINVAL;
}
-
- max = count - i;
- tmp = count_trail_chars(&user_buffer[i], max);
- if (tmp < 0) {
- printk("pktgen: illegal format\n");
- return tmp;
+
+ max = count;
+ tmp = count_trail_chars(user_buffer, max);
+ if (tmp < 0) {
+ pr_warning("illegal format\n");
+ return tmp;
}
- i += tmp;
-
+ i = tmp;
+
/* Read variable name */
len = strn_len(&user_buffer[i], sizeof(name) - 1);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
memset(name, 0, sizeof(name));
- if (copy_from_user(name, &user_buffer[i], len) )
+ if (copy_from_user(name, &user_buffer[i], len))
return -EFAULT;
i += len;
-
- max = count -i;
+
+ max = count - i;
len = count_trail_chars(&user_buffer[i], max);
- if (len < 0)
- return len;
-
+ if (len < 0)
+ return len;
+
i += len;
if (debug) {
- char tb[count + 1];
- if (copy_from_user(tb, user_buffer, count))
+ size_t copy = min_t(size_t, count, 1023);
+ char tb[copy + 1];
+ if (copy_from_user(tb, user_buffer, copy))
return -EFAULT;
- tb[count] = 0;
- printk("pktgen: %s,%lu buffer -:%s:-\n", name,
- (unsigned long) count, tb);
- }
+ tb[copy] = 0;
+ pr_debug("%s,%lu buffer -:%s:-\n",
+ name, (unsigned long)count, tb);
+ }
if (!strcmp(name, "min_pkt_size")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value < 14+20+8)
- value = 14+20+8;
- if (value != pkt_dev->min_pkt_size) {
- pkt_dev->min_pkt_size = value;
- pkt_dev->cur_pkt_size = value;
- }
- sprintf(pg_result, "OK: min_pkt_size=%u", pkt_dev->min_pkt_size);
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: min_pkt_size=%u",
+ pkt_dev->min_pkt_size);
return count;
}
- if (!strcmp(name, "max_pkt_size")) {
+ if (!strcmp(name, "max_pkt_size")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value < 14+20+8)
- value = 14+20+8;
- if (value != pkt_dev->max_pkt_size) {
- pkt_dev->max_pkt_size = value;
- pkt_dev->cur_pkt_size = value;
- }
- sprintf(pg_result, "OK: max_pkt_size=%u", pkt_dev->max_pkt_size);
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->max_pkt_size) {
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: max_pkt_size=%u",
+ pkt_dev->max_pkt_size);
return count;
}
- /* Shortcut for min = max */
+ /* Shortcut for min = max */
if (!strcmp(name, "pkt_size")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value < 14+20+8)
- value = 14+20+8;
- if (value != pkt_dev->min_pkt_size) {
- pkt_dev->min_pkt_size = value;
- pkt_dev->max_pkt_size = value;
- pkt_dev->cur_pkt_size = value;
- }
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
return count;
}
- if (!strcmp(name, "debug")) {
+ if (!strcmp(name, "debug")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- debug = value;
+ debug = value;
sprintf(pg_result, "OK: debug=%u", debug);
return count;
}
- if (!strcmp(name, "frags")) {
+ if (!strcmp(name, "frags")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
pkt_dev->nfrags = value;
sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
@@ -866,403 +972,510 @@ static ssize_t pktgen_if_write(struct file *file, const char __user *user_buffer
}
if (!strcmp(name, "delay")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value == 0x7FFFFFFF) {
- pkt_dev->delay_us = 0x7FFFFFFF;
- pkt_dev->delay_ns = 0;
- } else {
- pkt_dev->delay_us = value / 1000;
- pkt_dev->delay_ns = value % 1000;
- }
- sprintf(pg_result, "OK: delay=%u", 1000*pkt_dev->delay_us+pkt_dev->delay_ns);
+ if (value == 0x7FFFFFFF)
+ pkt_dev->delay = ULLONG_MAX;
+ else
+ pkt_dev->delay = (u64)value;
+
+ sprintf(pg_result, "OK: delay=%llu",
+ (unsigned long long) pkt_dev->delay);
return count;
}
- if (!strcmp(name, "udp_src_min")) {
+ if (!strcmp(name, "rate")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value != pkt_dev->udp_src_min) {
- pkt_dev->udp_src_min = value;
- pkt_dev->cur_udp_src = value;
- }
+ if (!value)
+ return len;
+ pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
+ if (!strcmp(name, "ratep")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (!value)
+ return len;
+ pkt_dev->delay = NSEC_PER_SEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
+ if (!strcmp(name, "udp_src_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_src_min) {
+ pkt_dev->udp_src_min = value;
+ pkt_dev->cur_udp_src = value;
+ }
sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min);
return count;
}
- if (!strcmp(name, "udp_dst_min")) {
+ if (!strcmp(name, "udp_dst_min")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value != pkt_dev->udp_dst_min) {
- pkt_dev->udp_dst_min = value;
- pkt_dev->cur_udp_dst = value;
- }
+ if (value != pkt_dev->udp_dst_min) {
+ pkt_dev->udp_dst_min = value;
+ pkt_dev->cur_udp_dst = value;
+ }
sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min);
return count;
}
- if (!strcmp(name, "udp_src_max")) {
+ if (!strcmp(name, "udp_src_max")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value != pkt_dev->udp_src_max) {
- pkt_dev->udp_src_max = value;
- pkt_dev->cur_udp_src = value;
- }
+ if (value != pkt_dev->udp_src_max) {
+ pkt_dev->udp_src_max = value;
+ pkt_dev->cur_udp_src = value;
+ }
sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max);
return count;
}
- if (!strcmp(name, "udp_dst_max")) {
+ if (!strcmp(name, "udp_dst_max")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
- if (value != pkt_dev->udp_dst_max) {
- pkt_dev->udp_dst_max = value;
- pkt_dev->cur_udp_dst = value;
- }
+ if (value != pkt_dev->udp_dst_max) {
+ pkt_dev->udp_dst_max = value;
+ pkt_dev->cur_udp_dst = value;
+ }
sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max);
return count;
}
if (!strcmp(name, "clone_skb")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+ if ((value > 0) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ return -ENOTSUPP;
i += len;
- pkt_dev->clone_skb = value;
-
+ pkt_dev->clone_skb = value;
+
sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
return count;
}
if (!strcmp(name, "count")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
pkt_dev->count = value;
sprintf(pg_result, "OK: count=%llu",
- (unsigned long long) pkt_dev->count);
+ (unsigned long long)pkt_dev->count);
return count;
}
if (!strcmp(name, "src_mac_count")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
if (pkt_dev->src_mac_count != value) {
- pkt_dev->src_mac_count = value;
- pkt_dev->cur_src_mac_offset = 0;
- }
- sprintf(pg_result, "OK: src_mac_count=%d", pkt_dev->src_mac_count);
+ pkt_dev->src_mac_count = value;
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: src_mac_count=%d",
+ pkt_dev->src_mac_count);
return count;
}
if (!strcmp(name, "dst_mac_count")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
if (pkt_dev->dst_mac_count != value) {
- pkt_dev->dst_mac_count = value;
- pkt_dev->cur_dst_mac_offset = 0;
- }
- sprintf(pg_result, "OK: dst_mac_count=%d", pkt_dev->dst_mac_count);
+ pkt_dev->dst_mac_count = value;
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: dst_mac_count=%d",
+ pkt_dev->dst_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "node")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (node_possible(value)) {
+ pkt_dev->node = value;
+ sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
+ }
+ else
+ sprintf(pg_result, "ERROR: node not possible");
return count;
}
if (!strcmp(name, "flag")) {
- char f[32];
- memset(f, 0, 32);
+ char f[32];
+ memset(f, 0, 32);
len = strn_len(&user_buffer[i], sizeof(f) - 1);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
i += len;
- if (strcmp(f, "IPSRC_RND") == 0)
- pkt_dev->flags |= F_IPSRC_RND;
-
- else if (strcmp(f, "!IPSRC_RND") == 0)
- pkt_dev->flags &= ~F_IPSRC_RND;
-
- else if (strcmp(f, "TXSIZE_RND") == 0)
- pkt_dev->flags |= F_TXSIZE_RND;
-
- else if (strcmp(f, "!TXSIZE_RND") == 0)
- pkt_dev->flags &= ~F_TXSIZE_RND;
-
- else if (strcmp(f, "IPDST_RND") == 0)
- pkt_dev->flags |= F_IPDST_RND;
-
- else if (strcmp(f, "!IPDST_RND") == 0)
- pkt_dev->flags &= ~F_IPDST_RND;
-
- else if (strcmp(f, "UDPSRC_RND") == 0)
- pkt_dev->flags |= F_UDPSRC_RND;
-
- else if (strcmp(f, "!UDPSRC_RND") == 0)
- pkt_dev->flags &= ~F_UDPSRC_RND;
-
- else if (strcmp(f, "UDPDST_RND") == 0)
- pkt_dev->flags |= F_UDPDST_RND;
-
- else if (strcmp(f, "!UDPDST_RND") == 0)
- pkt_dev->flags &= ~F_UDPDST_RND;
-
- else if (strcmp(f, "MACSRC_RND") == 0)
- pkt_dev->flags |= F_MACSRC_RND;
-
- else if (strcmp(f, "!MACSRC_RND") == 0)
- pkt_dev->flags &= ~F_MACSRC_RND;
-
- else if (strcmp(f, "MACDST_RND") == 0)
- pkt_dev->flags |= F_MACDST_RND;
-
- else if (strcmp(f, "!MACDST_RND") == 0)
- pkt_dev->flags &= ~F_MACDST_RND;
-
- else {
- sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, TXSIZE_RND, UDPSRC_RND, UDPDST_RND, MACSRC_RND, MACDST_RND\n");
- return count;
- }
+ if (strcmp(f, "IPSRC_RND") == 0)
+ pkt_dev->flags |= F_IPSRC_RND;
+
+ else if (strcmp(f, "!IPSRC_RND") == 0)
+ pkt_dev->flags &= ~F_IPSRC_RND;
+
+ else if (strcmp(f, "TXSIZE_RND") == 0)
+ pkt_dev->flags |= F_TXSIZE_RND;
+
+ else if (strcmp(f, "!TXSIZE_RND") == 0)
+ pkt_dev->flags &= ~F_TXSIZE_RND;
+
+ else if (strcmp(f, "IPDST_RND") == 0)
+ pkt_dev->flags |= F_IPDST_RND;
+
+ else if (strcmp(f, "!IPDST_RND") == 0)
+ pkt_dev->flags &= ~F_IPDST_RND;
+
+ else if (strcmp(f, "UDPSRC_RND") == 0)
+ pkt_dev->flags |= F_UDPSRC_RND;
+
+ else if (strcmp(f, "!UDPSRC_RND") == 0)
+ pkt_dev->flags &= ~F_UDPSRC_RND;
+
+ else if (strcmp(f, "UDPDST_RND") == 0)
+ pkt_dev->flags |= F_UDPDST_RND;
+
+ else if (strcmp(f, "!UDPDST_RND") == 0)
+ pkt_dev->flags &= ~F_UDPDST_RND;
+
+ else if (strcmp(f, "MACSRC_RND") == 0)
+ pkt_dev->flags |= F_MACSRC_RND;
+
+ else if (strcmp(f, "!MACSRC_RND") == 0)
+ pkt_dev->flags &= ~F_MACSRC_RND;
+
+ else if (strcmp(f, "MACDST_RND") == 0)
+ pkt_dev->flags |= F_MACDST_RND;
+
+ else if (strcmp(f, "!MACDST_RND") == 0)
+ pkt_dev->flags &= ~F_MACDST_RND;
+
+ else if (strcmp(f, "MPLS_RND") == 0)
+ pkt_dev->flags |= F_MPLS_RND;
+
+ else if (strcmp(f, "!MPLS_RND") == 0)
+ pkt_dev->flags &= ~F_MPLS_RND;
+
+ else if (strcmp(f, "VID_RND") == 0)
+ pkt_dev->flags |= F_VID_RND;
+
+ else if (strcmp(f, "!VID_RND") == 0)
+ pkt_dev->flags &= ~F_VID_RND;
+
+ else if (strcmp(f, "SVID_RND") == 0)
+ pkt_dev->flags |= F_SVID_RND;
+
+ else if (strcmp(f, "!SVID_RND") == 0)
+ pkt_dev->flags &= ~F_SVID_RND;
+
+ else if (strcmp(f, "FLOW_SEQ") == 0)
+ pkt_dev->flags |= F_FLOW_SEQ;
+
+ else if (strcmp(f, "QUEUE_MAP_RND") == 0)
+ pkt_dev->flags |= F_QUEUE_MAP_RND;
+
+ else if (strcmp(f, "!QUEUE_MAP_RND") == 0)
+ pkt_dev->flags &= ~F_QUEUE_MAP_RND;
+
+ else if (strcmp(f, "QUEUE_MAP_CPU") == 0)
+ pkt_dev->flags |= F_QUEUE_MAP_CPU;
+
+ else if (strcmp(f, "!QUEUE_MAP_CPU") == 0)
+ pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
+#ifdef CONFIG_XFRM
+ else if (strcmp(f, "IPSEC") == 0)
+ pkt_dev->flags |= F_IPSEC_ON;
+#endif
+
+ else if (strcmp(f, "!IPV6") == 0)
+ pkt_dev->flags &= ~F_IPV6;
+
+ else if (strcmp(f, "NODE_ALLOC") == 0)
+ pkt_dev->flags |= F_NODE;
+
+ else if (strcmp(f, "!NODE_ALLOC") == 0)
+ pkt_dev->flags &= ~F_NODE;
+
+ else if (strcmp(f, "UDPCSUM") == 0)
+ pkt_dev->flags |= F_UDPCSUM;
+
+ else if (strcmp(f, "!UDPCSUM") == 0)
+ pkt_dev->flags &= ~F_UDPCSUM;
+
+ else {
+ sprintf(pg_result,
+ "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
+ f,
+ "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
+ "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
+ "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+#ifdef CONFIG_XFRM
+ "IPSEC, "
+#endif
+ "NODE_ALLOC\n");
+ return count;
+ }
sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
- if (strcmp(buf, pkt_dev->dst_min) != 0) {
- memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strncpy(pkt_dev->dst_min, buf, len);
- pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
- pkt_dev->cur_daddr = pkt_dev->daddr_min;
- }
- if(debug)
- printk("pktgen: dst_min set to: %s\n", pkt_dev->dst_min);
- i += len;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_min) != 0) {
+ memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
+ strncpy(pkt_dev->dst_min, buf, len);
+ pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ }
+ if (debug)
+ pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
+ i += len;
sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
return count;
}
if (!strcmp(name, "dst_max")) {
len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
- if (strcmp(buf, pkt_dev->dst_max) != 0) {
- memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strncpy(pkt_dev->dst_max, buf, len);
- pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
- pkt_dev->cur_daddr = pkt_dev->daddr_max;
- }
- if(debug)
- printk("pktgen: dst_max set to: %s\n", pkt_dev->dst_max);
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_max) != 0) {
+ memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
+ strncpy(pkt_dev->dst_max, buf, len);
+ pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+ pkt_dev->cur_daddr = pkt_dev->daddr_max;
+ }
+ if (debug)
+ pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
i += len;
sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
return count;
}
if (!strcmp(name, "dst6")) {
len = strn_len(&user_buffer[i], sizeof(buf) - 1);
- if (len < 0) return len;
+ if (len < 0)
+ return len;
pkt_dev->flags |= F_IPV6;
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
+ buf[len] = 0;
- scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
- if(debug)
- printk("pktgen: dst6 set to: %s\n", buf);
+ if (debug)
+ pr_debug("dst6 set to: %s\n", buf);
- i += len;
+ i += len;
sprintf(pg_result, "OK: dst6=%s", buf);
return count;
}
if (!strcmp(name, "dst6_min")) {
len = strn_len(&user_buffer[i], sizeof(buf) - 1);
- if (len < 0) return len;
+ if (len < 0)
+ return len;
pkt_dev->flags |= F_IPV6;
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
+ buf[len] = 0;
- scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->min_in6_daddr);
- if(debug)
- printk("pktgen: dst6_min set to: %s\n", buf);
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
+ if (debug)
+ pr_debug("dst6_min set to: %s\n", buf);
- i += len;
+ i += len;
sprintf(pg_result, "OK: dst6_min=%s", buf);
return count;
}
if (!strcmp(name, "dst6_max")) {
len = strn_len(&user_buffer[i], sizeof(buf) - 1);
- if (len < 0) return len;
+ if (len < 0)
+ return len;
pkt_dev->flags |= F_IPV6;
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
+ buf[len] = 0;
- scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
- fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
- if(debug)
- printk("pktgen: dst6_max set to: %s\n", buf);
+ if (debug)
+ pr_debug("dst6_max set to: %s\n", buf);
- i += len;
+ i += len;
sprintf(pg_result, "OK: dst6_max=%s", buf);
return count;
}
if (!strcmp(name, "src6")) {
len = strn_len(&user_buffer[i], sizeof(buf) - 1);
- if (len < 0) return len;
+ if (len < 0)
+ return len;
pkt_dev->flags |= F_IPV6;
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
+ buf[len] = 0;
- scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
- fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+ in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
- if(debug)
- printk("pktgen: src6 set to: %s\n", buf);
-
- i += len;
+ if (debug)
+ pr_debug("src6 set to: %s\n", buf);
+
+ i += len;
sprintf(pg_result, "OK: src6=%s", buf);
return count;
}
if (!strcmp(name, "src_min")) {
len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
- if (len < 0) { return len; }
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
- if (strcmp(buf, pkt_dev->src_min) != 0) {
- memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strncpy(pkt_dev->src_min, buf, len);
- pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
- pkt_dev->cur_saddr = pkt_dev->saddr_min;
- }
- if(debug)
- printk("pktgen: src_min set to: %s\n", pkt_dev->src_min);
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_min) != 0) {
+ memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
+ strncpy(pkt_dev->src_min, buf, len);
+ pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ }
+ if (debug)
+ pr_debug("src_min set to: %s\n", pkt_dev->src_min);
i += len;
sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
return count;
}
if (!strcmp(name, "src_max")) {
len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
- if (len < 0) { return len; }
- if (copy_from_user(buf, &user_buffer[i], len))
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
- buf[len] = 0;
- if (strcmp(buf, pkt_dev->src_max) != 0) {
- memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strncpy(pkt_dev->src_max, buf, len);
- pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
- pkt_dev->cur_saddr = pkt_dev->saddr_max;
- }
- if(debug)
- printk("pktgen: src_max set to: %s\n", pkt_dev->src_max);
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_max) != 0) {
+ memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
+ strncpy(pkt_dev->src_max, buf, len);
+ pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+ pkt_dev->cur_saddr = pkt_dev->saddr_max;
+ }
+ if (debug)
+ pr_debug("src_max set to: %s\n", pkt_dev->src_max);
i += len;
sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
return count;
}
if (!strcmp(name, "dst_mac")) {
- char *v = valstr;
- unsigned char old_dmac[ETH_ALEN];
- unsigned char *m = pkt_dev->dst_mac;
- memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN);
-
len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
memset(valstr, 0, sizeof(valstr));
- if( copy_from_user(valstr, &user_buffer[i], len))
+ if (copy_from_user(valstr, &user_buffer[i], len))
return -EFAULT;
- i += len;
-
- for(*m = 0;*v && m < pkt_dev->dst_mac + 6; v++) {
- if (*v >= '0' && *v <= '9') {
- *m *= 16;
- *m += *v - '0';
- }
- if (*v >= 'A' && *v <= 'F') {
- *m *= 16;
- *m += *v - 'A' + 10;
- }
- if (*v >= 'a' && *v <= 'f') {
- *m *= 16;
- *m += *v - 'a' + 10;
- }
- if (*v == ':') {
- m++;
- *m = 0;
- }
- }
+ if (!mac_pton(valstr, pkt_dev->dst_mac))
+ return -EINVAL;
/* Set up Dest MAC */
- if (compare_ether_addr(old_dmac, pkt_dev->dst_mac))
- memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
-
- sprintf(pg_result, "OK: dstmac");
+ ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac);
+
+ sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
return count;
}
if (!strcmp(name, "src_mac")) {
- char *v = valstr;
- unsigned char *m = pkt_dev->src_mac;
-
len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
memset(valstr, 0, sizeof(valstr));
- if( copy_from_user(valstr, &user_buffer[i], len))
+ if (copy_from_user(valstr, &user_buffer[i], len))
return -EFAULT;
- i += len;
- for(*m = 0;*v && m < pkt_dev->src_mac + 6; v++) {
- if (*v >= '0' && *v <= '9') {
- *m *= 16;
- *m += *v - '0';
- }
- if (*v >= 'A' && *v <= 'F') {
- *m *= 16;
- *m += *v - 'A' + 10;
- }
- if (*v >= 'a' && *v <= 'f') {
- *m *= 16;
- *m += *v - 'a' + 10;
- }
- if (*v == ':') {
- m++;
- *m = 0;
- }
- }
+ if (!mac_pton(valstr, pkt_dev->src_mac))
+ return -EINVAL;
+ /* Set up Src MAC */
+ ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac);
- sprintf(pg_result, "OK: srcmac");
+ sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
return count;
}
- if (!strcmp(name, "clear_counters")) {
- pktgen_clear_counters(pkt_dev);
- sprintf(pg_result, "OK: Clearing counters.\n");
- return count;
- }
+ if (!strcmp(name, "clear_counters")) {
+ pktgen_clear_counters(pkt_dev);
+ sprintf(pg_result, "OK: Clearing counters.\n");
+ return count;
+ }
if (!strcmp(name, "flows")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
if (value > MAX_CFLOWS)
value = MAX_CFLOWS;
@@ -1271,270 +1484,549 @@ static ssize_t pktgen_if_write(struct file *file, const char __user *user_buffer
sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
return count;
}
+#ifdef CONFIG_XFRM
+ if (!strcmp(name, "spi")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+ i += len;
+ pkt_dev->spi = value;
+ sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
+ return count;
+ }
+#endif
if (!strcmp(name, "flowlen")) {
len = num_arg(&user_buffer[i], 10, &value);
- if (len < 0) { return len; }
+ if (len < 0)
+ return len;
+
i += len;
pkt_dev->lflow = value;
sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
return count;
}
-
+
+ if (!strcmp(name, "queue_map_min")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->queue_map_min = value;
+ sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_max")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->queue_map_max = value;
+ sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
+ return count;
+ }
+
+ if (!strcmp(name, "mpls")) {
+ unsigned int n, cnt;
+
+ len = get_labels(&user_buffer[i], pkt_dev);
+ if (len < 0)
+ return len;
+ i += len;
+ cnt = sprintf(pg_result, "OK: mpls=");
+ for (n = 0; n < pkt_dev->nr_labels; n++)
+ cnt += sprintf(pg_result + cnt,
+ "%08x%s", ntohl(pkt_dev->labels[n]),
+ n == pkt_dev->nr_labels-1 ? "" : ",");
+
+ if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN auto turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value <= 4095) {
+ pkt_dev->vlan_id = value; /* turn on VLAN */
+
+ if (debug)
+ pr_debug("VLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ pr_debug("MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_p = value;
+ sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_cfi = value;
+ sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
+ pkt_dev->svlan_id = value; /* turn on SVLAN */
+
+ if (debug)
+ pr_debug("SVLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ pr_debug("MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_p = value;
+ sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_cfi = value;
+ sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "tos")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (len == 2) {
+ pkt_dev->tos = tmp_value;
+ sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
+ } else {
+ sprintf(pg_result, "ERROR: tos must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "traffic_class")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (len == 2) {
+ pkt_dev->traffic_class = tmp_value;
+ sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
+ } else {
+ sprintf(pg_result, "ERROR: traffic_class must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "skb_priority")) {
+ len = num_arg(&user_buffer[i], 9, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->skb_priority = value;
+ sprintf(pg_result, "OK: skb_priority=%i",
+ pkt_dev->skb_priority);
+ return count;
+ }
+
sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
return -EINVAL;
}
static int pktgen_if_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_if_show, PDE(inode)->data);
+ return single_open(file, pktgen_if_show, PDE_DATA(inode));
}
-static struct file_operations pktgen_if_fops = {
- .owner = THIS_MODULE,
- .open = pktgen_if_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_if_write,
- .release = single_release,
+static const struct file_operations pktgen_if_fops = {
+ .owner = THIS_MODULE,
+ .open = pktgen_if_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_if_write,
+ .release = single_release,
};
static int pktgen_thread_show(struct seq_file *seq, void *v)
{
- struct pktgen_thread *t = seq->private;
- struct pktgen_dev *pkt_dev = NULL;
+ struct pktgen_thread *t = seq->private;
+ const struct pktgen_dev *pkt_dev;
BUG_ON(!t);
- seq_printf(seq, "Name: %s max_before_softirq: %d\n",
- t->name, t->max_before_softirq);
+ seq_puts(seq, "Running: ");
+
+ if_lock(t);
+ list_for_each_entry(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running)
+ seq_printf(seq, "%s ", pkt_dev->odevname);
- seq_printf(seq, "Running: ");
-
- if_lock(t);
- for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)
- if(pkt_dev->running)
- seq_printf(seq, "%s ", pkt_dev->ifname);
-
- seq_printf(seq, "\nStopped: ");
+ seq_puts(seq, "\nStopped: ");
- for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)
- if(!pkt_dev->running)
- seq_printf(seq, "%s ", pkt_dev->ifname);
+ list_for_each_entry(pkt_dev, &t->if_list, list)
+ if (!pkt_dev->running)
+ seq_printf(seq, "%s ", pkt_dev->odevname);
if (t->result[0])
seq_printf(seq, "\nResult: %s\n", t->result);
else
- seq_printf(seq, "\nResult: NA\n");
+ seq_puts(seq, "\nResult: NA\n");
- if_unlock(t);
+ if_unlock(t);
return 0;
}
static ssize_t pktgen_thread_write(struct file *file,
- const char __user *user_buffer,
- size_t count, loff_t *offset)
+ const char __user * user_buffer,
+ size_t count, loff_t * offset)
{
- struct seq_file *seq = (struct seq_file *) file->private_data;
- struct pktgen_thread *t = seq->private;
- int i = 0, max, len, ret;
+ struct seq_file *seq = file->private_data;
+ struct pktgen_thread *t = seq->private;
+ int i, max, len, ret;
char name[40];
- char *pg_result;
- unsigned long value = 0;
+ char *pg_result;
if (count < 1) {
- // sprintf(pg_result, "Wrong command format");
+ // sprintf(pg_result, "Wrong command format");
return -EINVAL;
}
- max = count - i;
- len = count_trail_chars(&user_buffer[i], max);
- if (len < 0)
+ max = count;
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0)
return len;
- i += len;
+ i = len;
/* Read variable name */
len = strn_len(&user_buffer[i], sizeof(name) - 1);
- if (len < 0)
+ if (len < 0)
return len;
-
+
memset(name, 0, sizeof(name));
if (copy_from_user(name, &user_buffer[i], len))
return -EFAULT;
i += len;
- max = count -i;
+ max = count - i;
len = count_trail_chars(&user_buffer[i], max);
- if (len < 0)
+ if (len < 0)
return len;
i += len;
if (debug)
- printk("pktgen: t=%s, count=%lu\n", name,
- (unsigned long) count);
+ pr_debug("t=%s, count=%lu\n", name, (unsigned long)count);
- if(!t) {
- printk("pktgen: ERROR: No thread\n");
+ if (!t) {
+ pr_err("ERROR: No thread\n");
ret = -EINVAL;
goto out;
}
pg_result = &(t->result[0]);
- if (!strcmp(name, "add_device")) {
- char f[32];
- memset(f, 0, 32);
+ if (!strcmp(name, "add_device")) {
+ char f[32];
+ memset(f, 0, 32);
len = strn_len(&user_buffer[i], sizeof(f) - 1);
- if (len < 0) {
- ret = len;
+ if (len < 0) {
+ ret = len;
goto out;
}
- if( copy_from_user(f, &user_buffer[i], len) )
+ if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
i += len;
- thread_lock();
- pktgen_add_device(t, f);
- thread_unlock();
- ret = count;
- sprintf(pg_result, "OK: add_device=%s", f);
+ mutex_lock(&pktgen_thread_lock);
+ ret = pktgen_add_device(t, f);
+ mutex_unlock(&pktgen_thread_lock);
+ if (!ret) {
+ ret = count;
+ sprintf(pg_result, "OK: add_device=%s", f);
+ } else
+ sprintf(pg_result, "ERROR: can not add device %s", f);
goto out;
}
- if (!strcmp(name, "rem_device_all")) {
- thread_lock();
- t->control |= T_REMDEV;
- thread_unlock();
- schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
+ if (!strcmp(name, "rem_device_all")) {
+ mutex_lock(&pktgen_thread_lock);
+ t->control |= T_REMDEVALL;
+ mutex_unlock(&pktgen_thread_lock);
+ schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
ret = count;
- sprintf(pg_result, "OK: rem_device_all");
+ sprintf(pg_result, "OK: rem_device_all");
goto out;
}
- if (!strcmp(name, "max_before_softirq")) {
- len = num_arg(&user_buffer[i], 10, &value);
- thread_lock();
- t->max_before_softirq = value;
- thread_unlock();
- ret = count;
- sprintf(pg_result, "OK: max_before_softirq=%lu", value);
+ if (!strcmp(name, "max_before_softirq")) {
+ sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use");
+ ret = count;
goto out;
}
ret = -EINVAL;
- out:
-
+out:
return ret;
}
static int pktgen_thread_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_thread_show, PDE(inode)->data);
+ return single_open(file, pktgen_thread_show, PDE_DATA(inode));
}
-static struct file_operations pktgen_thread_fops = {
- .owner = THIS_MODULE,
- .open = pktgen_thread_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_thread_write,
- .release = single_release,
+static const struct file_operations pktgen_thread_fops = {
+ .owner = THIS_MODULE,
+ .open = pktgen_thread_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_thread_write,
+ .release = single_release,
};
/* Think find or remove for NN */
-static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove)
+static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
+ const char *ifname, int remove)
{
struct pktgen_thread *t;
struct pktgen_dev *pkt_dev = NULL;
+ bool exact = (remove == FIND);
- t = pktgen_threads;
-
- while (t) {
- pkt_dev = pktgen_find_dev(t, ifname);
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ pkt_dev = pktgen_find_dev(t, ifname, exact);
if (pkt_dev) {
- if(remove) {
- if_lock(t);
- pktgen_remove_device(t, pkt_dev);
- if_unlock(t);
- }
+ if (remove) {
+ if_lock(t);
+ pkt_dev->removal_mark = 1;
+ t->control |= T_REMDEV;
+ if_unlock(t);
+ }
break;
}
- t = t->next;
}
- return pkt_dev;
+ return pkt_dev;
}
-static struct pktgen_dev *pktgen_NN_threads(const char* ifname, int remove)
+/*
+ * mark a device for removal
+ */
+static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
{
struct pktgen_dev *pkt_dev = NULL;
- thread_lock();
- pkt_dev = __pktgen_NN_threads(ifname, remove);
- thread_unlock();
- return pkt_dev;
+ const int max_tries = 10, msec_per_try = 125;
+ int i = 0;
+
+ mutex_lock(&pktgen_thread_lock);
+ pr_debug("%s: marking %s for removal\n", __func__, ifname);
+
+ while (1) {
+
+ pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);
+ if (pkt_dev == NULL)
+ break; /* success */
+
+ mutex_unlock(&pktgen_thread_lock);
+ pr_debug("%s: waiting for %s to disappear....\n",
+ __func__, ifname);
+ schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+ mutex_lock(&pktgen_thread_lock);
+
+ if (++i >= max_tries) {
+ pr_err("%s: timed out after waiting %d msec for device %s to be removed\n",
+ __func__, msec_per_try * i, ifname);
+ break;
+ }
+
+ }
+
+ mutex_unlock(&pktgen_thread_lock);
}
-static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
+static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)
{
- struct net_device *dev = (struct net_device *)(ptr);
+ struct pktgen_thread *t;
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ struct pktgen_dev *pkt_dev;
+
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
+ if (pkt_dev->odev != dev)
+ continue;
+
+ proc_remove(pkt_dev->entry);
+
+ pkt_dev->entry = proc_create_data(dev->name, 0600,
+ pn->proc_dir,
+ &pktgen_if_fops,
+ pkt_dev);
+ if (!pkt_dev->entry)
+ pr_err("can't move proc entry for '%s'\n",
+ dev->name);
+ break;
+ }
+ }
+}
+
+static int pktgen_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id);
+
+ if (pn->pktgen_exiting)
+ return NOTIFY_DONE;
/* It is OK that we do not hold the group lock right now,
* as we run under the RTNL lock.
*/
switch (event) {
- case NETDEV_CHANGEADDR:
- case NETDEV_GOING_DOWN:
- case NETDEV_DOWN:
- case NETDEV_UP:
- /* Ignore for now */
+ case NETDEV_CHANGENAME:
+ pktgen_change_name(pn, dev);
break;
-
+
case NETDEV_UNREGISTER:
- pktgen_NN_threads(dev->name, REMOVE);
+ pktgen_mark_device(pn, dev->name);
break;
- };
+ }
return NOTIFY_DONE;
}
+static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev,
+ const char *ifname)
+{
+ char b[IFNAMSIZ+5];
+ int i;
+
+ for (i = 0; ifname[i] != '@'; i++) {
+ if (i == IFNAMSIZ)
+ break;
+
+ b[i] = ifname[i];
+ }
+ b[i] = 0;
+
+ return dev_get_by_name(pn->net, b);
+}
+
+
/* Associate pktgen_dev with a device. */
-static struct net_device* pktgen_setup_dev(struct pktgen_dev *pkt_dev) {
+static int pktgen_setup_dev(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev, const char *ifname)
+{
struct net_device *odev;
+ int err;
/* Clean old setups */
-
if (pkt_dev->odev) {
dev_put(pkt_dev->odev);
- pkt_dev->odev = NULL;
- }
-
- odev = dev_get_by_name(pkt_dev->ifname);
+ pkt_dev->odev = NULL;
+ }
+ odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);
if (!odev) {
- printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname);
- goto out;
+ pr_err("no such netdevice: \"%s\"\n", ifname);
+ return -ENODEV;
}
+
if (odev->type != ARPHRD_ETHER) {
- printk("pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname);
- goto out_put;
- }
- if (!netif_running(odev)) {
- printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname);
- goto out_put;
+ pr_err("not an ethernet device: \"%s\"\n", ifname);
+ err = -EINVAL;
+ } else if (!netif_running(odev)) {
+ pr_err("device is down: \"%s\"\n", ifname);
+ err = -ENETDOWN;
+ } else {
+ pkt_dev->odev = odev;
+ return 0;
}
- pkt_dev->odev = odev;
-
- return pkt_dev->odev;
-out_put:
dev_put(odev);
-out:
- return NULL;
-
+ return err;
}
/* Read pkt_dev from the interface and set up internal pktgen_dev
@@ -1542,89 +2034,109 @@ out:
*/
static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
{
- /* Try once more, just in case it works now. */
- if (!pkt_dev->odev)
- pktgen_setup_dev(pkt_dev);
-
- if (!pkt_dev->odev) {
- printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n");
- sprintf(pkt_dev->result, "ERROR: pkt_dev->odev == NULL in setup_inject.\n");
- return;
- }
-
- /* Default to the interface's mac if not explicitly set. */
+ int ntxq;
- if (is_zero_ether_addr(pkt_dev->src_mac))
- memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN);
+ if (!pkt_dev->odev) {
+ pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n");
+ sprintf(pkt_dev->result,
+ "ERROR: pkt_dev->odev == NULL in setup_inject.\n");
+ return;
+ }
- /* Set up Dest MAC */
- memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
+ /* make sure that we don't pick a non-existing transmit queue */
+ ntxq = pkt_dev->odev->real_num_tx_queues;
- /* Set up pkt size */
- pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
-
- if(pkt_dev->flags & F_IPV6) {
- /*
- * Skip this automatic address setting until locks or functions
- * gets exported
- */
+ if (ntxq <= pkt_dev->queue_map_min) {
+ pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
+ }
+ if (pkt_dev->queue_map_max >= ntxq) {
+ pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
+ }
-#ifdef NOTNOW
- int i, set = 0, err=1;
+ /* Default to the interface's mac if not explicitly set. */
+
+ if (is_zero_ether_addr(pkt_dev->src_mac))
+ ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);
+
+ /* Set up Dest MAC */
+ ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);
+
+ if (pkt_dev->flags & F_IPV6) {
+ int i, set = 0, err = 1;
struct inet6_dev *idev;
- for(i=0; i< IN6_ADDR_HSIZE; i++)
- if(pkt_dev->cur_in6_saddr.s6_addr[i]) {
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
set = 1;
break;
}
- if(!set) {
-
+ if (!set) {
+
/*
* Use linklevel address if unconfigured.
*
* use ipv6_get_lladdr if/when it's get exported
*/
-
- read_lock(&addrconf_lock);
- if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) {
+ rcu_read_lock();
+ idev = __in6_dev_get(pkt_dev->odev);
+ if (idev) {
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
- if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) {
- ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &ifp->addr);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if ((ifp->scope & IFA_LINK) &&
+ !(ifp->flags & IFA_F_TENTATIVE)) {
+ pkt_dev->cur_in6_saddr = ifp->addr;
err = 0;
break;
}
}
read_unlock_bh(&idev->lock);
}
- read_unlock(&addrconf_lock);
- if(err) printk("pktgen: ERROR: IPv6 link address not availble.\n");
+ rcu_read_unlock();
+ if (err)
+ pr_err("ERROR: IPv6 link address not available\n");
}
-#endif
- }
- else {
+ } else {
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
pkt_dev->saddr_min = 0;
pkt_dev->saddr_max = 0;
if (strlen(pkt_dev->src_min) == 0) {
-
- struct in_device *in_dev;
+
+ struct in_device *in_dev;
rcu_read_lock();
in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
if (in_dev->ifa_list) {
- pkt_dev->saddr_min = in_dev->ifa_list->ifa_address;
+ pkt_dev->saddr_min =
+ in_dev->ifa_list->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
}
rcu_read_unlock();
- }
- else {
+ } else {
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
}
@@ -1632,167 +2144,308 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
}
- /* Initialize current values. */
- pkt_dev->cur_dst_mac_offset = 0;
- pkt_dev->cur_src_mac_offset = 0;
- pkt_dev->cur_saddr = pkt_dev->saddr_min;
- pkt_dev->cur_daddr = pkt_dev->daddr_min;
- pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
- pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+ /* Initialize current values. */
+ pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size)
+ pkt_dev->max_pkt_size = pkt_dev->min_pkt_size;
+
+ pkt_dev->cur_dst_mac_offset = 0;
+ pkt_dev->cur_src_mac_offset = 0;
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
pkt_dev->nflows = 0;
}
-static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us)
-{
- __u64 start;
- __u64 now;
- start = now = getCurUs();
- printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now));
- while (now < spin_until_us) {
- /* TODO: optimize sleeping behavior */
- if (spin_until_us - now > jiffies_to_usecs(1)+1)
- schedule_timeout_interruptible(1);
- else if (spin_until_us - now > 100) {
- do_softirq();
- if (!pkt_dev->running)
- return;
- if (need_resched())
+static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
+{
+ ktime_t start_time, end_time;
+ s64 remaining;
+ struct hrtimer_sleeper t;
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_set_expires(&t.timer, spin_until);
+
+ remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
+ if (remaining <= 0) {
+ pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ return;
+ }
+
+ start_time = ktime_get();
+ if (remaining < 100000) {
+ /* for small delays (<100us), just loop until limit is reached */
+ do {
+ end_time = ktime_get();
+ } while (ktime_compare(end_time, spin_until) < 0);
+ } else {
+ /* see do_nanosleep */
+ hrtimer_init_sleeper(&t, current);
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
schedule();
+
+ hrtimer_cancel(&t.timer);
+ } while (t.task && pkt_dev->running && !signal_pending(current));
+ __set_current_state(TASK_RUNNING);
+ end_time = ktime_get();
+ }
+
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+ pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+}
+
+static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
+{
+ pkt_dev->pkt_overhead = 0;
+ pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
+ pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
+ pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
+}
+
+static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow)
+{
+ return !!(pkt_dev->flows[flow].flags & F_INIT);
+}
+
+static inline int f_pick(struct pktgen_dev *pkt_dev)
+{
+ int flow = pkt_dev->curfl;
+
+ if (pkt_dev->flags & F_FLOW_SEQ) {
+ if (pkt_dev->flows[flow].count >= pkt_dev->lflow) {
+ /* reset time */
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ pkt_dev->curfl += 1;
+ if (pkt_dev->curfl >= pkt_dev->cflows)
+ pkt_dev->curfl = 0; /*reset */
}
+ } else {
+ flow = prandom_u32() % pkt_dev->cflows;
+ pkt_dev->curfl = flow;
- now = getCurUs();
+ if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ }
}
- pkt_dev->idle_acc += now - start;
+ return pkt_dev->curfl;
}
+#ifdef CONFIG_XFRM
+/* If there was already an IPSEC SA, we keep it as is, else
+ * we go look for it ...
+*/
+#define DUMMY_MARK 0
+static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
+{
+ struct xfrm_state *x = pkt_dev->flows[flow].x;
+ struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+ if (!x) {
+
+ if (pkt_dev->spi) {
+ /* We need as quick as possible to find the right SA
+ * Searching with minimum criteria to archieve this.
+ */
+ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
+ } else {
+ /* slow path: we dont already have xfrm_state */
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+ (xfrm_address_t *)&pkt_dev->cur_daddr,
+ (xfrm_address_t *)&pkt_dev->cur_saddr,
+ AF_INET,
+ pkt_dev->ipsmode,
+ pkt_dev->ipsproto, 0);
+ }
+ if (x) {
+ pkt_dev->flows[flow].x = x;
+ set_pkt_overhead(pkt_dev);
+ pkt_dev->pkt_overhead += x->props.header_len;
+ }
+
+ }
+}
+#endif
+static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
+{
+
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ pkt_dev->cur_queue_map = smp_processor_id();
+
+ else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
+ __u16 t;
+ if (pkt_dev->flags & F_QUEUE_MAP_RND) {
+ t = prandom_u32() %
+ (pkt_dev->queue_map_max -
+ pkt_dev->queue_map_min + 1)
+ + pkt_dev->queue_map_min;
+ } else {
+ t = pkt_dev->cur_queue_map + 1;
+ if (t > pkt_dev->queue_map_max)
+ t = pkt_dev->queue_map_min;
+ }
+ pkt_dev->cur_queue_map = t;
+ }
+ pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues;
+}
+
/* Increment/randomize headers according to flags and current values
* for IP src/dest, UDP src/dst port, MAC-Addr src/dst
*/
-static void mod_cur_headers(struct pktgen_dev *pkt_dev) {
- __u32 imn;
- __u32 imx;
- int flow = 0;
-
- if(pkt_dev->cflows) {
- flow = pktgen_random() % pkt_dev->cflows;
-
- if (pkt_dev->flows[flow].count > pkt_dev->lflow)
- pkt_dev->flows[flow].count = 0;
- }
+static void mod_cur_headers(struct pktgen_dev *pkt_dev)
+{
+ __u32 imn;
+ __u32 imx;
+ int flow = 0;
+ if (pkt_dev->cflows)
+ flow = f_pick(pkt_dev);
/* Deal with source MAC */
- if (pkt_dev->src_mac_count > 1) {
- __u32 mc;
- __u32 tmp;
-
- if (pkt_dev->flags & F_MACSRC_RND)
- mc = pktgen_random() % (pkt_dev->src_mac_count);
- else {
- mc = pkt_dev->cur_src_mac_offset++;
- if (pkt_dev->cur_src_mac_offset > pkt_dev->src_mac_count)
- pkt_dev->cur_src_mac_offset = 0;
- }
-
- tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
- pkt_dev->hh[11] = tmp;
- tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
- pkt_dev->hh[10] = tmp;
- tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
- pkt_dev->hh[9] = tmp;
- tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
- pkt_dev->hh[8] = tmp;
- tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
- pkt_dev->hh[7] = tmp;
- }
-
- /* Deal with Destination MAC */
- if (pkt_dev->dst_mac_count > 1) {
- __u32 mc;
- __u32 tmp;
-
- if (pkt_dev->flags & F_MACDST_RND)
- mc = pktgen_random() % (pkt_dev->dst_mac_count);
-
- else {
- mc = pkt_dev->cur_dst_mac_offset++;
- if (pkt_dev->cur_dst_mac_offset > pkt_dev->dst_mac_count) {
- pkt_dev->cur_dst_mac_offset = 0;
- }
- }
-
- tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
- pkt_dev->hh[5] = tmp;
- tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
- pkt_dev->hh[4] = tmp;
- tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
- pkt_dev->hh[3] = tmp;
- tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
- pkt_dev->hh[2] = tmp;
- tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
- pkt_dev->hh[1] = tmp;
- }
-
- if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
- if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src = ((pktgen_random() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) + pkt_dev->udp_src_min);
-
- else {
+ if (pkt_dev->src_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACSRC_RND)
+ mc = prandom_u32() % pkt_dev->src_mac_count;
+ else {
+ mc = pkt_dev->cur_src_mac_offset++;
+ if (pkt_dev->cur_src_mac_offset >=
+ pkt_dev->src_mac_count)
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+
+ tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[11] = tmp;
+ tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[10] = tmp;
+ tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[9] = tmp;
+ tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[8] = tmp;
+ tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
+ pkt_dev->hh[7] = tmp;
+ }
+
+ /* Deal with Destination MAC */
+ if (pkt_dev->dst_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ mc = prandom_u32() % pkt_dev->dst_mac_count;
+
+ else {
+ mc = pkt_dev->cur_dst_mac_offset++;
+ if (pkt_dev->cur_dst_mac_offset >=
+ pkt_dev->dst_mac_count) {
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ }
+
+ tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[5] = tmp;
+ tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[4] = tmp;
+ tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[3] = tmp;
+ tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[2] = tmp;
+ tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
+ pkt_dev->hh[1] = tmp;
+ }
+
+ if (pkt_dev->flags & F_MPLS_RND) {
+ unsigned int i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
+ pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
+ ((__force __be32)prandom_u32() &
+ htonl(0x000fffff));
+ }
+
+ if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ }
+
+ if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ }
+
+ if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ pkt_dev->cur_udp_src = prandom_u32() %
+ (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
+ + pkt_dev->udp_src_min;
+
+ else {
pkt_dev->cur_udp_src++;
if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max)
pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
- }
- }
-
- if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
- if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst = ((pktgen_random() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) + pkt_dev->udp_dst_min);
- }
- else {
+ }
+ }
+
+ if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
+ if (pkt_dev->flags & F_UDPDST_RND) {
+ pkt_dev->cur_udp_dst = prandom_u32() %
+ (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
+ + pkt_dev->udp_dst_min;
+ } else {
pkt_dev->cur_udp_dst++;
- if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
+ if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
- }
- }
+ }
+ }
if (!(pkt_dev->flags & F_IPV6)) {
- if ((imn = ntohl(pkt_dev->saddr_min)) < (imx = ntohl(pkt_dev->saddr_max))) {
+ imn = ntohl(pkt_dev->saddr_min);
+ imx = ntohl(pkt_dev->saddr_max);
+ if (imn < imx) {
__u32 t;
- if (pkt_dev->flags & F_IPSRC_RND)
- t = ((pktgen_random() % (imx - imn)) + imn);
+ if (pkt_dev->flags & F_IPSRC_RND)
+ t = prandom_u32() % (imx - imn) + imn;
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
- if (t > imx) {
+ if (t > imx)
t = imn;
- }
+
}
pkt_dev->cur_saddr = htonl(t);
}
-
- if (pkt_dev->cflows && pkt_dev->flows[flow].count != 0) {
+
+ if (pkt_dev->cflows && f_seen(pkt_dev, flow)) {
pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
} else {
-
- if ((imn = ntohl(pkt_dev->daddr_min)) < (imx = ntohl(pkt_dev->daddr_max))) {
+ imn = ntohl(pkt_dev->daddr_min);
+ imx = ntohl(pkt_dev->daddr_max);
+ if (imn < imx) {
__u32 t;
+ __be32 s;
if (pkt_dev->flags & F_IPDST_RND) {
- t = ((pktgen_random() % (imx - imn)) + imn);
- t = htonl(t);
-
- while( LOOPBACK(t) || MULTICAST(t) || BADCLASS(t) || ZERONET(t) || LOCAL_MCAST(t) ) {
- t = ((pktgen_random() % (imx - imn)) + imn);
- t = htonl(t);
- }
- pkt_dev->cur_daddr = t;
- }
-
- else {
+ do {
+ t = prandom_u32() %
+ (imx - imn) + imn;
+ s = htonl(t);
+ } while (ipv4_is_loopback(s) ||
+ ipv4_is_multicast(s) ||
+ ipv4_is_lbcast(s) ||
+ ipv4_is_zeronet(s) ||
+ ipv4_is_local_multicast(s));
+ pkt_dev->cur_daddr = s;
+ } else {
t = ntohl(pkt_dev->cur_daddr);
t++;
if (t > imx) {
@@ -1801,1254 +2454,1378 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) {
pkt_dev->cur_daddr = htonl(t);
}
}
- if(pkt_dev->cflows) {
- pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr;
+ if (pkt_dev->cflows) {
+ pkt_dev->flows[flow].flags |= F_INIT;
+ pkt_dev->flows[flow].cur_daddr =
+ pkt_dev->cur_daddr;
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC_ON)
+ get_ipsec_sa(pkt_dev, flow);
+#endif
pkt_dev->nflows++;
}
}
- }
- else /* IPV6 * */
- {
- if(pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
- pkt_dev->min_in6_daddr.s6_addr32[3] == 0);
- else {
+ } else { /* IPV6 * */
+
+ if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {
int i;
/* Only random destinations yet */
- for(i=0; i < 4; i++) {
+ for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- ((pktgen_random() |
- pkt_dev->min_in6_daddr.s6_addr32[i]) &
- pkt_dev->max_in6_daddr.s6_addr32[i]);
+ (((__force __be32)prandom_u32() |
+ pkt_dev->min_in6_daddr.s6_addr32[i]) &
+ pkt_dev->max_in6_daddr.s6_addr32[i]);
}
- }
+ }
}
- if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
- __u32 t;
- if (pkt_dev->flags & F_TXSIZE_RND) {
- t = ((pktgen_random() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size))
- + pkt_dev->min_pkt_size);
- }
- else {
+ if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
+ __u32 t;
+ if (pkt_dev->flags & F_TXSIZE_RND) {
+ t = prandom_u32() %
+ (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
+ + pkt_dev->min_pkt_size;
+ } else {
t = pkt_dev->cur_pkt_size + 1;
- if (t > pkt_dev->max_pkt_size)
+ if (t > pkt_dev->max_pkt_size)
t = pkt_dev->min_pkt_size;
- }
- pkt_dev->cur_pkt_size = t;
- }
+ }
+ pkt_dev->cur_pkt_size = t;
+ }
+
+ set_cur_queue_map(pkt_dev);
pkt_dev->flows[flow].count++;
}
-static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
- struct pktgen_dev *pkt_dev)
+#ifdef CONFIG_XFRM
+static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
+
+ [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */
+};
+
+static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
{
- struct sk_buff *skb = NULL;
- __u8 *eth;
- struct udphdr *udph;
- int datalen, iplen;
- struct iphdr *iph;
- struct pktgen_hdr *pgh = NULL;
-
- /* Update any of the values, used when we're incrementing various
- * fields.
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int err = 0;
+ struct net *net = dev_net(pkt_dev->odev);
+
+ if (!x)
+ return 0;
+ /* XXX: we dont support tunnel mode for now until
+ * we resolve the dst issue */
+ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
+ return 0;
+
+ /* But when user specify an valid SPI, transformation
+ * supports both transport/tunnel mode + ESP/AH type.
*/
- mod_cur_headers(pkt_dev);
+ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
+ skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+
+ rcu_read_lock_bh();
+ err = x->outer_mode->output(x, skb);
+ rcu_read_unlock_bh();
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
+ goto error;
+ }
+ err = x->type->output(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ goto error;
+ }
+ spin_lock_bh(&x->lock);
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+ spin_unlock_bh(&x->lock);
+error:
+ return err;
+}
- datalen = (odev->hard_header_len + 16) & ~0xf;
- skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + datalen, GFP_ATOMIC);
- if (!skb) {
- sprintf(pkt_dev->result, "No memory");
- return NULL;
+static void free_SAs(struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->cflows) {
+ /* let go of the SAs if we have them */
+ int i;
+ for (i = 0; i < pkt_dev->cflows; i++) {
+ struct xfrm_state *x = pkt_dev->flows[i].x;
+ if (x) {
+ xfrm_state_put(x);
+ pkt_dev->flows[i].x = NULL;
+ }
+ }
}
+}
- skb_reserve(skb, datalen);
+static int process_ipsec(struct pktgen_dev *pkt_dev,
+ struct sk_buff *skb, __be16 protocol)
+{
+ if (pkt_dev->flags & F_IPSEC_ON) {
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int nhead = 0;
+ if (x) {
+ int ret;
+ __u8 *eth;
+ struct iphdr *iph;
+
+ nhead = x->props.header_len - skb_headroom(skb);
+ if (nhead > 0) {
+ ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("Error expanding ipsec packet %d\n",
+ ret);
+ goto err;
+ }
+ }
- /* Reserve for ethernet and IP header */
- eth = (__u8 *) skb_push(skb, 14);
- iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
- udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
+ /* ipsec is not expecting ll header */
+ skb_pull(skb, ETH_HLEN);
+ ret = pktgen_output_ipsec(skb, pkt_dev);
+ if (ret) {
+ pr_err("Error creating ipsec packet %d\n", ret);
+ goto err;
+ }
+ /* restore ll */
+ eth = (__u8 *) skb_push(skb, ETH_HLEN);
+ memcpy(eth, pkt_dev->hh, 12);
+ *(u16 *) &eth[12] = protocol;
+
+ /* Update IPv4 header len as well as checksum value */
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(skb->len - ETH_HLEN);
+ ip_send_check(iph);
+ }
+ }
+ return 1;
+err:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
- memcpy(eth, pkt_dev->hh, 12);
- *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
+static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
+{
+ unsigned int i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
- datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8; /* Eth + IPh + UDPh */
- if (datalen < sizeof(struct pktgen_hdr))
- datalen = sizeof(struct pktgen_hdr);
-
- udph->source = htons(pkt_dev->cur_udp_src);
- udph->dest = htons(pkt_dev->cur_udp_dst);
- udph->len = htons(datalen + 8); /* DATA + udphdr */
- udph->check = 0; /* No checksum */
+ mpls--;
+ *mpls |= MPLS_STACK_BOTTOM;
+}
- iph->ihl = 5;
- iph->version = 4;
- iph->ttl = 32;
- iph->tos = 0;
- iph->protocol = IPPROTO_UDP; /* UDP */
- iph->saddr = pkt_dev->cur_saddr;
- iph->daddr = pkt_dev->cur_daddr;
- iph->frag_off = 0;
- iplen = 20 + 8 + datalen;
- iph->tot_len = htons(iplen);
- iph->check = 0;
- iph->check = ip_fast_csum((void *) iph, iph->ihl);
- skb->protocol = __constant_htons(ETH_P_IP);
- skb->mac.raw = ((u8 *)iph) - 14;
- skb->dev = odev;
- skb->pkt_type = PACKET_HOST;
+static inline __be16 build_tci(unsigned int id, unsigned int cfi,
+ unsigned int prio)
+{
+ return htons(id | (cfi << 12) | (prio << 13));
+}
+
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timeval timestamp;
+ struct pktgen_hdr *pgh;
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
+ pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ memset(skb_put(skb, datalen), 0, datalen);
+ } else {
int frags = pkt_dev->nfrags;
- int i;
+ int i, len;
+ int frag_len;
+
- pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8);
-
if (frags > MAX_SKB_FRAGS)
frags = MAX_SKB_FRAGS;
- if (datalen > frags*PAGE_SIZE) {
- skb_put(skb, datalen-frags*PAGE_SIZE);
- datalen = frags*PAGE_SIZE;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ memset(skb_put(skb, len), 0, len);
+ datalen = frags * PAGE_SIZE;
}
i = 0;
+ frag_len = (datalen/frags) < PAGE_SIZE ?
+ (datalen/frags) : PAGE_SIZE;
while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ get_page(pkt_dev->page);
+ skb_frag_set_page(skb, i, pkt_dev->page);
skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
+ /*last fragment, fill rest of data*/
+ if (i == (frags - 1))
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i],
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+ else
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
i++;
skb_shinfo(skb)->nr_frags = i;
}
+ }
- while (i < frags) {
- int rem;
+ /* Stamp the time, and sequence number,
+ * convert them to network byte order
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
- if (i == 0)
- break;
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+}
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
+static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
+ struct pktgen_dev *pkt_dev,
+ unsigned int extralen)
+{
+ struct sk_buff *skb = NULL;
+ unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen +
+ pkt_dev->pkt_overhead;
- skb_shinfo(skb)->frags[i - 1].size -= rem;
+ if (pkt_dev->flags & F_NODE) {
+ int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
- skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
+ skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node);
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
}
+ } else {
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
- /* Stamp the time, and sequence number, convert them to network byte order */
-
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
- pkt_dev->seq_num++;
-
return skb;
}
-/*
- * scan_ip6, fmt_ip taken from dietlibc-0.21
- * Author Felix von Leitner <felix-dietlibc@fefe.de>
- *
- * Slightly modified for kernel.
- * Should be candidate for net/ipv4/utils.c
- * --ro
- */
-
-static unsigned int scan_ip6(const char *s,char ip[16])
+static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
{
- unsigned int i;
- unsigned int len=0;
- unsigned long u;
- char suffix[16];
- unsigned int prefixlen=0;
- unsigned int suffixlen=0;
- __u32 tmp;
-
- for (i=0; i<16; i++) ip[i]=0;
-
- for (;;) {
- if (*s == ':') {
- len++;
- if (s[1] == ':') { /* Found "::", skip to part 2 */
- s+=2;
- len++;
- break;
- }
- s++;
- }
- {
- char *tmp;
- u=simple_strtoul(s,&tmp,16);
- i=tmp-s;
- }
+ struct sk_buff *skb = NULL;
+ __u8 *eth;
+ struct udphdr *udph;
+ int datalen, iplen;
+ struct iphdr *iph;
+ __be16 protocol = htons(ETH_P_IP);
+ __be32 *mpls;
+ __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
+ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
+ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
+ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
- if (!i) return 0;
- if (prefixlen==12 && s[i]=='.') {
+ if (pkt_dev->nr_labels)
+ protocol = htons(ETH_P_MPLS_UC);
- /* the last 4 bytes may be written as IPv4 address */
+ if (pkt_dev->vlan_id != 0xffff)
+ protocol = htons(ETH_P_8021Q);
- tmp = in_aton(s);
- memcpy((struct in_addr*)(ip+12), &tmp, sizeof(tmp));
- return i+len;
- }
- ip[prefixlen++] = (u >> 8);
- ip[prefixlen++] = (u & 255);
- s += i; len += i;
- if (prefixlen==16)
- return len;
+ /* Update any of the values, used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
+
+ datalen = (odev->hard_header_len + 16) & ~0xf;
+
+ skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
+ if (!skb) {
+ sprintf(pkt_dev->result, "No memory");
+ return NULL;
}
-/* part 2, after "::" */
- for (;;) {
- if (*s == ':') {
- if (suffixlen==0)
- break;
- s++;
- len++;
- } else if (suffixlen!=0)
- break;
- {
- char *tmp;
- u=simple_strtol(s,&tmp,16);
- i=tmp-s;
- }
- if (!i) {
- if (*s) len--;
- break;
- }
- if (suffixlen+prefixlen<=12 && s[i]=='.') {
- tmp = in_aton(s);
- memcpy((struct in_addr*)(suffix+suffixlen), &tmp, sizeof(tmp));
- suffixlen+=4;
- len+=strlen(s);
- break;
+ prefetchw(skb->data);
+ skb_reserve(skb, datalen);
+
+ /* Reserve for ethernet and IP header */
+ eth = (__u8 *) skb_push(skb, 14);
+ mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ if (pkt_dev->nr_labels)
+ mpls_push(mpls, pkt_dev);
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_tci = build_tci(pkt_dev->svlan_id,
+ pkt_dev->svlan_cfi,
+ pkt_dev->svlan_p);
+ svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
}
- suffix[suffixlen++] = (u >> 8);
- suffix[suffixlen++] = (u & 255);
- s += i; len += i;
- if (prefixlen+suffixlen==16)
- break;
+ vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_tci = build_tci(pkt_dev->vlan_id,
+ pkt_dev->vlan_cfi,
+ pkt_dev->vlan_p);
+ vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IP);
}
- for (i=0; i<suffixlen; i++)
- ip[16-suffixlen+i] = suffix[i];
- return len;
-}
-static char tohex(char hexdigit) {
- return hexdigit>9?hexdigit+'a'-10:hexdigit+'0';
-}
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, skb->len);
+ iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
-static int fmt_xlong(char* s,unsigned int i) {
- char* bak=s;
- *s=tohex((i>>12)&0xf); if (s!=bak || *s!='0') ++s;
- *s=tohex((i>>8)&0xf); if (s!=bak || *s!='0') ++s;
- *s=tohex((i>>4)&0xf); if (s!=bak || *s!='0') ++s;
- *s=tohex(i&0xf);
- return s-bak+1;
-}
+ skb_set_transport_header(skb, skb->len);
+ udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
-static unsigned int fmt_ip6(char *s,const char ip[16]) {
- unsigned int len;
- unsigned int i;
- unsigned int temp;
- unsigned int compressing;
- int j;
-
- len = 0; compressing = 0;
- for (j=0; j<16; j+=2) {
-
-#ifdef V4MAPPEDPREFIX
- if (j==12 && !memcmp(ip,V4mappedprefix,12)) {
- inet_ntoa_r(*(struct in_addr*)(ip+12),s);
- temp=strlen(s);
- return len+temp;
- }
-#endif
- temp = ((unsigned long) (unsigned char) ip[j] << 8) +
- (unsigned long) (unsigned char) ip[j+1];
- if (temp == 0) {
- if (!compressing) {
- compressing=1;
- if (j==0) {
- *s++=':'; ++len;
- }
- }
- } else {
- if (compressing) {
- compressing=0;
- *s++=':'; ++len;
- }
- i = fmt_xlong(s,temp); len += i; s += i;
- if (j<14) {
- *s++ = ':';
- ++len;
- }
- }
- }
- if (compressing) {
- *s++=':'; ++len;
+ memcpy(eth, pkt_dev->hh, 12);
+ *(__be16 *) & eth[12] = protocol;
+
+ /* Eth + IPh + UDPh + mpls */
+ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
+ pkt_dev->pkt_overhead;
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
+ datalen = sizeof(struct pktgen_hdr);
+
+ udph->source = htons(pkt_dev->cur_udp_src);
+ udph->dest = htons(pkt_dev->cur_udp_dst);
+ udph->len = htons(datalen + 8); /* DATA + udphdr */
+ udph->check = 0;
+
+ iph->ihl = 5;
+ iph->version = 4;
+ iph->ttl = 32;
+ iph->tos = pkt_dev->tos;
+ iph->protocol = IPPROTO_UDP; /* UDP */
+ iph->saddr = pkt_dev->cur_saddr;
+ iph->daddr = pkt_dev->cur_daddr;
+ iph->id = htons(pkt_dev->ip_id);
+ pkt_dev->ip_id++;
+ iph->frag_off = 0;
+ iplen = 20 + 8 + datalen;
+ iph->tot_len = htons(iplen);
+ ip_send_check(iph);
+ skb->protocol = protocol;
+ skb->dev = odev;
+ skb->pkt_type = PACKET_HOST;
+
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & NETIF_F_V4_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+ udp4_hwcsum(skb, udph->source, udph->dest);
+ } else {
+ __wsum csum = udp_csum(skb);
+
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_tcpudp_magic(udph->source, udph->dest,
+ datalen + 8, IPPROTO_UDP, csum);
+
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
}
- *s=0;
- return len;
+
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+#ifdef CONFIG_XFRM
+ if (!process_ipsec(pkt_dev, skb, protocol))
+ return NULL;
+#endif
+
+ return skb;
}
-static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
- struct pktgen_dev *pkt_dev)
+static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
{
struct sk_buff *skb = NULL;
__u8 *eth;
struct udphdr *udph;
- int datalen;
+ int datalen, udplen;
struct ipv6hdr *iph;
- struct pktgen_hdr *pgh = NULL;
+ __be16 protocol = htons(ETH_P_IPV6);
+ __be32 *mpls;
+ __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
+ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
+ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
+ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
+
+ if (pkt_dev->nr_labels)
+ protocol = htons(ETH_P_MPLS_UC);
+
+ if (pkt_dev->vlan_id != 0xffff)
+ protocol = htons(ETH_P_8021Q);
/* Update any of the values, used when we're incrementing various
* fields.
*/
mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
- skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
+ skb = pktgen_alloc_skb(odev, pkt_dev, 16);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
+ prefetchw(skb->data);
skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
eth = (__u8 *) skb_push(skb, 14);
- iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
- udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
+ mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ if (pkt_dev->nr_labels)
+ mpls_push(mpls, pkt_dev);
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_tci = build_tci(pkt_dev->svlan_id,
+ pkt_dev->svlan_cfi,
+ pkt_dev->svlan_p);
+ svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
+ }
+ vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_tci = build_tci(pkt_dev->vlan_id,
+ pkt_dev->vlan_cfi,
+ pkt_dev->vlan_p);
+ vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IPV6);
+ }
+
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, skb->len);
+ iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
+ *(__be16 *) &eth[12] = protocol;
- datalen = pkt_dev->cur_pkt_size-14-
- sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
+ /* Eth + IPh + UDPh + mpls */
+ datalen = pkt_dev->cur_pkt_size - 14 -
+ sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
+ pkt_dev->pkt_overhead;
- if (datalen < sizeof(struct pktgen_hdr)) {
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
datalen = sizeof(struct pktgen_hdr);
- if (net_ratelimit())
- printk(KERN_INFO "pktgen: increased datalen to %d\n", datalen);
+ net_info_ratelimited("increased datalen to %d\n", datalen);
}
+ udplen = datalen + sizeof(struct udphdr);
udph->source = htons(pkt_dev->cur_udp_src);
udph->dest = htons(pkt_dev->cur_udp_dst);
- udph->len = htons(datalen + sizeof(struct udphdr));
- udph->check = 0; /* No checksum */
+ udph->len = htons(udplen);
+ udph->check = 0;
+
+ *(__be32 *) iph = htonl(0x60000000); /* Version + flow */
- *(u32*)iph = __constant_htonl(0x60000000); /* Version + flow */
+ if (pkt_dev->traffic_class) {
+ /* Version + traffic class + flow (0) */
+ *(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20));
+ }
iph->hop_limit = 32;
- iph->payload_len = htons(sizeof(struct udphdr) + datalen);
+ iph->payload_len = htons(udplen);
iph->nexthdr = IPPROTO_UDP;
- ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
- ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
- skb->mac.raw = ((u8 *)iph) - 14;
- skb->protocol = __constant_htons(ETH_P_IPV6);
+ skb->protocol = protocol;
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
- int frags = pkt_dev->nfrags;
- int i;
-
- pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags*PAGE_SIZE) {
- skb_put(skb, datalen-frags*PAGE_SIZE);
- datalen = frags*PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & NETIF_F_V6_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
+ } else {
+ __wsum csum = udp_csum(skb);
- skb_shinfo(skb)->frags[i - 1].size -= rem;
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum);
- skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
}
- /* Stamp the time, and sequence number, convert them to network byte order */
- /* should we update cloned packets too ? */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
- pkt_dev->seq_num++;
-
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
+
return skb;
}
-static inline struct sk_buff *fill_packet(struct net_device *odev,
+static struct sk_buff *fill_packet(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
- if(pkt_dev->flags & F_IPV6)
+ if (pkt_dev->flags & F_IPV6)
return fill_packet_ipv6(odev, pkt_dev);
else
return fill_packet_ipv4(odev, pkt_dev);
}
-static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
{
- pkt_dev->seq_num = 1;
- pkt_dev->idle_acc = 0;
+ pkt_dev->seq_num = 1;
+ pkt_dev->idle_acc = 0;
pkt_dev->sofar = 0;
- pkt_dev->tx_bytes = 0;
- pkt_dev->errors = 0;
+ pkt_dev->tx_bytes = 0;
+ pkt_dev->errors = 0;
}
/* Set up structure for sending pkts, clear counters */
static void pktgen_run(struct pktgen_thread *t)
{
- struct pktgen_dev *pkt_dev = NULL;
+ struct pktgen_dev *pkt_dev;
int started = 0;
- PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t));
+ func_enter();
if_lock(t);
- for (pkt_dev = t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) {
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
/*
* setup odev and create initial packet.
*/
pktgen_setup_inject(pkt_dev);
- if(pkt_dev->odev) {
+ if (pkt_dev->odev) {
pktgen_clear_counters(pkt_dev);
- pkt_dev->running = 1; /* Cranke yeself! */
+ pkt_dev->running = 1; /* Cranke yeself! */
pkt_dev->skb = NULL;
- pkt_dev->started_at = getCurUs();
- pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */
- pkt_dev->next_tx_ns = 0;
-
+ pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
+
+ set_pkt_overhead(pkt_dev);
+
strcpy(pkt_dev->result, "Starting");
started++;
- }
- else
+ } else
strcpy(pkt_dev->result, "Error starting");
}
if_unlock(t);
- if(started) t->control &= ~(T_STOP);
+ if (started)
+ t->control &= ~(T_STOP);
}
-static void pktgen_stop_all_threads_ifs(void)
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
{
- struct pktgen_thread *t = pktgen_threads;
+ struct pktgen_thread *t;
- PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads.\n"));
+ func_enter();
- thread_lock();
- while(t) {
- pktgen_stop(t);
- t = t->next;
- }
- thread_unlock();
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= T_STOP;
+
+ mutex_unlock(&pktgen_thread_lock);
}
-static int thread_is_running(struct pktgen_thread *t )
+static int thread_is_running(const struct pktgen_thread *t)
{
- struct pktgen_dev *next;
- int res = 0;
+ const struct pktgen_dev *pkt_dev;
- for(next=t->if_list; next; next=next->next) {
- if(next->running) {
- res = 1;
- break;
- }
- }
- return res;
+ list_for_each_entry(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running)
+ return 1;
+ return 0;
}
-static int pktgen_wait_thread_run(struct pktgen_thread *t )
+static int pktgen_wait_thread_run(struct pktgen_thread *t)
{
- if_lock(t);
+ if_lock(t);
- while(thread_is_running(t)) {
+ while (thread_is_running(t)) {
- if_unlock(t);
+ if_unlock(t);
- msleep_interruptible(100);
+ msleep_interruptible(100);
- if (signal_pending(current))
- goto signal;
- if_lock(t);
- }
- if_unlock(t);
- return 1;
- signal:
- return 0;
+ if (signal_pending(current))
+ goto signal;
+ if_lock(t);
+ }
+ if_unlock(t);
+ return 1;
+signal:
+ return 0;
}
-static int pktgen_wait_all_threads_run(void)
+static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
{
- struct pktgen_thread *t = pktgen_threads;
+ struct pktgen_thread *t;
int sig = 1;
-
- while (t) {
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
sig = pktgen_wait_thread_run(t);
- if( sig == 0 ) break;
- thread_lock();
- t=t->next;
- thread_unlock();
- }
- if(sig == 0) {
- thread_lock();
- while (t) {
- t->control |= (T_STOP);
- t=t->next;
- }
- thread_unlock();
+ if (sig == 0)
+ break;
}
+
+ if (sig == 0)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_STOP);
+
+ mutex_unlock(&pktgen_thread_lock);
return sig;
}
-static void pktgen_run_all_threads(void)
+static void pktgen_run_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t = pktgen_threads;
+ struct pktgen_thread *t;
- PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n"));
+ func_enter();
- thread_lock();
+ mutex_lock(&pktgen_thread_lock);
- while(t) {
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
t->control |= (T_RUN);
- t = t->next;
- }
- thread_unlock();
- schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
-
- pktgen_wait_all_threads_run();
-}
+ mutex_unlock(&pktgen_thread_lock);
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
-static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
+ pktgen_wait_all_threads_run(pn);
+}
+
+static void pktgen_reset_all_threads(struct pktgen_net *pn)
{
- __u64 total_us, bps, mbps, pps, idle;
- char *p = pkt_dev->result;
+ struct pktgen_thread *t;
- total_us = pkt_dev->stopped_at - pkt_dev->started_at;
+ func_enter();
- idle = pkt_dev->idle_acc;
+ mutex_lock(&pktgen_thread_lock);
- p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
- (unsigned long long) total_us,
- (unsigned long long)(total_us - idle),
- (unsigned long long) idle,
- (unsigned long long) pkt_dev->sofar,
- pkt_dev->cur_pkt_size, nr_frags);
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_REMDEVALL);
- pps = pkt_dev->sofar * USEC_PER_SEC;
+ mutex_unlock(&pktgen_thread_lock);
- while ((total_us >> 32) != 0) {
- pps >>= 1;
- total_us >>= 1;
- }
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
- do_div(pps, total_us);
-
- bps = pps * 8 * pkt_dev->cur_pkt_size;
+ pktgen_wait_all_threads_run(pn);
+}
- mbps = bps;
- do_div(mbps, 1000000);
- p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu",
- (unsigned long long) pps,
- (unsigned long long) mbps,
- (unsigned long long) bps,
- (unsigned long long) pkt_dev->errors);
+static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
+{
+ __u64 bps, mbps, pps;
+ char *p = pkt_dev->result;
+ ktime_t elapsed = ktime_sub(pkt_dev->stopped_at,
+ pkt_dev->started_at);
+ ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
+
+ p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
+ (unsigned long long)ktime_to_us(elapsed),
+ (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
+ (unsigned long long)ktime_to_us(idle),
+ (unsigned long long)pkt_dev->sofar,
+ pkt_dev->cur_pkt_size, nr_frags);
+
+ pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
+ ktime_to_ns(elapsed));
+
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+
+ mbps = bps;
+ do_div(mbps, 1000000);
+ p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu",
+ (unsigned long long)pps,
+ (unsigned long long)mbps,
+ (unsigned long long)bps,
+ (unsigned long long)pkt_dev->errors);
}
-
/* Set stopped-at timer, remove from running list, do counters & statistics */
-
-static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
+static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
{
-
- if (!pkt_dev->running) {
- printk("pktgen: interface: %s is already stopped\n", pkt_dev->ifname);
- return -EINVAL;
- }
+ int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
- pkt_dev->stopped_at = getCurUs();
- pkt_dev->running = 0;
+ if (!pkt_dev->running) {
+ pr_warning("interface: %s is already stopped\n",
+ pkt_dev->odevname);
+ return -EINVAL;
+ }
- show_results(pkt_dev, skb_shinfo(pkt_dev->skb)->nr_frags);
+ kfree_skb(pkt_dev->skb);
+ pkt_dev->skb = NULL;
+ pkt_dev->stopped_at = ktime_get();
+ pkt_dev->running = 0;
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ show_results(pkt_dev, nr_frags);
- pkt_dev->skb = NULL;
-
- return 0;
+ return 0;
}
-static struct pktgen_dev *next_to_run(struct pktgen_thread *t )
+static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
{
- struct pktgen_dev *next, *best = NULL;
-
+ struct pktgen_dev *pkt_dev, *best = NULL;
+
if_lock(t);
- for(next=t->if_list; next ; next=next->next) {
- if(!next->running) continue;
- if(best == NULL) best=next;
- else if ( next->next_tx_us < best->next_tx_us)
- best = next;
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
+ if (!pkt_dev->running)
+ continue;
+ if (best == NULL)
+ best = pkt_dev;
+ else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
+ best = pkt_dev;
}
if_unlock(t);
- return best;
+ return best;
}
-static void pktgen_stop(struct pktgen_thread *t) {
- struct pktgen_dev *next = NULL;
+static void pktgen_stop(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev;
- PG_DEBUG(printk("pktgen: entering pktgen_stop.\n"));
+ func_enter();
- if_lock(t);
+ if_lock(t);
- for(next=t->if_list; next; next=next->next)
- pktgen_stop_device(next);
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
+ pktgen_stop_device(pkt_dev);
+ }
- if_unlock(t);
+ if_unlock(t);
}
-static void pktgen_rem_all_ifs(struct pktgen_thread *t)
+/*
+ * one of our devices needs to be removed - find it
+ * and remove it
+ */
+static void pktgen_rem_one_if(struct pktgen_thread *t)
{
- struct pktgen_dev *cur, *next = NULL;
-
- /* Remove all devices, free mem */
-
- if_lock(t);
+ struct list_head *q, *n;
+ struct pktgen_dev *cur;
+
+ func_enter();
+
+ if_lock(t);
+
+ list_for_each_safe(q, n, &t->if_list) {
+ cur = list_entry(q, struct pktgen_dev, list);
+
+ if (!cur->removal_mark)
+ continue;
+
+ kfree_skb(cur->skb);
+ cur->skb = NULL;
- for(cur=t->if_list; cur; cur=next) {
- next = cur->next;
pktgen_remove_device(t, cur);
+
+ break;
}
- if_unlock(t);
+ if_unlock(t);
}
-static void pktgen_rem_thread(struct pktgen_thread *t)
+static void pktgen_rem_all_ifs(struct pktgen_thread *t)
{
- /* Remove from the thread list */
+ struct list_head *q, *n;
+ struct pktgen_dev *cur;
- struct pktgen_thread *tmp = pktgen_threads;
+ func_enter();
- remove_proc_entry(t->name, pg_proc_dir);
+ /* Remove all devices, free mem */
- thread_lock();
+ if_lock(t);
- if (tmp == t)
- pktgen_threads = tmp->next;
- else {
- while (tmp) {
- if (tmp->next == t) {
- tmp->next = t->next;
- t->next = NULL;
- break;
- }
- tmp = tmp->next;
- }
+ list_for_each_safe(q, n, &t->if_list) {
+ cur = list_entry(q, struct pktgen_dev, list);
+
+ kfree_skb(cur->skb);
+ cur->skb = NULL;
+
+ pktgen_remove_device(t, cur);
}
- thread_unlock();
+
+ if_unlock(t);
}
-static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
+static void pktgen_rem_thread(struct pktgen_thread *t)
{
- struct net_device *odev = NULL;
- __u64 idle_start = 0;
- int ret;
+ /* Remove from the thread list */
+ remove_proc_entry(t->tsk->comm, t->net->proc_dir);
+}
- odev = pkt_dev->odev;
-
- if (pkt_dev->delay_us || pkt_dev->delay_ns) {
- u64 now;
+static void pktgen_resched(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
+ schedule();
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
- now = getCurUs();
- if (now < pkt_dev->next_tx_us)
- spin(pkt_dev, pkt_dev->next_tx_us);
+static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
- /* This is max DELAY, this has special meaning of
- * "never transmit"
- */
- if (pkt_dev->delay_us == 0x7FFFFFFF) {
- pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us;
- pkt_dev->next_tx_ns = pkt_dev->delay_ns;
- goto out;
- }
+ while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+ if (signal_pending(current))
+ break;
+
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
}
-
- if (netif_queue_stopped(odev) || need_resched()) {
- idle_start = getCurUs();
-
- if (!netif_running(odev)) {
- pktgen_stop_device(pkt_dev);
- goto out;
- }
- if (need_resched())
- schedule();
-
- pkt_dev->idle_acc += getCurUs() - idle_start;
-
- if (netif_queue_stopped(odev)) {
- pkt_dev->next_tx_us = getCurUs(); /* TODO */
- pkt_dev->next_tx_ns = 0;
- goto out; /* Try the next interface */
- }
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
+
+static void pktgen_xmit(struct pktgen_dev *pkt_dev)
+{
+ struct net_device *odev = pkt_dev->odev;
+ netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *)
+ = odev->netdev_ops->ndo_start_xmit;
+ struct netdev_queue *txq;
+ u16 queue_map;
+ int ret;
+
+ /* If device is offline, then don't send */
+ if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
+ pktgen_stop_device(pkt_dev);
+ return;
}
-
- if (pkt_dev->last_ok || !pkt_dev->skb) {
- if ((++pkt_dev->clone_count >= pkt_dev->clone_skb ) || (!pkt_dev->skb)) {
- /* build a new pkt */
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
-
- pkt_dev->skb = fill_packet(odev, pkt_dev);
- if (pkt_dev->skb == NULL) {
- printk("pktgen: ERROR: couldn't allocate skb in fill_packet.\n");
- schedule();
- pkt_dev->clone_count--; /* back out increment, OOM */
- goto out;
- }
- pkt_dev->allocated_skbs++;
- pkt_dev->clone_count = 0; /* reset counter */
- }
+
+ /* This is max DELAY, this has special meaning of
+ * "never transmit"
+ */
+ if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
+ pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);
+ return;
}
-
- spin_lock_bh(&odev->xmit_lock);
- if (!netif_queue_stopped(odev)) {
-
- atomic_inc(&(pkt_dev->skb->users));
-retry_now:
- ret = odev->hard_start_xmit(pkt_dev->skb, odev);
- if (likely(ret == NETDEV_TX_OK)) {
- pkt_dev->last_ok = 1;
- pkt_dev->sofar++;
- pkt_dev->seq_num++;
- pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
-
- } else if (ret == NETDEV_TX_LOCKED
- && (odev->features & NETIF_F_LLTX)) {
- cpu_relax();
- goto retry_now;
- } else { /* Retry it next time */
-
- atomic_dec(&(pkt_dev->skb->users));
-
- if (debug && net_ratelimit())
- printk(KERN_INFO "pktgen: Hard xmit error\n");
-
- pkt_dev->errors++;
- pkt_dev->last_ok = 0;
+
+ /* If no skb or clone count exhausted then get new one */
+ if (!pkt_dev->skb || (pkt_dev->last_ok &&
+ ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ /* build a new pkt */
+ kfree_skb(pkt_dev->skb);
+
+ pkt_dev->skb = fill_packet(odev, pkt_dev);
+ if (pkt_dev->skb == NULL) {
+ pr_err("ERROR: couldn't allocate skb in fill_packet\n");
+ schedule();
+ pkt_dev->clone_count--; /* back out increment, OOM */
+ return;
}
+ pkt_dev->last_pkt_size = pkt_dev->skb->len;
+ pkt_dev->allocated_skbs++;
+ pkt_dev->clone_count = 0; /* reset counter */
+ }
- pkt_dev->next_tx_us = getCurUs();
- pkt_dev->next_tx_ns = 0;
+ if (pkt_dev->delay && pkt_dev->last_ok)
+ spin(pkt_dev, pkt_dev->next_tx);
- pkt_dev->next_tx_us += pkt_dev->delay_us;
- pkt_dev->next_tx_ns += pkt_dev->delay_ns;
+ queue_map = skb_get_queue_mapping(pkt_dev->skb);
+ txq = netdev_get_tx_queue(odev, queue_map);
- if (pkt_dev->next_tx_ns > 1000) {
- pkt_dev->next_tx_us++;
- pkt_dev->next_tx_ns -= 1000;
- }
- }
+ local_bh_disable();
+
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
- else { /* Retry it next time */
- pkt_dev->last_ok = 0;
- pkt_dev->next_tx_us = getCurUs(); /* TODO */
- pkt_dev->next_tx_ns = 0;
- }
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
+ ret = NETDEV_TX_BUSY;
+ pkt_dev->last_ok = 0;
+ goto unlock;
+ }
+ atomic_inc(&(pkt_dev->skb->users));
+ ret = (*xmit)(pkt_dev->skb, odev);
+
+ switch (ret) {
+ case NETDEV_TX_OK:
+ txq_trans_update(txq);
+ pkt_dev->last_ok = 1;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ case NET_XMIT_POLICED:
+ /* skb has been consumed */
+ pkt_dev->errors++;
+ break;
+ default: /* Drivers are not supposed to return other values! */
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ pkt_dev->errors++;
+ /* fallthru */
+ case NETDEV_TX_LOCKED:
+ case NETDEV_TX_BUSY:
+ /* Retry it next time */
+ atomic_dec(&(pkt_dev->skb->users));
+ pkt_dev->last_ok = 0;
+ }
+unlock:
+ HARD_TX_UNLOCK(odev, txq);
+
+ local_bh_enable();
- spin_unlock_bh(&odev->xmit_lock);
-
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- if (atomic_read(&(pkt_dev->skb->users)) != 1) {
- idle_start = getCurUs();
- while (atomic_read(&(pkt_dev->skb->users)) != 1) {
- if (signal_pending(current)) {
- break;
- }
- schedule();
- }
- pkt_dev->idle_acc += getCurUs() - idle_start;
- }
-
+ pktgen_wait_for_skb(pkt_dev);
+
/* Done with this */
pktgen_stop_device(pkt_dev);
- }
- out:;
- }
+ }
+}
-/*
+/*
* Main loop of the thread goes here
*/
-static void pktgen_thread_worker(struct pktgen_thread *t)
+static int pktgen_thread_worker(void *arg)
{
DEFINE_WAIT(wait);
- struct pktgen_dev *pkt_dev = NULL;
+ struct pktgen_thread *t = arg;
+ struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- sigset_t tmpsig;
- u32 max_before_softirq;
- u32 tx_since_softirq = 0;
-
- daemonize("pktgen/%d", cpu);
-
- /* Block all signals except SIGKILL, SIGSTOP and SIGTERM */
-
- spin_lock_irq(&current->sighand->siglock);
- tmpsig = current->blocked;
- siginitsetinv(&current->blocked,
- sigmask(SIGKILL) |
- sigmask(SIGSTOP)|
- sigmask(SIGTERM));
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- /* Migrate to the right CPU */
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
- if (smp_processor_id() != cpu)
- BUG();
+ BUG_ON(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
+ complete(&t->start_done);
- t->control &= ~(T_TERMINATE);
- t->control &= ~(T_RUN);
- t->control &= ~(T_STOP);
- t->control &= ~(T_REMDEV);
-
- t->pid = current->pid;
+ pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
- PG_DEBUG(printk("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid));
+ set_current_state(TASK_INTERRUPTIBLE);
- max_before_softirq = t->max_before_softirq;
-
- __set_current_state(TASK_INTERRUPTIBLE);
- mb();
+ set_freezable();
- while (1) {
-
- __set_current_state(TASK_RUNNING);
+ while (!kthread_should_stop()) {
+ pkt_dev = next_to_run(t);
- /*
- * Get next dev to xmit -- if any.
- */
+ if (unlikely(!pkt_dev && t->control == 0)) {
+ if (t->net->pktgen_exiting)
+ break;
+ wait_event_interruptible_timeout(t->queue,
+ t->control != 0,
+ HZ/10);
+ try_to_freeze();
+ continue;
+ }
- pkt_dev = next_to_run(t);
-
- if (pkt_dev) {
+ __set_current_state(TASK_RUNNING);
+ if (likely(pkt_dev)) {
pktgen_xmit(pkt_dev);
- /*
- * We like to stay RUNNING but must also give
- * others fair share.
- */
-
- tx_since_softirq += pkt_dev->last_ok;
-
- if (tx_since_softirq > max_before_softirq) {
- if (local_softirq_pending())
- do_softirq();
- tx_since_softirq = 0;
- }
- } else {
- prepare_to_wait(&(t->queue), &wait, TASK_INTERRUPTIBLE);
- schedule_timeout(HZ/10);
- finish_wait(&(t->queue), &wait);
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
}
- /*
- * Back from sleep, either due to the timeout or signal.
- * We check if we have any "posted" work for us.
- */
-
- if (t->control & T_TERMINATE || signal_pending(current))
- /* we received a request to terminate ourself */
- break;
-
-
- if(t->control & T_STOP) {
+ if (t->control & T_STOP) {
pktgen_stop(t);
t->control &= ~(T_STOP);
}
- if(t->control & T_RUN) {
+ if (t->control & T_RUN) {
pktgen_run(t);
t->control &= ~(T_RUN);
}
- if(t->control & T_REMDEV) {
+ if (t->control & T_REMDEVALL) {
pktgen_rem_all_ifs(t);
+ t->control &= ~(T_REMDEVALL);
+ }
+
+ if (t->control & T_REMDEV) {
+ pktgen_rem_one_if(t);
t->control &= ~(T_REMDEV);
}
- if (need_resched())
- schedule();
- }
+ try_to_freeze();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ pr_debug("%s stopping all device\n", t->tsk->comm);
+ pktgen_stop(t);
- PG_DEBUG(printk("pktgen: %s stopping all device\n", t->name));
- pktgen_stop(t);
+ pr_debug("%s removing all device\n", t->tsk->comm);
+ pktgen_rem_all_ifs(t);
- PG_DEBUG(printk("pktgen: %s removing all device\n", t->name));
- pktgen_rem_all_ifs(t);
+ pr_debug("%s removing thread\n", t->tsk->comm);
+ pktgen_rem_thread(t);
- PG_DEBUG(printk("pktgen: %s removing thread.\n", t->name));
- pktgen_rem_thread(t);
+ /* Wait for kthread_stop */
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
}
-static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* ifname)
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname, bool exact)
{
- struct pktgen_dev *pkt_dev = NULL;
- if_lock(t);
+ struct pktgen_dev *p, *pkt_dev = NULL;
+ size_t len = strlen(ifname);
- for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) {
- if (strncmp(pkt_dev->ifname, ifname, IFNAMSIZ) == 0) {
- break;
- }
- }
+ if_lock(t);
+ list_for_each_entry(p, &t->if_list, list)
+ if (strncmp(p->odevname, ifname, len) == 0) {
+ if (p->odevname[len]) {
+ if (exact || p->odevname[len] != '@')
+ continue;
+ }
+ pkt_dev = p;
+ break;
+ }
- if_unlock(t);
- PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname,pkt_dev));
- return pkt_dev;
+ if_unlock(t);
+ pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
+ return pkt_dev;
}
-/*
- * Adds a dev at front of if_list.
+/*
+ * Adds a dev at front of if_list.
*/
-static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)
+static int add_dev_to_thread(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
{
int rv = 0;
-
- if_lock(t);
-
- if (pkt_dev->pg_thread) {
- printk("pktgen: ERROR: already assigned to a thread.\n");
- rv = -EBUSY;
- goto out;
- }
- pkt_dev->next =t->if_list; t->if_list=pkt_dev;
- pkt_dev->pg_thread = t;
+
+ if_lock(t);
+
+ if (pkt_dev->pg_thread) {
+ pr_err("ERROR: already assigned to a thread\n");
+ rv = -EBUSY;
+ goto out;
+ }
+
+ list_add(&pkt_dev->list, &t->if_list);
+ pkt_dev->pg_thread = t;
pkt_dev->running = 0;
- out:
- if_unlock(t);
- return rv;
+out:
+ if_unlock(t);
+ return rv;
}
/* Called under thread lock */
-static int pktgen_add_device(struct pktgen_thread *t, const char* ifname)
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
{
- struct pktgen_dev *pkt_dev;
- struct proc_dir_entry *pe;
-
+ struct pktgen_dev *pkt_dev;
+ int err;
+ int node = cpu_to_node(t->cpu);
+
/* We don't allow a device to be on several threads */
- pkt_dev = __pktgen_NN_threads(ifname, FIND);
+ pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);
if (pkt_dev) {
- printk("pktgen: ERROR: interface already used.\n");
- return -EBUSY;
- }
+ pr_err("ERROR: interface already used\n");
+ return -EBUSY;
+ }
- pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
+ pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node);
if (!pkt_dev)
return -ENOMEM;
- pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state));
+ strcpy(pkt_dev->odevname, ifname);
+ pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
+ node);
if (pkt_dev->flows == NULL) {
kfree(pkt_dev);
return -ENOMEM;
}
- memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state));
- pkt_dev->min_pkt_size = ETH_ZLEN;
- pkt_dev->max_pkt_size = ETH_ZLEN;
+ pkt_dev->removal_mark = 0;
pkt_dev->nfrags = 0;
- pkt_dev->clone_skb = pg_clone_skb_d;
- pkt_dev->delay_us = pg_delay_d / 1000;
- pkt_dev->delay_ns = pg_delay_d % 1000;
+ pkt_dev->delay = pg_delay_d;
pkt_dev->count = pg_count_d;
pkt_dev->sofar = 0;
- pkt_dev->udp_src_min = 9; /* sink port */
+ pkt_dev->udp_src_min = 9; /* sink port */
pkt_dev->udp_src_max = 9;
pkt_dev->udp_dst_min = 9;
pkt_dev->udp_dst_max = 9;
-
- strncpy(pkt_dev->ifname, ifname, IFNAMSIZ);
-
- if (! pktgen_setup_dev(pkt_dev)) {
- printk("pktgen: ERROR: pktgen_setup_dev failed.\n");
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
- return -ENODEV;
- }
-
- pe = create_proc_entry(ifname, 0600, pg_proc_dir);
- if (!pe) {
- printk("pktgen: cannot create %s/%s procfs entry.\n",
+ pkt_dev->vlan_p = 0;
+ pkt_dev->vlan_cfi = 0;
+ pkt_dev->vlan_id = 0xffff;
+ pkt_dev->svlan_p = 0;
+ pkt_dev->svlan_cfi = 0;
+ pkt_dev->svlan_id = 0xffff;
+ pkt_dev->node = -1;
+
+ err = pktgen_setup_dev(t->net, pkt_dev, ifname);
+ if (err)
+ goto out1;
+ if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+ pkt_dev->clone_skb = pg_clone_skb_d;
+
+ pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
+ &pktgen_if_fops, pkt_dev);
+ if (!pkt_dev->entry) {
+ pr_err("cannot create %s/%s procfs entry\n",
PG_PROC_DIR, ifname);
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
- return -EINVAL;
+ err = -EINVAL;
+ goto out2;
}
- pe->proc_fops = &pktgen_if_fops;
- pe->data = pkt_dev;
+#ifdef CONFIG_XFRM
+ pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
+ pkt_dev->ipsproto = IPPROTO_ESP;
+
+ /* xfrm tunnel mode needs additional dst to extract outter
+ * ip header protocol/ttl/id field, here creat a phony one.
+ * instead of looking for a valid rt, which definitely hurting
+ * performance under such circumstance.
+ */
+ pkt_dev->dstops.family = AF_INET;
+ pkt_dev->dst.dev = pkt_dev->odev;
+ dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
+ pkt_dev->dst.child = &pkt_dev->dst;
+ pkt_dev->dst.ops = &pkt_dev->dstops;
+#endif
return add_dev_to_thread(t, pkt_dev);
+out2:
+ dev_put(pkt_dev->odev);
+out1:
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ kfree(pkt_dev);
+ return err;
}
-static struct pktgen_thread * __init pktgen_find_thread(const char* name)
+static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
{
- struct pktgen_thread *t = NULL;
+ struct pktgen_thread *t;
+ struct proc_dir_entry *pe;
+ struct task_struct *p;
- thread_lock();
+ t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!t) {
+ pr_err("ERROR: out of memory, can't create new thread\n");
+ return -ENOMEM;
+ }
- t = pktgen_threads;
- while (t) {
- if (strcmp(t->name, name) == 0)
- break;
+ spin_lock_init(&t->if_lock);
+ t->cpu = cpu;
- t = t->next;
- }
- thread_unlock();
- return t;
-}
+ INIT_LIST_HEAD(&t->if_list);
-static int __init pktgen_create_thread(const char* name, int cpu)
-{
- struct pktgen_thread *t = NULL;
- struct proc_dir_entry *pe;
+ list_add_tail(&t->th_list, &pn->pktgen_threads);
+ init_completion(&t->start_done);
- if (strlen(name) > 31) {
- printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n");
- return -EINVAL;
- }
-
- if (pktgen_find_thread(name)) {
- printk("pktgen: ERROR: thread: %s already exists\n", name);
- return -EINVAL;
- }
-
- t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL);
- if (!t) {
- printk("pktgen: ERROR: out of memory, can't create new thread.\n");
- return -ENOMEM;
- }
-
- strcpy(t->name, name);
- spin_lock_init(&t->if_lock);
- t->cpu = cpu;
-
- pe = create_proc_entry(t->name, 0600, pg_proc_dir);
- if (!pe) {
- printk("pktgen: cannot create %s/%s procfs entry.\n",
- PG_PROC_DIR, t->name);
- kfree(t);
- return -EINVAL;
- }
-
- pe->proc_fops = &pktgen_thread_fops;
- pe->data = t;
+ p = kthread_create_on_node(pktgen_thread_worker,
+ t,
+ cpu_to_node(cpu),
+ "kpktgend_%d", cpu);
+ if (IS_ERR(p)) {
+ pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+ list_del(&t->th_list);
+ kfree(t);
+ return PTR_ERR(p);
+ }
+ kthread_bind(p, cpu);
+ t->tsk = p;
- t->next = pktgen_threads;
- pktgen_threads = t;
+ pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
+ &pktgen_thread_fops, t);
+ if (!pe) {
+ pr_err("cannot create %s/%s procfs entry\n",
+ PG_PROC_DIR, t->tsk->comm);
+ kthread_stop(p);
+ list_del(&t->th_list);
+ kfree(t);
+ return -EINVAL;
+ }
- if (kernel_thread((void *) pktgen_thread_worker, (void *) t,
- CLONE_FS | CLONE_FILES | CLONE_SIGHAND) < 0)
- printk("pktgen: kernel_thread() failed for cpu %d\n", t->cpu);
+ t->net = pn;
+ wake_up_process(p);
+ wait_for_completion(&t->start_done);
return 0;
}
-/*
- * Removes a device from the thread if_list.
+/*
+ * Removes a device from the thread if_list.
*/
-static void _rem_dev_from_if_list(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)
+static void _rem_dev_from_if_list(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
{
- struct pktgen_dev *i, *prev = NULL;
-
- i = t->if_list;
+ struct list_head *q, *n;
+ struct pktgen_dev *p;
- while(i) {
- if(i == pkt_dev) {
- if(prev) prev->next = i->next;
- else t->if_list = NULL;
- break;
- }
- prev = i;
- i=i->next;
+ list_for_each_safe(q, n, &t->if_list) {
+ p = list_entry(q, struct pktgen_dev, list);
+ if (p == pkt_dev)
+ list_del(&p->list);
}
}
-static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)
+static int pktgen_remove_device(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
{
+ pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
- PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev));
+ if (pkt_dev->running) {
+ pr_warning("WARNING: trying to remove a running interface, stopping it now\n");
+ pktgen_stop_device(pkt_dev);
+ }
- if (pkt_dev->running) {
- printk("pktgen:WARNING: trying to remove a running interface, stopping it now.\n");
- pktgen_stop_device(pkt_dev);
- }
-
- /* Dis-associate from the interface */
+ /* Dis-associate from the interface */
if (pkt_dev->odev) {
dev_put(pkt_dev->odev);
- pkt_dev->odev = NULL;
- }
-
+ pkt_dev->odev = NULL;
+ }
+
/* And update the thread if_list */
_rem_dev_from_if_list(t, pkt_dev);
- /* Clean up proc file system */
-
- remove_proc_entry(pkt_dev->ifname, pg_proc_dir);
+ if (pkt_dev->entry)
+ proc_remove(pkt_dev->entry);
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
kfree(pkt_dev);
- return 0;
+ return 0;
}
-static int __init pg_init(void)
+static int __net_init pg_net_init(struct net *net)
{
- int cpu;
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
struct proc_dir_entry *pe;
-
- printk(version);
-
- pg_proc_dir = proc_mkdir(PG_PROC_DIR, proc_net);
- if (!pg_proc_dir)
+ int cpu, ret = 0;
+
+ pn->net = net;
+ INIT_LIST_HEAD(&pn->pktgen_threads);
+ pn->pktgen_exiting = false;
+ pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
+ if (!pn->proc_dir) {
+ pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
return -ENODEV;
- pg_proc_dir->owner = THIS_MODULE;
+ }
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);
+ if (pe == NULL) {
+ pr_err("cannot create %s procfs entry\n", PGCTRL);
+ ret = -EINVAL;
+ goto remove;
+ }
- pe = create_proc_entry(PGCTRL, 0600, pg_proc_dir);
- if (pe == NULL) {
- printk("pktgen: ERROR: cannot create %s procfs entry.\n", PGCTRL);
- proc_net_remove(PG_PROC_DIR);
- return -EINVAL;
- }
+ for_each_online_cpu(cpu) {
+ int err;
- pe->proc_fops = &pktgen_fops;
- pe->data = NULL;
+ err = pktgen_create_thread(cpu, pn);
+ if (err)
+ pr_warn("Cannot create thread for cpu %d (%d)\n",
+ cpu, err);
+ }
- /* Register us to receive netdevice events */
- register_netdevice_notifier(&pktgen_notifier_block);
-
- for_each_online_cpu(cpu) {
- char buf[30];
+ if (list_empty(&pn->pktgen_threads)) {
+ pr_err("Initialization failed for all threads\n");
+ ret = -ENODEV;
+ goto remove_entry;
+ }
- sprintf(buf, "kpktgend_%i", cpu);
- pktgen_create_thread(buf, cpu);
- }
- return 0;
+ return 0;
+
+remove_entry:
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+remove:
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+ return ret;
}
-static void __exit pg_cleanup(void)
+static void __net_exit pg_net_exit(struct net *net)
{
- wait_queue_head_t queue;
- init_waitqueue_head(&queue);
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
+ struct pktgen_thread *t;
+ struct list_head *q, *n;
+ LIST_HEAD(list);
- /* Stop all interfaces & threads */
+ /* Stop all interfaces & threads */
+ pn->pktgen_exiting = true;
- while (pktgen_threads) {
- struct pktgen_thread *t = pktgen_threads;
- pktgen_threads->control |= (T_TERMINATE);
+ mutex_lock(&pktgen_thread_lock);
+ list_splice_init(&pn->pktgen_threads, &list);
+ mutex_unlock(&pktgen_thread_lock);
- wait_event_interruptible_timeout(queue, (t != pktgen_threads), HZ);
- }
+ list_for_each_safe(q, n, &list) {
+ t = list_entry(q, struct pktgen_thread, th_list);
+ list_del(&t->th_list);
+ kthread_stop(t->tsk);
+ kfree(t);
+ }
- /* Un-register us from receiving netdevice events */
- unregister_netdevice_notifier(&pktgen_notifier_block);
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+}
+
+static struct pernet_operations pg_net_ops = {
+ .init = pg_net_init,
+ .exit = pg_net_exit,
+ .id = &pg_net_id,
+ .size = sizeof(struct pktgen_net),
+};
- /* Clean up proc file system */
- remove_proc_entry(PGCTRL, pg_proc_dir);
- proc_net_remove(PG_PROC_DIR);
+static int __init pg_init(void)
+{
+ int ret = 0;
+
+ pr_info("%s", version);
+ ret = register_pernet_subsys(&pg_net_ops);
+ if (ret)
+ return ret;
+ ret = register_netdevice_notifier(&pktgen_notifier_block);
+ if (ret)
+ unregister_pernet_subsys(&pg_net_ops);
+
+ return ret;
}
+static void __exit pg_cleanup(void)
+{
+ unregister_netdevice_notifier(&pktgen_notifier_block);
+ unregister_pernet_subsys(&pg_net_ops);
+}
module_init(pg_init);
module_exit(pg_cleanup);
-MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se");
+MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
MODULE_DESCRIPTION("Packet Generator tool");
MODULE_LICENSE("GPL");
+MODULE_VERSION(VERSION);
module_param(pg_count_d, int, 0);
+MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject");
module_param(pg_delay_d, int, 0);
+MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)");
module_param(pg_clone_skb_d, int, 0);
+MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet");
module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Enable debugging of pktgen module");
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 00000000000..d3027a73fd4
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,141 @@
+/* PTP classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x40 ; PTP_CLASS_V2_VLAN
+ * ret a ; return PTP class
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x30 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct sk_filter *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return SK_RUN_FILTER(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+void __init ptp_classifier_init(void)
+{
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 15, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 12, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000040 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000030 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog = {
+ .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
+ };
+
+ BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog));
+}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index b8203de5ff0..467f326126e 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,8 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/tcp.h>
+#include <linux/vmalloc.h>
#include <net/request_sock.h>
@@ -25,34 +27,40 @@
* but then some measure against one socket starving all other sockets
* would be needed.
*
- * It was 128 by default. Experiments with real servers show, that
+ * The minimum value of it is 128. Experiments with real servers show that
* it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
+ * of problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of machine.
+ * Note : Dont forget somaxconn that may limit backlog too.
*/
int sysctl_max_syn_backlog = 256;
+EXPORT_SYMBOL(sysctl_max_syn_backlog);
int reqsk_queue_alloc(struct request_sock_queue *queue,
- const int nr_table_entries)
+ unsigned int nr_table_entries)
{
- const int lopt_size = sizeof(struct listen_sock) +
- nr_table_entries * sizeof(struct request_sock *);
- struct listen_sock *lopt = kmalloc(lopt_size, GFP_KERNEL);
-
+ size_t lopt_size = sizeof(struct listen_sock);
+ struct listen_sock *lopt;
+
+ nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+ nr_table_entries = max_t(u32, nr_table_entries, 8);
+ nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+ lopt_size += nr_table_entries * sizeof(struct request_sock *);
+ if (lopt_size > PAGE_SIZE)
+ lopt = vzalloc(lopt_size);
+ else
+ lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL)
return -ENOMEM;
- memset(lopt, 0, lopt_size);
-
- for (lopt->max_qlen_log = 6;
- (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+ for (lopt->max_qlen_log = 3;
+ (1 << lopt->max_qlen_log) < nr_table_entries;
lopt->max_qlen_log++);
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
- queue->rskq_accept_head = queue->rskq_accept_head = NULL;
- queue->rskq_defer_accept = 0;
+ queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
write_lock_bh(&queue->syn_wait_lock);
@@ -62,15 +70,48 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
return 0;
}
-EXPORT_SYMBOL(reqsk_queue_alloc);
+void __reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+ struct listen_sock *lopt;
+ size_t lopt_size;
+
+ /*
+ * this is an error recovery path only
+ * no locking needed and the lopt is not NULL
+ */
+
+ lopt = queue->listen_opt;
+ lopt_size = sizeof(struct listen_sock) +
+ lopt->nr_table_entries * sizeof(struct request_sock *);
+
+ if (lopt_size > PAGE_SIZE)
+ vfree(lopt);
+ else
+ kfree(lopt);
+}
+
+static inline struct listen_sock *reqsk_queue_yank_listen_sk(
+ struct request_sock_queue *queue)
+{
+ struct listen_sock *lopt;
+
+ write_lock_bh(&queue->syn_wait_lock);
+ lopt = queue->listen_opt;
+ queue->listen_opt = NULL;
+ write_unlock_bh(&queue->syn_wait_lock);
+
+ return lopt;
+}
void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+ size_t lopt_size = sizeof(struct listen_sock) +
+ lopt->nr_table_entries * sizeof(struct request_sock *);
if (lopt->qlen != 0) {
- int i;
+ unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;
@@ -83,8 +124,101 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
}
}
- BUG_TRAP(lopt->qlen == 0);
- kfree(lopt);
+ WARN_ON(lopt->qlen != 0);
+ if (lopt_size > PAGE_SIZE)
+ vfree(lopt);
+ else
+ kfree(lopt);
}
-EXPORT_SYMBOL(reqsk_queue_destroy);
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and unreference listener
+ * socket. treq->listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = tcp_rsk(req)->listener;
+ struct fastopen_queue *fastopenq =
+ inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ tcp_sk(sk)->fastopen_rsk = NULL;
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->listener = NULL;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ sock_put(lsk);
+ reqsk_free(req);
+ return;
+ }
+ /* Wait for 60secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+ sock_put(lsk);
+}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8700379685e..1063996f831 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -16,13 +16,11 @@
* Vitaly E. Lavrov RTA_OK arithmetics was wrong.
*/
-#include <linux/config.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -35,10 +33,13 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/if_addr.h>
+#include <linux/if_bridge.h>
+#include <linux/pci.h>
+#include <linux/etherdevice.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/string.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -49,104 +50,567 @@
#include <net/udp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
-#include <net/netlink.h>
+#include <net/fib_rules.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+
+struct rtnl_link {
+ rtnl_doit_func doit;
+ rtnl_dumpit_func dumpit;
+ rtnl_calcit_func calcit;
+};
-DECLARE_MUTEX(rtnl_sem);
+static DEFINE_MUTEX(rtnl_mutex);
void rtnl_lock(void)
{
- rtnl_shlock();
+ mutex_lock(&rtnl_mutex);
}
+EXPORT_SYMBOL(rtnl_lock);
-int rtnl_lock_interruptible(void)
+void __rtnl_unlock(void)
{
- return down_interruptible(&rtnl_sem);
+ mutex_unlock(&rtnl_mutex);
}
-
+
void rtnl_unlock(void)
{
- rtnl_shunlock();
-
+ /* This fellow will unlock it for us. */
netdev_run_todo();
}
+EXPORT_SYMBOL(rtnl_unlock);
+
+int rtnl_trylock(void)
+{
+ return mutex_trylock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_trylock);
+
+int rtnl_is_locked(void)
+{
+ return mutex_is_locked(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_is_locked);
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rtnl_is_held(void)
+{
+ return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+
+static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+
+static inline int rtm_msgindex(int msgtype)
+{
+ int msgindex = msgtype - RTM_BASE;
+
+ /*
+ * msgindex < 0 implies someone tried to register a netlink
+ * control code. msgindex >= RTM_NR_MSGTYPES may indicate that
+ * the message type has not been added to linux/rtnetlink.h
+ */
+ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES);
+
+ return msgindex;
+}
+
+static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
+{
+ struct rtnl_link *tab;
+
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].doit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].doit;
+}
-int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
+static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
{
- memset(tb, 0, sizeof(struct rtattr*)*maxattr);
+ struct rtnl_link *tab;
- while (RTA_OK(rta, len)) {
- unsigned flavor = rta->rta_type;
- if (flavor && flavor <= maxattr)
- tb[flavor-1] = rta;
- rta = RTA_NEXT(rta, len);
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].dumpit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].dumpit;
+}
+
+static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
+{
+ struct rtnl_link *tab;
+
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].calcit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].calcit;
+}
+
+/**
+ * __rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @calcit: Function pointer to calc size of dump message
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ rtnl_calcit_func calcit)
+{
+ struct rtnl_link *tab;
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ tab = rtnl_msg_handlers[protocol];
+ if (tab == NULL) {
+ tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
+ if (tab == NULL)
+ return -ENOBUFS;
+
+ rtnl_msg_handlers[protocol] = tab;
}
+
+ if (doit)
+ tab[msgindex].doit = doit;
+
+ if (dumpit)
+ tab[msgindex].dumpit = dumpit;
+
+ if (calcit)
+ tab[msgindex].calcit = calcit;
+
return 0;
}
+EXPORT_SYMBOL_GPL(__rtnl_register);
-struct sock *rtnl;
+/**
+ * rtnl_register - Register a rtnetlink message type
+ *
+ * Identical to __rtnl_register() but panics on failure. This is useful
+ * as failure of this function is very unlikely, it can only happen due
+ * to lack of memory when allocating the chain to store all message
+ * handlers for a protocol. Meant for use in init functions where lack
+ * of memory implies no sense in continuing.
+ */
+void rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ rtnl_calcit_func calcit)
+{
+ if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)
+ panic("Unable to register rtnetlink message handler, "
+ "protocol = %d, message type = %d\n",
+ protocol, msgtype);
+}
+EXPORT_SYMBOL_GPL(rtnl_register);
+
+/**
+ * rtnl_unregister - Unregister a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_unregister(int protocol, int msgtype)
+{
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ if (rtnl_msg_handlers[protocol] == NULL)
+ return -ENOENT;
-struct rtnetlink_link * rtnetlink_links[NPROTO];
+ rtnl_msg_handlers[protocol][msgindex].doit = NULL;
+ rtnl_msg_handlers[protocol][msgindex].dumpit = NULL;
-static const int rtm_min[RTM_NR_FAMILIES] =
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister);
+
+/**
+ * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
+ * @protocol : Protocol family or PF_UNSPEC
+ *
+ * Identical to calling rtnl_unregster() for all registered message types
+ * of a certain protocol family.
+ */
+void rtnl_unregister_all(int protocol)
{
- [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
- [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
- [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)),
- [RTM_FAM(RTM_NEWNEIGH)] = NLMSG_LENGTH(sizeof(struct ndmsg)),
- [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct rtmsg)),
- [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)),
- [RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
- [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
- [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
- [RTM_FAM(RTM_NEWNEIGHTBL)] = NLMSG_LENGTH(sizeof(struct ndtmsg)),
-};
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
-static const int rta_max[RTM_NR_FAMILIES] =
-{
- [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX,
- [RTM_FAM(RTM_NEWADDR)] = IFA_MAX,
- [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX,
- [RTM_FAM(RTM_NEWNEIGH)] = NDA_MAX,
- [RTM_FAM(RTM_NEWRULE)] = RTA_MAX,
- [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
- [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
- [RTM_FAM(RTM_NEWNEIGHTBL)] = NDTA_MAX,
-};
+ kfree(rtnl_msg_handlers[protocol]);
+ rtnl_msg_handlers[protocol] = NULL;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister_all);
+
+static LIST_HEAD(link_ops);
+
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+ const struct rtnl_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind))
+ return ops;
+ }
+ return NULL;
+}
+
+/**
+ * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that create devices during module initialization. It
+ * must be called before registering the devices.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ if (rtnl_link_ops_get(ops->kind))
+ return -EEXIST;
+
+ if (!ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ list_add_tail(&ops->list, &link_ops);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_register);
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ int err;
+
+ rtnl_lock();
+ err = __rtnl_link_register(ops);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL_GPL(rtnl_link_register);
+
+static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+ struct net_device *dev;
+ LIST_HEAD(list_kill);
+
+ for_each_netdev(net, dev) {
+ if (dev->rtnl_link_ops == ops)
+ ops->dellink(dev, &list_kill);
+ }
+ unregister_netdevice_many(&list_kill);
+}
+
+/**
+ * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void __rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ struct net *net;
+
+ for_each_net(net) {
+ __rtnl_kill_links(net, ops);
+ }
+ list_del(&ops->list);
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ for_each_net(net) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ /* Close the race with cleanup_net() */
+ mutex_lock(&net_mutex);
+ rtnl_lock_unregistering_all();
+ __rtnl_link_unregister(ops);
+ rtnl_unlock();
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
-void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
+static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
{
- struct rtattr *rta;
- int size = RTA_LENGTH(attrlen);
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
- rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
- rta->rta_type = attrtype;
- rta->rta_len = size;
- memcpy(RTA_DATA(rta), data, attrlen);
- memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops || !ops->get_slave_size)
+ return 0;
+ /* IFLA_INFO_SLAVE_DATA + nested data */
+ return nla_total_size(sizeof(struct nlattr)) +
+ ops->get_slave_size(master_dev, dev);
}
-size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
+static size_t rtnl_link_get_size(const struct net_device *dev)
{
- size_t ret = RTA_PAYLOAD(rta);
- char *src = RTA_DATA(rta);
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ size_t size;
+
+ if (!ops)
+ return 0;
+
+ size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
+ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */
+
+ if (ops->get_size)
+ /* IFLA_INFO_DATA + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ ops->get_size(dev);
- if (ret > 0 && src[ret - 1] == '\0')
- ret--;
- if (size > 0) {
- size_t len = (ret >= size) ? size - 1 : ret;
- memset(dest, 0, size);
- memcpy(dest, src, len);
+ if (ops->get_xstats_size)
+ /* IFLA_INFO_XSTATS */
+ size += nla_total_size(ops->get_xstats_size(dev));
+
+ size += rtnl_link_get_slave_info_data_size(dev);
+
+ return size;
+}
+
+static LIST_HEAD(rtnl_af_ops);
+
+static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+{
+ const struct rtnl_af_ops *ops;
+
+ list_for_each_entry(ops, &rtnl_af_ops, list) {
+ if (ops->family == family)
+ return ops;
}
- return ret;
+
+ return NULL;
+}
+
+/**
+ * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
+ * @ops: struct rtnl_af_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+void rtnl_af_register(struct rtnl_af_ops *ops)
+{
+ rtnl_lock();
+ list_add_tail(&ops->list, &rtnl_af_ops);
+ rtnl_unlock();
}
+EXPORT_SYMBOL_GPL(rtnl_af_register);
-int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+/**
+ * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void __rtnl_af_unregister(struct rtnl_af_ops *ops)
+{
+ list_del(&ops->list);
+}
+EXPORT_SYMBOL_GPL(__rtnl_af_unregister);
+
+/**
+ * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ */
+void rtnl_af_unregister(struct rtnl_af_ops *ops)
{
+ rtnl_lock();
+ __rtnl_af_unregister(ops);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_af_unregister);
+
+static size_t rtnl_link_get_af_size(const struct net_device *dev)
+{
+ struct rtnl_af_ops *af_ops;
+ size_t size;
+
+ /* IFLA_AF_SPEC */
+ size = nla_total_size(sizeof(struct nlattr));
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_link_af_size) {
+ /* AF_* + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ af_ops->get_link_af_size(dev);
+ }
+ }
+
+ return size;
+}
+
+static bool rtnl_have_link_slave_info(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (master_dev && master_dev->rtnl_link_ops)
+ return true;
+ return false;
+}
+
+static int rtnl_link_slave_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ struct nlattr *slave_data;
+ int err;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_slave_info) {
+ slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA);
+ if (!slave_data)
+ return -EMSGSIZE;
+ err = ops->fill_slave_info(skb, master_dev, dev);
+ if (err < 0)
+ goto err_cancel_slave_data;
+ nla_nest_end(skb, slave_data);
+ }
+ return 0;
+
+err_cancel_slave_data:
+ nla_nest_cancel(skb, slave_data);
+ return err;
+}
+
+static int rtnl_link_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ struct nlattr *data;
+ int err;
+
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_xstats) {
+ err = ops->fill_xstats(skb, dev);
+ if (err < 0)
+ return err;
+ }
+ if (ops->fill_info) {
+ data = nla_nest_start(skb, IFLA_INFO_DATA);
+ if (data == NULL)
+ return -EMSGSIZE;
+ err = ops->fill_info(skb, dev);
+ if (err < 0)
+ goto err_cancel_data;
+ nla_nest_end(skb, data);
+ }
+ return 0;
+
+err_cancel_data:
+ nla_nest_cancel(skb, data);
+ return err;
+}
+
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct nlattr *linkinfo;
+ int err = -EMSGSIZE;
+
+ linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+ if (linkinfo == NULL)
+ goto out;
+
+ err = rtnl_link_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ err = rtnl_link_slave_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ nla_nest_end(skb, linkinfo);
+ return 0;
+
+err_cancel_link:
+ nla_nest_cancel(skb, linkinfo);
+out:
+ return err;
+}
+
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
+{
+ struct sock *rtnl = net->rtnl;
int err = 0;
NETLINK_CB(skb).dst_group = group;
@@ -158,56 +622,395 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
return err;
}
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{
+ struct sock *rtnl = net->rtnl;
+
+ return nlmsg_unicast(rtnl, skb, pid);
+}
+EXPORT_SYMBOL(rtnl_unicast);
+
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
+{
+ struct sock *rtnl = net->rtnl;
+ int report = 0;
+
+ if (nlh)
+ report = nlmsg_report(nlh);
+
+ nlmsg_notify(rtnl, skb, pid, group, report, flags);
+}
+EXPORT_SYMBOL(rtnl_notify);
+
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+ struct sock *rtnl = net->rtnl;
+
+ netlink_set_err(rtnl, 0, group, error);
+}
+EXPORT_SYMBOL(rtnl_set_sk_err);
+
int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
{
- struct rtattr *mx = (struct rtattr*)skb->tail;
- int i;
+ struct nlattr *mx;
+ int i, valid = 0;
+
+ mx = nla_nest_start(skb, RTA_METRICS);
+ if (mx == NULL)
+ return -ENOBUFS;
+
+ for (i = 0; i < RTAX_MAX; i++) {
+ if (metrics[i]) {
+ valid++;
+ if (nla_put_u32(skb, i+1, metrics[i]))
+ goto nla_put_failure;
+ }
+ }
+
+ if (!valid) {
+ nla_nest_cancel(skb, mx);
+ return 0;
+ }
+
+ return nla_nest_end(skb, mx);
+
+nla_put_failure:
+ nla_nest_cancel(skb, mx);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(rtnetlink_put_metrics);
+
+int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
+ long expires, u32 error)
+{
+ struct rta_cacheinfo ci = {
+ .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
+ .rta_used = dst->__use,
+ .rta_clntref = atomic_read(&(dst->__refcnt)),
+ .rta_error = error,
+ .rta_id = id,
+ };
+
+ if (expires) {
+ unsigned long clock;
+
+ clock = jiffies_to_clock_t(abs(expires));
+ clock = min_t(unsigned long, clock, INT_MAX);
+ ci.rta_expires = (expires > 0) ? clock : -clock;
+ }
+ return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+}
+EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+
+static void set_operstate(struct net_device *dev, unsigned char transition)
+{
+ unsigned char operstate = dev->operstate;
+
+ switch (transition) {
+ case IF_OPER_UP:
+ if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_UNKNOWN) &&
+ !netif_dormant(dev))
+ operstate = IF_OPER_UP;
+ break;
- RTA_PUT(skb, RTA_METRICS, 0, NULL);
- for (i=0; i<RTAX_MAX; i++) {
- if (metrics[i])
- RTA_PUT(skb, i+1, sizeof(u32), metrics+i);
+ case IF_OPER_DORMANT:
+ if (operstate == IF_OPER_UP ||
+ operstate == IF_OPER_UNKNOWN)
+ operstate = IF_OPER_DORMANT;
+ break;
}
- mx->rta_len = skb->tail - (u8*)mx;
- if (mx->rta_len == RTA_LENGTH(0))
- skb_trim(skb, (u8*)mx - skb->data);
+
+ if (dev->operstate != operstate) {
+ write_lock_bh(&dev_base_lock);
+ dev->operstate = operstate;
+ write_unlock_bh(&dev_base_lock);
+ netdev_state_change(dev);
+ }
+}
+
+static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
+{
+ return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) |
+ (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI));
+}
+
+static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
+ const struct ifinfomsg *ifm)
+{
+ unsigned int flags = ifm->ifi_flags;
+
+ /* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+ if (ifm->ifi_change)
+ flags = (flags & ifm->ifi_change) |
+ (rtnl_dev_get_flags(dev) & ~ifm->ifi_change);
+
+ return flags;
+}
+
+static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
+ const struct rtnl_link_stats64 *b)
+{
+ a->rx_packets = b->rx_packets;
+ a->tx_packets = b->tx_packets;
+ a->rx_bytes = b->rx_bytes;
+ a->tx_bytes = b->tx_bytes;
+ a->rx_errors = b->rx_errors;
+ a->tx_errors = b->tx_errors;
+ a->rx_dropped = b->rx_dropped;
+ a->tx_dropped = b->tx_dropped;
+
+ a->multicast = b->multicast;
+ a->collisions = b->collisions;
+
+ a->rx_length_errors = b->rx_length_errors;
+ a->rx_over_errors = b->rx_over_errors;
+ a->rx_crc_errors = b->rx_crc_errors;
+ a->rx_frame_errors = b->rx_frame_errors;
+ a->rx_fifo_errors = b->rx_fifo_errors;
+ a->rx_missed_errors = b->rx_missed_errors;
+
+ a->tx_aborted_errors = b->tx_aborted_errors;
+ a->tx_carrier_errors = b->tx_carrier_errors;
+ a->tx_fifo_errors = b->tx_fifo_errors;
+ a->tx_heartbeat_errors = b->tx_heartbeat_errors;
+ a->tx_window_errors = b->tx_window_errors;
+
+ a->rx_compressed = b->rx_compressed;
+ a->tx_compressed = b->tx_compressed;
+}
+
+static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
+{
+ memcpy(v, b, sizeof(*b));
+}
+
+/* All VF info */
+static inline int rtnl_vfinfo_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
+ (ext_filter_mask & RTEXT_FILTER_VF)) {
+ int num_vfs = dev_num_vf(dev->dev.parent);
+ size_t size = nla_total_size(sizeof(struct nlattr));
+ size += nla_total_size(num_vfs * sizeof(struct nlattr));
+ size += num_vfs *
+ (nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)));
+ return size;
+ } else
+ return 0;
+}
+
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ size_t port_size = nla_total_size(4) /* PORT_VF */
+ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
+ + nla_total_size(sizeof(struct ifla_port_vsi))
+ /* PORT_VSI_TYPE */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
+ + nla_total_size(1) /* PROT_VDP_REQUEST */
+ + nla_total_size(2); /* PORT_VDP_RESPONSE */
+ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr));
+ size_t vf_port_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+ size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+ if (dev_num_vf(dev->dev.parent))
+ return port_self_size + vf_ports_size +
+ vf_port_size * dev_num_vf(dev->dev.parent);
+ else
+ return port_self_size;
+}
+
+static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ + nla_total_size(sizeof(struct rtnl_link_ifmap))
+ + nla_total_size(sizeof(struct rtnl_link_stats))
+ + nla_total_size(sizeof(struct rtnl_link_stats64))
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ + nla_total_size(4) /* IFLA_TXQLEN */
+ + nla_total_size(4) /* IFLA_WEIGHT */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_CARRIER */
+ + nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ + nla_total_size(ext_filter_mask
+ & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
+}
+
+static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *vf_ports;
+ struct nlattr *vf_port;
+ int vf;
+ int err;
+
+ vf_ports = nla_nest_start(skb, IFLA_VF_PORTS);
+ if (!vf_ports)
+ return -EMSGSIZE;
+
+ for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
+ vf_port = nla_nest_start(skb, IFLA_VF_PORT);
+ if (!vf_port)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_PORT_VF, vf))
+ goto nla_put_failure;
+ err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ if (err) {
+ nla_nest_cancel(skb, vf_port);
+ continue;
+ }
+ nla_nest_end(skb, vf_port);
+ }
+
+ nla_nest_end(skb, vf_ports);
+
return 0;
-rtattr_failure:
- skb_trim(skb, (u8*)mx - skb->data);
- return -1;
+nla_put_failure:
+ nla_nest_cancel(skb, vf_ports);
+ return -EMSGSIZE;
}
+static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *port_self;
+ int err;
+
+ port_self = nla_nest_start(skb, IFLA_PORT_SELF);
+ if (!port_self)
+ return -EMSGSIZE;
-static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
- int type, u32 pid, u32 seq, u32 change,
- unsigned int flags)
+ err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
+ if (err) {
+ nla_nest_cancel(skb, port_self);
+ return (err == -EMSGSIZE) ? err : 0;
+ }
+
+ nla_nest_end(skb, port_self);
+
+ return 0;
+}
+
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
{
- struct ifinfomsg *r;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
+ int err;
- nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
- r = NLMSG_DATA(nlh);
- r->ifi_family = AF_UNSPEC;
- r->__ifi_pad = 0;
- r->ifi_type = dev->type;
- r->ifi_index = dev->ifindex;
- r->ifi_flags = dev_get_flags(dev);
- r->ifi_change = change;
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
- RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name);
+ err = rtnl_port_self_fill(skb, dev);
+ if (err)
+ return err;
- if (1) {
- u32 txqlen = dev->tx_queue_len;
- RTA_PUT(skb, IFLA_TXQLEN, sizeof(txqlen), &txqlen);
+ if (dev_num_vf(dev->dev.parent)) {
+ err = rtnl_vf_ports_fill(skb, dev);
+ if (err)
+ return err;
}
- if (1) {
- u32 weight = dev->weight;
- RTA_PUT(skb, IFLA_WEIGHT, sizeof(weight), &weight);
+ return 0;
+}
+
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_port_id ppid;
+
+ err = dev_get_phys_port_id(dev, &ppid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
}
+ if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, u32 ext_filter_mask)
+{
+ struct ifinfomsg *ifm;
+ struct nlmsghdr *nlh;
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats;
+ struct nlattr *attr, *af_spec;
+ struct rtnl_af_ops *af_ops;
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+
+ ASSERT_RTNL();
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_UNSPEC;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = change;
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u32(skb, IFLA_GROUP, dev->group) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+#ifdef CONFIG_RPS
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+#endif
+ (dev->ifindex != dev->iflink &&
+ nla_put_u32(skb, IFLA_LINK, dev->iflink)) ||
+ (upper_dev &&
+ nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) ||
+ nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
+ (dev->qdisc &&
+ nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
+ (dev->ifalias &&
+ nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_changes)))
+ goto nla_put_failure;
+
if (1) {
struct rtnl_link_ifmap map = {
.mem_start = dev->mem_start,
@@ -217,125 +1020,500 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
.dma = dev->dma,
.port = dev->if_port,
};
- RTA_PUT(skb, IFLA_MAP, sizeof(map), &map);
+ if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
+ goto nla_put_failure;
}
if (dev->addr_len) {
- RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
- RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+ if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
+ nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
+ goto nla_put_failure;
}
- if (1) {
- u32 mtu = dev->mtu;
- RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu);
+ if (rtnl_phys_port_id_fill(skb, dev))
+ goto nla_put_failure;
+
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
+ if (attr == NULL)
+ goto nla_put_failure;
+
+ stats = dev_get_stats(dev, &temp);
+ copy_rtnl_link_stats(nla_data(attr), stats);
+
+ attr = nla_reserve(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64));
+ if (attr == NULL)
+ goto nla_put_failure;
+ copy_rtnl_link_stats64(nla_data(attr), stats);
+
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
+ nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
+ goto nla_put_failure;
+
+ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
+ && (ext_filter_mask & RTEXT_FILTER_VF)) {
+ int i;
+
+ struct nlattr *vfinfo, *vf;
+ int num_vfs = dev_num_vf(dev->dev.parent);
+
+ vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+ if (!vfinfo)
+ goto nla_put_failure;
+ for (i = 0; i < num_vfs; i++) {
+ struct ifla_vf_info ivi;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_link_state vf_linkstate;
+
+ /*
+ * Not all SR-IOV capable drivers support the
+ * spoofcheck query. Preset to -1 so the user
+ * space tool can detect that the driver didn't
+ * report anything.
+ */
+ ivi.spoofchk = -1;
+ memset(ivi.mac, 0, sizeof(ivi.mac));
+ /* The default value for VF link state is "auto"
+ * IFLA_VF_LINK_STATE_AUTO which equals zero
+ */
+ ivi.linkstate = 0;
+ if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
+ break;
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf = ivi.vf;
+
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
+ vf = nla_nest_start(skb, IFLA_VF_INFO);
+ if (!vf) {
+ nla_nest_cancel(skb, vfinfo);
+ goto nla_put_failure;
+ }
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate))
+ goto nla_put_failure;
+ nla_nest_end(skb, vf);
+ }
+ nla_nest_end(skb, vfinfo);
}
- if (dev->ifindex != dev->iflink) {
- u32 iflink = dev->iflink;
- RTA_PUT(skb, IFLA_LINK, sizeof(iflink), &iflink);
- }
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
+ goto nla_put_failure;
- if (dev->qdisc_sleeping)
- RTA_PUT(skb, IFLA_QDISC,
- strlen(dev->qdisc_sleeping->ops->id) + 1,
- dev->qdisc_sleeping->ops->id);
-
- if (dev->master) {
- u32 master = dev->master->ifindex;
- RTA_PUT(skb, IFLA_MASTER, sizeof(master), &master);
+ if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
+ if (rtnl_link_fill(skb, dev) < 0)
+ goto nla_put_failure;
}
- if (dev->get_stats) {
- unsigned long *stats = (unsigned long*)dev->get_stats(dev);
- if (stats) {
- struct rtattr *a;
- __u32 *s;
- int i;
- int n = sizeof(struct rtnl_link_stats)/4;
+ if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
+ goto nla_put_failure;
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->fill_link_af) {
+ struct nlattr *af;
+ int err;
+
+ if (!(af = nla_nest_start(skb, af_ops->family)))
+ goto nla_put_failure;
- a = __RTA_PUT(skb, IFLA_STATS, n*4);
- s = RTA_DATA(a);
- for (i=0; i<n; i++)
- s[i] = stats[i];
+ err = af_ops->fill_link_af(skb, dev);
+
+ /*
+ * Caller may return ENODATA to indicate that there
+ * was no data to be dumped. This is not an error, it
+ * means we should trim the attribute header and
+ * continue.
+ */
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, af);
+ else if (err < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, af);
}
}
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-nlmsg_failure:
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
+ nla_nest_end(skb, af_spec);
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
}
-static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+ [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
+ [IFLA_MTU] = { .type = NLA_U32 },
+ [IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
+ [IFLA_CARRIER] = { .type = NLA_U8 },
+ [IFLA_TXQLEN] = { .type = NLA_U32 },
+ [IFLA_WEIGHT] = { .type = NLA_U32 },
+ [IFLA_OPERSTATE] = { .type = NLA_U8 },
+ [IFLA_LINKMODE] = { .type = NLA_U8 },
+ [IFLA_LINKINFO] = { .type = NLA_NESTED },
+ [IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_NET_NS_FD] = { .type = NLA_U32 },
+ [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
+ [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
+ [IFLA_VF_PORTS] = { .type = NLA_NESTED },
+ [IFLA_PORT_SELF] = { .type = NLA_NESTED },
+ [IFLA_AF_SPEC] = { .type = NLA_NESTED },
+ [IFLA_EXT_MASK] = { .type = NLA_U32 },
+ [IFLA_PROMISCUITY] = { .type = NLA_U32 },
+ [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
+};
+
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+ [IFLA_INFO_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_DATA] = { .type = NLA_NESTED },
+ [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
+ [IFLA_VF_INFO] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+ [IFLA_VF_MAC] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_VLAN] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_TX_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_tx_rate) },
+ [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_link_state) },
+};
+
+static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
+ [IFLA_PORT_VF] = { .type = NLA_U32 },
+ [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
+ .len = PORT_PROFILE_MAX },
+ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_port_vsi)},
+ [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
+ [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
+};
+
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx;
- int s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx = 0, s_idx;
struct net_device *dev;
+ struct hlist_head *head;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ int err;
+ int hdrlen;
- read_lock(&dev_base_lock);
- for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
- if (idx < s_idx)
- continue;
- if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI) <= 0)
- break;
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ rcu_read_lock();
+ cb->seq = net->dev_base_seq;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
}
- read_unlock(&dev_base_lock);
- cb->args[0] = idx;
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ NLM_F_MULTI,
+ ext_filter_mask);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err <= 0)
+ goto out;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ cb->args[0] = h;
return skb->len;
}
-static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len)
{
- struct ifinfomsg *ifm = NLMSG_DATA(nlh);
- struct rtattr **ida = arg;
- struct net_device *dev;
- int err, send_addr_notify = 0;
+ return nla_parse(tb, IFLA_MAX, head, len, ifla_policy);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifla);
- if (ifm->ifi_index >= 0)
- dev = dev_get_by_index(ifm->ifi_index);
- else if (ida[IFLA_IFNAME - 1]) {
- char ifname[IFNAMSIZ];
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net;
+ /* Examine the link attributes and figure out which
+ * network namespace we are talking about.
+ */
+ if (tb[IFLA_NET_NS_PID])
+ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+ else if (tb[IFLA_NET_NS_FD])
+ net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
+ else
+ net = get_net(src_net);
+ return net;
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
- if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1],
- IFNAMSIZ) >= IFNAMSIZ)
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+{
+ if (dev) {
+ if (tb[IFLA_ADDRESS] &&
+ nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
return -EINVAL;
- dev = dev_get_by_name(ifname);
- } else
- return -EINVAL;
- if (!dev)
- return -ENODEV;
+ if (tb[IFLA_BROADCAST] &&
+ nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+ return -EINVAL;
+ }
- err = -EINVAL;
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem, err;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ const struct rtnl_af_ops *af_ops;
- if (ifm->ifi_flags)
- dev_change_flags(dev, ifm->ifi_flags);
+ if (!(af_ops = rtnl_af_lookup(nla_type(af))))
+ return -EAFNOSUPPORT;
- if (ida[IFLA_MAP - 1]) {
+ if (!af_ops->set_link_af)
+ return -EOPNOTSUPP;
+
+ if (af_ops->validate_link_af) {
+ err = af_ops->validate_link_af(dev, af);
+ if (err < 0)
+ return err;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+{
+ int rem, err = -EINVAL;
+ struct nlattr *vf;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ nla_for_each_nested(vf, attr, rem) {
+ switch (nla_type(vf)) {
+ case IFLA_VF_MAC: {
+ struct ifla_vf_mac *ivm;
+ ivm = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_mac)
+ err = ops->ndo_set_vf_mac(dev, ivm->vf,
+ ivm->mac);
+ break;
+ }
+ case IFLA_VF_VLAN: {
+ struct ifla_vf_vlan *ivv;
+ ivv = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_vlan)
+ err = ops->ndo_set_vf_vlan(dev, ivv->vf,
+ ivv->vlan,
+ ivv->qos);
+ break;
+ }
+ case IFLA_VF_TX_RATE: {
+ struct ifla_vf_tx_rate *ivt;
+ struct ifla_vf_info ivf;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf,
+ &ivf);
+ if (err)
+ break;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ break;
+ }
+ case IFLA_VF_RATE: {
+ struct ifla_vf_rate *ivt;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
+ break;
+ }
+ case IFLA_VF_SPOOFCHK: {
+ struct ifla_vf_spoofchk *ivs;
+ ivs = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ break;
+ }
+ case IFLA_VF_LINK_STATE: {
+ struct ifla_vf_link_state *ivl;
+ ivl = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_link_state)
+ err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+ ivl->link_state);
+ break;
+ }
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static int do_set_master(struct net_device *dev, int ifindex)
+{
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops;
+ int err;
+
+ if (upper_dev) {
+ if (upper_dev->ifindex == ifindex)
+ return 0;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_del_slave) {
+ err = ops->ndo_del_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ upper_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!upper_dev)
+ return -EINVAL;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ err = ops->ndo_add_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+static int do_setlink(const struct sk_buff *skb,
+ struct net_device *dev, struct ifinfomsg *ifm,
+ struct nlattr **tb, char *ifname, int modified)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
+ struct net *net = rtnl_link_get_net(dev_net(dev), tb);
+ if (IS_ERR(net)) {
+ err = PTR_ERR(net);
+ goto errout;
+ }
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto errout;
+ }
+ err = dev_change_net_namespace(dev, net, ifname);
+ put_net(net);
+ if (err)
+ goto errout;
+ modified = 1;
+ }
+
+ if (tb[IFLA_MAP]) {
struct rtnl_link_ifmap *u_map;
struct ifmap k_map;
- if (!dev->set_config) {
+ if (!ops->ndo_set_config) {
err = -EOPNOTSUPP;
- goto out;
+ goto errout;
}
if (!netif_device_present(dev)) {
err = -ENODEV;
- goto out;
+ goto errout;
}
-
- if (ida[IFLA_MAP - 1]->rta_len != RTA_LENGTH(sizeof(*u_map)))
- goto out;
-
- u_map = RTA_DATA(ida[IFLA_MAP - 1]);
+ u_map = nla_data(tb[IFLA_MAP]);
k_map.mem_start = (unsigned long) u_map->mem_start;
k_map.mem_end = (unsigned long) u_map->mem_end;
k_map.base_addr = (unsigned short) u_map->base_addr;
@@ -343,100 +1521,667 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
k_map.dma = (unsigned char) u_map->dma;
k_map.port = (unsigned char) u_map->port;
- err = dev->set_config(dev, &k_map);
+ err = ops->ndo_set_config(dev, &k_map);
+ if (err < 0)
+ goto errout;
- if (err)
- goto out;
+ modified = 1;
}
- if (ida[IFLA_ADDRESS - 1]) {
- if (!dev->set_mac_address) {
- err = -EOPNOTSUPP;
- goto out;
- }
- if (!netif_device_present(dev)) {
- err = -ENODEV;
- goto out;
- }
- if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len))
- goto out;
+ if (tb[IFLA_ADDRESS]) {
+ struct sockaddr *sa;
+ int len;
- err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1]));
+ len = sizeof(sa_family_t) + dev->addr_len;
+ sa = kmalloc(len, GFP_KERNEL);
+ if (!sa) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ sa->sa_family = dev->type;
+ memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
+ dev->addr_len);
+ err = dev_set_mac_address(dev, sa);
+ kfree(sa);
if (err)
- goto out;
- send_addr_notify = 1;
+ goto errout;
+ modified = 1;
}
- if (ida[IFLA_BROADCAST - 1]) {
- if (ida[IFLA_BROADCAST - 1]->rta_len != RTA_LENGTH(dev->addr_len))
- goto out;
- memcpy(dev->broadcast, RTA_DATA(ida[IFLA_BROADCAST - 1]),
- dev->addr_len);
- send_addr_notify = 1;
+ if (tb[IFLA_MTU]) {
+ err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+ if (err < 0)
+ goto errout;
+ modified = 1;
}
- if (ida[IFLA_MTU - 1]) {
- if (ida[IFLA_MTU - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
- goto out;
- err = dev_set_mtu(dev, *((u32 *) RTA_DATA(ida[IFLA_MTU - 1])));
+ if (tb[IFLA_GROUP]) {
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ modified = 1;
+ }
+
+ /*
+ * Interface selected by interface index but interface
+ * name provided implies that a name change has been
+ * requested.
+ */
+ if (ifm->ifi_index > 0 && ifname[0]) {
+ err = dev_change_name(dev, ifname);
+ if (err < 0)
+ goto errout;
+ modified = 1;
+ }
+
+ if (tb[IFLA_IFALIAS]) {
+ err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
+ if (err < 0)
+ goto errout;
+ modified = 1;
+ }
+ if (tb[IFLA_BROADCAST]) {
+ nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ }
+
+ if (ifm->ifi_flags || ifm->ifi_change) {
+ err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ if (err < 0)
+ goto errout;
+ }
+
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
if (err)
- goto out;
+ goto errout;
+ modified = 1;
+ }
+ if (tb[IFLA_CARRIER]) {
+ err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ if (err)
+ goto errout;
+ modified = 1;
}
- if (ida[IFLA_TXQLEN - 1]) {
- if (ida[IFLA_TXQLEN - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
- goto out;
+ if (tb[IFLA_TXQLEN])
+ dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
- dev->tx_queue_len = *((u32 *) RTA_DATA(ida[IFLA_TXQLEN - 1]));
+ if (tb[IFLA_LINKMODE]) {
+ write_lock_bh(&dev_base_lock);
+ dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ write_unlock_bh(&dev_base_lock);
}
- if (ida[IFLA_WEIGHT - 1]) {
- if (ida[IFLA_WEIGHT - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
- goto out;
+ if (tb[IFLA_VFINFO_LIST]) {
+ struct nlattr *attr;
+ int rem;
+ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_INFO) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = do_setvfinfo(dev, attr);
+ if (err < 0)
+ goto errout;
+ modified = 1;
+ }
+ }
+ err = 0;
- dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1]));
+ if (tb[IFLA_VF_PORTS]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
+ struct nlattr *attr;
+ int vf;
+ int rem;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_port)
+ goto errout;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
+ if (nla_type(attr) != IFLA_VF_PORT)
+ continue;
+ err = nla_parse_nested(port, IFLA_PORT_MAX,
+ attr, ifla_port_policy);
+ if (err < 0)
+ goto errout;
+ if (!port[IFLA_PORT_VF]) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ vf = nla_get_u32(port[IFLA_PORT_VF]);
+ err = ops->ndo_set_vf_port(dev, vf, port);
+ if (err < 0)
+ goto errout;
+ modified = 1;
+ }
}
+ err = 0;
- if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) {
- char ifname[IFNAMSIZ];
+ if (tb[IFLA_PORT_SELF]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
- if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1],
- IFNAMSIZ) >= IFNAMSIZ)
- goto out;
- err = dev_change_name(dev, ifname);
- if (err)
- goto out;
+ err = nla_parse_nested(port, IFLA_PORT_MAX,
+ tb[IFLA_PORT_SELF], ifla_port_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_port)
+ err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
+ if (err < 0)
+ goto errout;
+ modified = 1;
}
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ const struct rtnl_af_ops *af_ops;
+
+ if (!(af_ops = rtnl_af_lookup(nla_type(af))))
+ BUG();
+
+ err = af_ops->set_link_af(dev, af);
+ if (err < 0)
+ goto errout;
+
+ modified = 1;
+ }
+ }
err = 0;
+errout:
+ if (err < 0 && modified)
+ net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+ dev->name);
+
+ return err;
+}
+
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ int err;
+ struct nlattr *tb[IFLA_MAX+1];
+ char ifname[IFNAMSIZ];
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ goto errout;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
+ err = -EINVAL;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ goto errout;
+
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ goto errout;
+
+ err = do_setlink(skb, dev, ifm, tb, ifname, 0);
+errout:
+ return err;
+}
+
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct rtnl_link_ops *ops;
+ struct net_device *dev;
+ struct ifinfomsg *ifm;
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ int err;
+ LIST_HEAD(list_kill);
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->rtnl_link_ops;
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ return 0;
+}
+
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+{
+ unsigned int old_flags;
+ int err;
+
+ old_flags = dev->flags;
+ if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
+ err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ if (err < 0)
+ return err;
+ }
+
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+
+ __dev_notify_flags(dev, old_flags, ~0U);
+ return 0;
+}
+EXPORT_SYMBOL(rtnl_configure_link);
+
+struct net_device *rtnl_create_link(struct net *net,
+ char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
+{
+ int err;
+ struct net_device *dev;
+ unsigned int num_tx_queues = 1;
+ unsigned int num_rx_queues = 1;
+
+ if (tb[IFLA_NUM_TX_QUEUES])
+ num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
+ else if (ops->get_num_tx_queues)
+ num_tx_queues = ops->get_num_tx_queues();
+
+ if (tb[IFLA_NUM_RX_QUEUES])
+ num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
+ else if (ops->get_num_rx_queues)
+ num_rx_queues = ops->get_num_rx_queues();
+
+ err = -ENOMEM;
+ dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
+ num_tx_queues, num_rx_queues);
+ if (!dev)
+ goto err;
+
+ dev_net_set(dev, net);
+ dev->rtnl_link_ops = ops;
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+
+ if (tb[IFLA_MTU])
+ dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+ if (tb[IFLA_ADDRESS]) {
+ memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
+ nla_len(tb[IFLA_ADDRESS]));
+ dev->addr_assign_type = NET_ADDR_SET;
+ }
+ if (tb[IFLA_BROADCAST])
+ memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
+ nla_len(tb[IFLA_BROADCAST]));
+ if (tb[IFLA_TXQLEN])
+ dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+ if (tb[IFLA_LINKMODE])
+ dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+
+ return dev;
+
+err:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(rtnl_create_link);
+
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, int group,
+ struct ifinfomsg *ifm,
+ struct nlattr **tb)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ err = do_setlink(skb, dev, ifm, tb, NULL, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct rtnl_link_ops *ops;
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct net_device *dev;
+ struct net_device *master_dev = NULL;
+ struct ifinfomsg *ifm;
+ char kind[MODULE_NAME_LEN];
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+ int err;
+
+#ifdef CONFIG_MODULES
+replay:
+#endif
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else {
+ if (ifname[0])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ dev = NULL;
+ }
+
+ if (dev) {
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+ }
+
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_LINKINFO]) {
+ err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
+ tb[IFLA_LINKINFO], ifla_info_policy);
+ if (err < 0)
+ return err;
+ } else
+ memset(linkinfo, 0, sizeof(linkinfo));
+
+ if (linkinfo[IFLA_INFO_KIND]) {
+ nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind);
+ } else {
+ kind[0] = '\0';
+ ops = NULL;
+ }
+
+ if (1) {
+ struct nlattr *attr[ops ? ops->maxtype + 1 : 0];
+ struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0];
+ struct nlattr **data = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net *dest_net;
+
+ if (ops) {
+ if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+ err = nla_parse_nested(attr, ops->maxtype,
+ linkinfo[IFLA_INFO_DATA],
+ ops->policy);
+ if (err < 0)
+ return err;
+ data = attr;
+ }
+ if (ops->validate) {
+ err = ops->validate(tb, data);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ if (m_ops) {
+ if (m_ops->slave_maxtype &&
+ linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ err = nla_parse_nested(slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy);
+ if (err < 0)
+ return err;
+ slave_data = slave_attr;
+ }
+ if (m_ops->slave_validate) {
+ err = m_ops->slave_validate(tb, slave_data);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ if (dev) {
+ int modified = 0;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops ||
+ !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data);
+ if (err < 0)
+ return err;
+ modified = 1;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ err = m_ops->slave_changelink(master_dev, dev,
+ tb, slave_data);
+ if (err < 0)
+ return err;
+ modified = 1;
+ }
+
+ return do_setlink(skb, dev, ifm, tb, ifname, modified);
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(skb, net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, tb);
+ return -ENODEV;
+ }
+
+ if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
+
+ if (!ops) {
+#ifdef CONFIG_MODULES
+ if (kind[0]) {
+ __rtnl_unlock();
+ request_module("rtnl-link-%s", kind);
+ rtnl_lock();
+ ops = rtnl_link_ops_get(kind);
+ if (ops)
+ goto replay;
+ }
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ if (!ifname[0])
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+
+ dest_net = rtnl_link_get_net(net, tb);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+
+ dev = rtnl_create_link(dest_net, ifname, ops, tb);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink) {
+ err = ops->newlink(net, dev, tb, data);
+ /* Drivers should call free_netdev() in ->destructor
+ * and unregister it on failure after registration
+ * so that device could be finally freed in rtnl_unlock.
+ */
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(dev);
+ goto out;
+ }
+ } else {
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
+ }
+ err = rtnl_configure_link(dev, ifm);
+ if (err < 0)
+ unregister_netdevice(dev);
out:
- if (send_addr_notify)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ put_net(dest_net);
+ return err;
+ }
+}
+
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct sk_buff *nskb;
+ int err;
+ u32 ext_filter_mask = 0;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ return -EINVAL;
+
+ if (dev == NULL)
+ return -ENODEV;
+
+ nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
+ if (nskb == NULL)
+ return -ENOBUFS;
+
+ err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
- dev_put(dev);
return err;
}
-static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ u16 min_ifinfo_dump_size = 0;
+ int hdrlen;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ if (!ext_filter_mask)
+ return NLMSG_GOODSIZE;
+ /*
+ * traverse the list of net devices and compute the minimum
+ * buffer size based upon the filter mask.
+ */
+ list_for_each_entry(dev, &net->dev_base_head, dev_list) {
+ min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
+ if_nlmsg_size(dev,
+ ext_filter_mask));
+ }
+
+ return min_ifinfo_dump_size;
+}
+
+static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx;
int s_idx = cb->family;
if (s_idx == 0)
s_idx = 1;
- for (idx=1; idx<NPROTO; idx++) {
+ for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
int type = cb->nlh->nlmsg_type-RTM_BASE;
if (idx < s_idx || idx == PF_PACKET)
continue;
- if (rtnetlink_links[idx] == NULL ||
- rtnetlink_links[idx][type].dumpit == NULL)
+ if (rtnl_msg_handlers[idx] == NULL ||
+ rtnl_msg_handlers[idx][type].dumpit == NULL)
continue;
- if (idx > s_idx)
+ if (idx > s_idx) {
memset(&cb->args[0], 0, sizeof(cb->args));
- if (rtnetlink_links[idx][type].dumpit(skb, cb))
+ cb->prev_seq = 0;
+ cb->seq = 0;
+ }
+ if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
break;
}
cb->family = idx;
@@ -444,178 +2189,742 @@ static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ size_t if_info_size;
+
+ skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);
+ if (skb == NULL)
+ goto errout;
+
+ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+}
+EXPORT_SYMBOL(rtmsg_ifinfo);
+
+static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ u8 *addr, u32 pid, u32 seq,
+ int type, unsigned int flags,
+ int nlflags)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = flags;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dev->ifindex;
+ ndm->ndm_state = NUD_PERMANENT;
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_fdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
+}
+
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)
{
+ struct net *net = dev_net(dev);
struct sk_buff *skb;
- int size = NLMSG_SPACE(sizeof(struct ifinfomsg) +
- sizeof(struct rtnl_link_ifmap) +
- sizeof(struct rtnl_link_stats) + 128);
+ int err = -ENOBUFS;
- skb = alloc_skb(size, GFP_KERNEL);
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC);
if (!skb)
- return;
+ goto errout;
- if (rtnetlink_fill_ifinfo(skb, dev, type, current->pid, 0, change, 0) < 0) {
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0);
+ if (err < 0) {
kfree_skb(skb);
- return;
+ goto errout;
}
- NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
- netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
-/* Protected by RTNL sempahore. */
-static struct rtattr **rta_buf;
-static int rtattr_max;
+/**
+ * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
+ */
+int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 flags)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
-/* Process one rtnetlink message. */
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_add_excl(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_add_excl(dev, addr);
+
+ /* Only return duplicate errors if NLM_F_EXCL is set */
+ if (err == -EEXIST && !(flags & NLM_F_EXCL))
+ err = 0;
-static __inline__ int
-rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_add);
+
+static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct rtnetlink_link *link;
- struct rtnetlink_link *link_tab;
- int sz_idx, kind;
- int min_len;
- int family;
- int type;
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ u8 *addr;
int err;
- /* Only requests are handled by kernel now */
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ return err;
- type = nlh->nlmsg_type;
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
+ return -EINVAL;
+ }
- /* A control message: ignore them */
- if (type < RTM_BASE)
- return 0;
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
+ return -ENODEV;
+ }
- /* Unknown message: reply with EINVAL */
- if (type > RTM_MAX)
- goto err_inval;
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
+ return -EINVAL;
+ }
- type -= RTM_BASE;
+ addr = nla_data(tb[NDA_LLADDR]);
- /* All the messages must have at least 1 byte length */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
- return 0;
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
- family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
- if (family >= NPROTO) {
- *errp = -EAFNOSUPPORT;
- return -1;
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
}
- link_tab = rtnetlink_links[family];
- if (link_tab == NULL)
- link_tab = rtnetlink_links[PF_UNSPEC];
- link = &link_tab[type];
+ /* Embedded bridge, macvlan, and any other device support */
+ if ((ndm->ndm_flags & NTF_SELF)) {
+ if (dev->netdev_ops->ndo_fdb_add)
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ nlh->nlmsg_flags);
+ else
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+ nlh->nlmsg_flags);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
- sz_idx = type>>2;
- kind = type&3;
+/**
+ * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
+ */
+int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr)
+{
+ int err = -EOPNOTSUPP;
- if (kind != 2 && security_netlink_recv(skb)) {
- *errp = -EPERM;
- return -1;
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (!(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return -EINVAL;
}
- if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
- if (link->dumpit == NULL)
- link = &(rtnetlink_links[PF_UNSPEC][type]);
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_del(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_del(dev, addr);
+ else
+ err = -EINVAL;
- if (link->dumpit == NULL)
- goto err_inval;
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_del);
+
+static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ int err = -EINVAL;
+ __u8 *addr;
- if ((*errp = netlink_dump_start(rtnl, skb, nlh,
- link->dumpit, NULL)) != 0) {
- return -1;
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr);
+
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if (ndm->ndm_flags & NTF_SELF) {
+ if (dev->netdev_ops->ndo_fdb_del)
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
+ ndm->ndm_flags &= ~NTF_SELF;
}
+ }
+out:
+ return err;
+}
- netlink_queue_skip(nlh, skb);
- return -1;
+static int nlmsg_populate_fdb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int *idx,
+ struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+ u32 portid, seq;
+
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (*idx < cb->args[0])
+ goto skip;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
+ portid, seq,
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI);
+ if (err < 0)
+ return err;
+skip:
+ *idx += 1;
}
+ return 0;
+}
+
+/**
+ * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
+ * @nlh: netlink message header
+ * @dev: netdevice
+ *
+ * Default netdevice operation to dump the existing unicast address list.
+ * Returns number of addresses from list put in skb.
+ */
+int ndo_dflt_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int idx)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc);
+ if (err)
+ goto out;
+ nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
+out:
+ netif_addr_unlock_bh(dev);
+ return idx;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_dump);
+
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx = 0;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ struct net_device *br_dev;
+ const struct net_device_ops *ops;
+
+ br_dev = netdev_master_upper_dev_get(dev);
+ ops = br_dev->netdev_ops;
+ if (ops->ndo_fdb_dump)
+ idx = ops->ndo_fdb_dump(skb, cb, dev, idx);
+ }
+
+ if (dev->netdev_ops->ndo_fdb_dump)
+ idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx);
+ else
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, idx);
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode)
+{
+ struct nlmsghdr *nlh;
+ struct ifinfomsg *ifm;
+ struct nlattr *br_afspec;
+ u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_BRIDGE;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = 0;
+
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (br_dev &&
+ nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev->iflink &&
+ nla_put_u32(skb, IFLA_LINK, dev->iflink)))
+ goto nla_put_failure;
+
+ br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!br_afspec)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) ||
+ nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, br_afspec);
+
+ return nlmsg_end(skb, nlh);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx = 0;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ u32 seq = cb->nlh->nlmsg_seq;
+ struct nlattr *extfilt;
+ u32 filter_mask = 0;
+
+ extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
+ IFLA_EXT_MASK);
+ if (extfilt)
+ filter_mask = nla_get_u32(extfilt);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0] &&
+ br_dev->netdev_ops->ndo_bridge_getlink(
+ skb, portid, seq, dev, filter_mask) < 0)
+ break;
+ idx++;
+ }
+
+ if (ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0] &&
+ ops->ndo_bridge_getlink(skb, portid, seq, dev,
+ filter_mask) < 0)
+ break;
+ idx++;
+ }
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static inline size_t bridge_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_MTU */
+ + nla_total_size(sizeof(u32)) /* IFLA_LINK */
+ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */
+ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */
+ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */
+ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev, u16 flags)
+{
+ struct net *net = dev_net(dev);
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ struct sk_buff *skb;
+ int err = -EOPNOTSUPP;
+
+ skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) &&
+ br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+ if (err < 0)
+ goto errout;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF) &&
+ dev->netdev_ops->ndo_bridge_getlink) {
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+ if (err < 0)
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return 0;
+errout:
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return err;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 oflags, flags = 0;
+ bool have_flags = false;
- memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
- min_len = rtm_min[sz_idx];
- if (nlh->nlmsg_len < min_len)
- goto err_inval;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
- if (nlh->nlmsg_len > min_len) {
- int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
- struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len);
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ return -ENODEV;
+ }
- while (RTA_OK(attr, attrlen)) {
- unsigned flavor = attr->rta_type;
- if (flavor) {
- if (flavor > rta_max[sz_idx])
- goto err_inval;
- rta_buf[flavor-1] = attr;
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
}
- attr = RTA_NEXT(attr, attrlen);
}
}
- if (link->doit == NULL)
- link = &(rtnetlink_links[PF_UNSPEC][type]);
- if (link->doit == NULL)
- goto err_inval;
- err = link->doit(skb, nlh, (void *)&rta_buf[0]);
+ oflags = flags;
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_setlink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
- *errp = err;
+ if (!err)
+ flags &= ~BRIDGE_FLAGS_SELF;
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+ /* Generate event to notify upper layer of bridge change */
+ if (!err)
+ err = rtnl_bridge_notify(dev, oflags);
+out:
return err;
+}
+
+static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 oflags, flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
-err_inval:
- *errp = -EINVAL;
- return -1;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
+ oflags = flags;
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_dellink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
+
+ if (!err)
+ flags &= ~BRIDGE_FLAGS_SELF;
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+ /* Generate event to notify upper layer of bridge change */
+ if (!err)
+ err = rtnl_bridge_notify(dev, oflags);
+out:
+ return err;
}
-static void rtnetlink_rcv(struct sock *sk, int len)
+/* Process one rtnetlink message. */
+
+static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- unsigned int qlen = 0;
+ struct net *net = sock_net(skb->sk);
+ rtnl_doit_func doit;
+ int sz_idx, kind;
+ int family;
+ int type;
+ int err;
- do {
+ type = nlh->nlmsg_type;
+ if (type > RTM_MAX)
+ return -EOPNOTSUPP;
+
+ type -= RTM_BASE;
+
+ /* All the messages must have at least 1 byte length */
+ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
+ return 0;
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+ sz_idx = type>>2;
+ kind = type&3;
+
+ if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ struct sock *rtnl;
+ rtnl_dumpit_func dumpit;
+ rtnl_calcit_func calcit;
+ u16 min_dump_alloc = 0;
+
+ dumpit = rtnl_get_dumpit(family, type);
+ if (dumpit == NULL)
+ return -EOPNOTSUPP;
+ calcit = rtnl_get_calcit(family, type);
+ if (calcit)
+ min_dump_alloc = calcit(skb, nlh);
+
+ __rtnl_unlock();
+ rtnl = net->rtnl;
+ {
+ struct netlink_dump_control c = {
+ .dump = dumpit,
+ .min_dump_alloc = min_dump_alloc,
+ };
+ err = netlink_dump_start(rtnl, skb, nlh, &c);
+ }
rtnl_lock();
- netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg);
- up(&rtnl_sem);
+ return err;
+ }
+
+ doit = rtnl_get_doit(family, type);
+ if (doit == NULL)
+ return -EOPNOTSUPP;
- netdev_run_todo();
- } while (qlen);
+ return doit(skb, nlh);
}
-static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
+static void rtnetlink_rcv(struct sk_buff *skb)
{
- [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
- [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink },
- [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
- [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
- [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
- [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
- [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info },
- [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
- [RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info },
- [RTM_SETNEIGHTBL - RTM_BASE] = { .doit = neightbl_set },
-};
+ rtnl_lock();
+ netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
+ rtnl_unlock();
+}
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
switch (event) {
- case NETDEV_UNREGISTER:
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
- break;
- case NETDEV_REGISTER:
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
- break;
case NETDEV_UP:
case NETDEV_DOWN:
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
- break;
+ case NETDEV_PRE_UP:
+ case NETDEV_POST_INIT:
+ case NETDEV_REGISTER:
case NETDEV_CHANGE:
+ case NETDEV_PRE_TYPE_CHANGE:
case NETDEV_GOING_DOWN:
+ case NETDEV_UNREGISTER:
+ case NETDEV_UNREGISTER_FINAL:
+ case NETDEV_RELEASE:
+ case NETDEV_JOIN:
break;
default:
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
break;
}
return NOTIFY_DONE;
@@ -625,35 +2934,57 @@ static struct notifier_block rtnetlink_dev_notifier = {
.notifier_call = rtnetlink_event,
};
-void __init rtnetlink_init(void)
+
+static int __net_init rtnetlink_net_init(struct net *net)
{
- int i;
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .groups = RTNLGRP_MAX,
+ .input = rtnetlink_rcv,
+ .cb_mutex = &rtnl_mutex,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
+ if (!sk)
+ return -ENOMEM;
+ net->rtnl = sk;
+ return 0;
+}
- rtattr_max = 0;
- for (i = 0; i < ARRAY_SIZE(rta_max); i++)
- if (rta_max[i] > rtattr_max)
- rtattr_max = rta_max[i];
- rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL);
- if (!rta_buf)
- panic("rtnetlink_init: cannot allocate rta_buf\n");
+static void __net_exit rtnetlink_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->rtnl);
+ net->rtnl = NULL;
+}
- rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
- THIS_MODULE);
- if (rtnl == NULL)
+static struct pernet_operations rtnetlink_net_ops = {
+ .init = rtnetlink_net_init,
+ .exit = rtnetlink_net_exit,
+};
+
+void __init rtnetlink_init(void)
+{
+ if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
- netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
+
register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table;
- rtnetlink_links[PF_PACKET] = link_rtnetlink_table;
+
+ rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
+ rtnl_dump_ifinfo, rtnl_calcit);
+ rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
+
+ rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
+
+ rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
}
-EXPORT_SYMBOL(__rta_fill);
-EXPORT_SYMBOL(rtattr_strlcpy);
-EXPORT_SYMBOL(rtattr_parse);
-EXPORT_SYMBOL(rtnetlink_links);
-EXPORT_SYMBOL(rtnetlink_put_metrics);
-EXPORT_SYMBOL(rtnl);
-EXPORT_SYMBOL(rtnl_lock);
-EXPORT_SYMBOL(rtnl_lock_interruptible);
-EXPORT_SYMBOL(rtnl_sem);
-EXPORT_SYMBOL(rtnl_unlock);
diff --git a/net/core/scm.c b/net/core/scm.c
index 649d01ef35b..b442e7e25e6 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -24,8 +24,11 @@
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/security.h>
+#include <linux/pid_namespace.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/protocol.h>
@@ -33,20 +36,29 @@
#include <net/sock.h>
#include <net/compat.h>
#include <net/scm.h>
+#include <net/cls_cgroup.h>
/*
- * Only allow a user to send credentials, that they could set with
+ * Only allow a user to send credentials, that they could set with
* setu(g)id.
*/
static __inline__ int scm_check_creds(struct ucred *creds)
{
- if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) &&
- ((creds->uid == current->uid || creds->uid == current->euid ||
- creds->uid == current->suid) || capable(CAP_SETUID)) &&
- ((creds->gid == current->gid || creds->gid == current->egid ||
- creds->gid == current->sgid) || capable(CAP_SETGID))) {
+ const struct cred *cred = current_cred();
+ kuid_t uid = make_kuid(cred->user_ns, creds->uid);
+ kgid_t gid = make_kgid(cred->user_ns, creds->gid);
+
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
+
+ if ((creds->pid == task_tgid_vnr(current) ||
+ ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
+ ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
+ uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
+ ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+ gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
return 0;
}
return -EPERM;
@@ -74,22 +86,23 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
+ fpl->max = SCM_MAX_FD;
}
fpp = &fpl->fp[fpl->count];
- if (fpl->count + num > SCM_MAX_FD)
+ if (fpl->count + num > fpl->max)
return -EINVAL;
-
+
/*
* Verify the descriptors and increment the usage count.
*/
-
+
for (i=0; i< num; i++)
{
int fd = fdp[i];
struct file *file;
- if (fd < 0 || !(file = fget(fd)))
+ if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
*fpp++ = file;
fpl->count++;
@@ -109,6 +122,7 @@ void __scm_destroy(struct scm_cookie *scm)
kfree(fpl);
}
}
+EXPORT_SYMBOL(__scm_destroy);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
@@ -123,7 +137,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
/* The first check was omitted in <= 2.2.5. The reasoning was
that parser checks cmsg_len in any case, so that
additional check would be work duplication.
- But if cmsg_level is not SOL_SOCKET, we do not check
+ But if cmsg_level is not SOL_SOCKET, we do not check
for too short ancillary data object at all! Oops.
OK, let's add it...
*/
@@ -136,18 +150,45 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
+ if (!sock->ops || sock->ops->family != PF_UNIX)
+ goto error;
err=scm_fp_copy(cmsg, &p->fp);
if (err<0)
goto error;
break;
case SCM_CREDENTIALS:
+ {
+ struct ucred creds;
+ kuid_t uid;
+ kgid_t gid;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
goto error;
- memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
- err = scm_check_creds(&p->creds);
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&creds);
if (err)
goto error;
+
+ p->creds.pid = creds.pid;
+ if (!p->pid || pid_vnr(p->pid) != creds.pid) {
+ struct pid *pid;
+ err = -ESRCH;
+ pid = find_get_pid(creds.pid);
+ if (!pid)
+ goto error;
+ put_pid(p->pid);
+ p->pid = pid;
+ }
+
+ err = -EINVAL;
+ uid = make_kuid(current_user_ns(), creds.uid);
+ gid = make_kgid(current_user_ns(), creds.gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ goto error;
+
+ p->creds.uid = uid;
+ p->creds.gid = gid;
break;
+ }
default:
goto error;
}
@@ -159,15 +200,17 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
p->fp = NULL;
}
return 0;
-
+
error:
scm_destroy(p);
return err;
}
+EXPORT_SYMBOL(__scm_send);
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
- struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control;
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user *)msg->msg_control;
struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
int err;
@@ -189,20 +232,24 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
err = -EFAULT;
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
- goto out;
+ goto out;
if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
goto out;
cmlen = CMSG_SPACE(len);
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
err = 0;
out:
return err;
}
+EXPORT_SYMBOL(put_cmsg);
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
{
- struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control;
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user*)msg->msg_control;
int fdmax = 0;
int fdnum = scm->fp->count;
@@ -222,13 +269,16 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
if (fdnum < fdmax)
fdmax = fdnum;
- for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++)
+ for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
+ i++, cmfptr++)
{
+ struct socket *sock;
int new_fd;
err = security_file_receive(fp[i]);
if (err)
break;
- err = get_unused_fd();
+ err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
+ ? O_CLOEXEC : 0);
if (err < 0)
break;
new_fd = err;
@@ -238,15 +288,18 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
break;
}
/* Bump the usage count and install the file. */
- get_file(fp[i]);
- fd_install(new_fd, fp[i]);
+ sock = sock_from_file(fp[i], &err);
+ if (sock) {
+ sock_update_netprioidx(sock->sk);
+ sock_update_classid(sock->sk);
+ }
+ fd_install(new_fd, get_file(fp[i]));
}
if (i > 0)
{
int cmlen = CMSG_LEN(i*sizeof(int));
- if (!err)
- err = put_user(SOL_SOCKET, &cm->cmsg_level);
+ err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
@@ -266,6 +319,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
*/
__scm_destroy(scm);
}
+EXPORT_SYMBOL(scm_detach_fds);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
{
@@ -275,17 +329,13 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (!fpl)
return NULL;
- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
+ new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
+ GFP_KERNEL);
if (new_fpl) {
- for (i=fpl->count-1; i>=0; i--)
+ for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
- memcpy(new_fpl, fpl, sizeof(*fpl));
+ new_fpl->max = new_fpl->count;
}
return new_fpl;
}
-
-EXPORT_SYMBOL(__scm_destroy);
-EXPORT_SYMBOL(__scm_send);
-EXPORT_SYMBOL(put_cmsg);
-EXPORT_SYMBOL(scm_detach_fds);
EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
new file mode 100644
index 00000000000..ba71212f025
--- /dev/null
+++ b/net/core/secure_seq.c
@@ -0,0 +1,173 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cryptohash.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/random.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+#include <linux/net.h>
+
+#include <net/secure_seq.h>
+
+#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
+
+static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+
+static __always_inline void net_secret_init(void)
+{
+ net_get_random_once(net_secret, sizeof(net_secret));
+}
+#endif
+
+#ifdef CONFIG_INET
+static u32 seq_scale(u32 seq)
+{
+ /*
+ * As close as possible to RFC 793, which
+ * suggests using a 250 kHz clock.
+ * Further reading shows this assumes 2 Mb/s networks.
+ * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
+ * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
+ * we also need to limit the resolution so that the u32 seq
+ * overlaps less than one time per MSL (2 minutes).
+ * Choosing a clock of 64 ns period is OK. (period of 274 s)
+ */
+ return seq + (ktime_to_ns(ktime_get_real()) >> 6);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + (__force u32)daddr[i];
+ secret[4] = net_secret[4] +
+ (((__force u16)sport << 16) + (__force u16)dport);
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ return seq_scale(hash[0]);
+}
+EXPORT_SYMBOL(secure_tcpv6_sequence_number);
+
+u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+ __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + (__force u32) daddr[i];
+ secret[4] = net_secret[4] + (__force u32)dport;
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ return hash[0];
+}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
+#endif
+
+#ifdef CONFIG_INET
+
+__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ return seq_scale(hash[0]);
+}
+
+u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = (__force u32)dport ^ net_secret[14];
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ return hash[0];
+}
+EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
+#endif
+
+#if IS_ENABLED(CONFIG_IP_DCCP)
+u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+ u64 seq;
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ seq = hash[0] | (((u64)hash[1]) << 32);
+ seq += ktime_to_ns(ktime_get_real());
+ seq &= (1ull << 48) - 1;
+
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccp_sequence_number);
+
+#if IS_ENABLED(CONFIG_IPV6)
+u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u64 seq;
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + daddr[i];
+ secret[4] = net_secret[4] +
+ (((__force u16)sport << 16) + (__force u16)dport);
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ seq = hash[0] | (((u64)hash[1]) << 32);
+ seq += ktime_to_ns(ktime_get_real());
+ seq &= (1ull << 48) - 1;
+
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccpv6_sequence_number);
+#endif
+#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d0732e9c856..c1a33033cbe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1,11 +1,9 @@
/*
* Routines having to do with the 'struct sk_buff' memory handlers.
*
- * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
* Florian La Roche <rzsfl@rz.uni-sb.de>
*
- * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $
- *
* Fixes:
* Alan Cox : Fixed the worst of the load
* balancer bugs.
@@ -38,78 +36,114 @@
* The functions in this file will not compile correctly with gcc 2.4.x
*/
-#include <linux/config.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
+#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/highmem.h>
+#include <linux/scatterlist.h>
+#include <linux/errqueue.h>
+#include <linux/prefetch.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
-
-static kmem_cache_t *skbuff_head_cache __read_mostly;
-static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+#include <trace/events/skb.h>
+#include <linux/highmem.h>
-/*
- * Keep out-of-line to prevent kernel bloat.
- * __builtin_return_address is not used because it is not always
- * reliable.
- */
+struct kmem_cache *skbuff_head_cache __read_mostly;
+static struct kmem_cache *skbuff_fclone_cache __read_mostly;
/**
- * skb_over_panic - private function
- * @skb: buffer
- * @sz: size
- * @here: address
+ * skb_panic - private function for out-of-line support
+ * @skb: buffer
+ * @sz: size
+ * @addr: address
+ * @msg: skb_over_panic or skb_under_panic
*
- * Out of line support code for skb_put(). Not user callable.
+ * Out-of-line support for skb_put() and skb_push().
+ * Called via the wrapper skb_over_panic() or skb_under_panic().
+ * Keep out of line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always reliable.
*/
-void skb_over_panic(struct sk_buff *skb, int sz, void *here)
+static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
+ const char msg[])
{
- printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%p end:%p dev:%s\n",
- here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
+ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ msg, addr, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
BUG();
}
-/**
- * skb_under_panic - private function
- * @skb: buffer
- * @sz: size
- * @here: address
- *
- * Out of line support code for skb_push(). Not user callable.
+static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
*/
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
-void skb_under_panic(struct sk_buff *skb, int sz, void *here)
+static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
+ unsigned long ip, bool *pfmemalloc)
{
- printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
- "data:%p tail:%p end:%p dev:%s\n",
- here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
- skb->dev ? skb->dev->name : "<NULL>");
- BUG();
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
}
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
@@ -118,150 +152,343 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
*
*/
+struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
+{
+ struct sk_buff *skb;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc_node(skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA, node);
+ if (!skb)
+ goto out;
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->head = NULL;
+ skb->truesize = sizeof(struct sk_buff);
+ atomic_set(&skb->users, 1);
+
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+out:
+ return skb;
+}
+
/**
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- * and allocate a cloned (child) skb
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of size bytes. The object has a reference count of one.
- * The return is the buffer. On a failure the return is %NULL.
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone)
+ int flags, int node)
{
+ struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
+ bool pfmemalloc;
+
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */
- skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
if (!skb)
goto out;
+ prefetchw(skb);
- /* Get the DATA. Size must match skb_add_mtu(). */
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
size = SKB_DATA_ALIGN(size);
- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
if (!data)
goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
+ prefetchw(data + size);
- memset(skb, 0, offsetof(struct sk_buff, truesize));
- skb->truesize = size + sizeof(struct sk_buff);
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ /* Account for allocated memory : skb + skb->head */
+ skb->truesize = SKB_TRUESIZE(size);
+ skb->pfmemalloc = pfmemalloc;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
- skb->tail = data;
- skb->end = data + size;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
- shinfo->nr_frags = 0;
- shinfo->tso_size = 0;
- shinfo->tso_segs = 0;
- shinfo->ufo_size = 0;
- shinfo->ip6_frag_id = 0;
- shinfo->frag_list = NULL;
-
- if (fclone) {
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+ if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ kmemcheck_annotate_bitfield(child, flags1);
+ kmemcheck_annotate_bitfield(child, flags2);
skb->fclone = SKB_FCLONE_ORIG;
atomic_set(fclone_ref, 1);
child->fclone = SKB_FCLONE_UNAVAILABLE;
+ child->pfmemalloc = pfmemalloc;
}
out:
return skb;
nodata:
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(cache, skb);
skb = NULL;
goto out;
}
+EXPORT_SYMBOL(__alloc_skb);
/**
- * alloc_skb_from_cache - allocate a network buffer
- * @cp: kmem_cache from which to allocate the data area
- * (object size must be big enough for @size bytes + skb overheads)
- * @size: size to allocate
- * @gfp_mask: allocation mask
- *
- * Allocate a new &sk_buff. The returned buffer has no headroom and
- * tail room of size bytes. The object has a reference count of one.
- * The return is the buffer. On a failure the return is %NULL.
+ * build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
*
- * Buffers may only be allocated from interrupts using a @gfp_mask of
- * %GFP_ATOMIC.
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc() only if
+ * @frag_size is 0, otherwise data should come from the page allocator.
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes :
+ * Before IO, driver allocates only data buffer where NIC put incoming frame
+ * Driver should add room at head (NET_SKB_PAD) and
+ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
+ * After IO, driver calls build_skb(), to allocate sk_buff and populate it
+ * before giving packet to stack.
+ * RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
- unsigned int size,
- gfp_t gfp_mask)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
+ struct skb_shared_info *shinfo;
struct sk_buff *skb;
- u8 *data;
+ unsigned int size = frag_size ? : ksize(data);
- /* Get the HEAD */
- skb = kmem_cache_alloc(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
- goto out;
+ return NULL;
- /* Get the DATA. */
- size = SKB_DATA_ALIGN(size);
- data = kmem_cache_alloc(cp, gfp_mask);
- if (!data)
- goto nodata;
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- memset(skb, 0, offsetof(struct sk_buff, truesize));
- skb->truesize = size + sizeof(struct sk_buff);
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+ skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
- skb->tail = data;
- skb->end = data + size;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->tso_size = 0;
- skb_shinfo(skb)->tso_segs = 0;
- skb_shinfo(skb)->frag_list = NULL;
-out:
return skb;
-nodata:
- kmem_cache_free(skbuff_head_cache, skb);
- skb = NULL;
- goto out;
}
+EXPORT_SYMBOL(build_skb);
+struct netdev_alloc_cache {
+ struct page_frag frag;
+ /* we maintain a pagecount bias, so that we dont dirty cache line
+ * containing page->_count every time we allocate a fragment.
+ */
+ unsigned int pagecnt_bias;
+};
+static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-static void skb_drop_fraglist(struct sk_buff *skb)
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ struct netdev_alloc_cache *nc;
+ void *data = NULL;
+ int order;
+ unsigned long flags;
- skb_shinfo(skb)->frag_list = NULL;
+ local_irq_save(flags);
+ nc = &__get_cpu_var(netdev_alloc_cache);
+ if (unlikely(!nc->frag.page)) {
+refill:
+ for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
+ gfp_t gfp = gfp_mask;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN;
+ nc->frag.page = alloc_pages(gfp, order);
+ if (likely(nc->frag.page))
+ break;
+ if (--order < 0)
+ goto end;
+ }
+ nc->frag.size = PAGE_SIZE << order;
+recycle:
+ atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
+ nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
+ nc->frag.offset = 0;
+ }
- do {
- struct sk_buff *this = list;
- list = list->next;
- kfree_skb(this);
- } while (list);
+ if (nc->frag.offset + fragsz > nc->frag.size) {
+ /* avoid unnecessary locked operations if possible */
+ if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
+ atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
+ goto recycle;
+ goto refill;
+ }
+
+ data = page_address(nc->frag.page) + nc->frag.offset;
+ nc->frag.offset += fragsz;
+ nc->pagecnt_bias--;
+end:
+ local_irq_restore(flags);
+ return data;
+}
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+ return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(netdev_alloc_frag);
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has unspecified headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb = NULL;
+ unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ void *data;
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = __netdev_alloc_frag(fragsz, gfp_mask);
+
+ if (likely(data)) {
+ skb = build_skb(data, fragsz);
+ if (unlikely(!skb))
+ put_page(virt_to_head_page(data));
+ }
+ } else {
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
+ }
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb);
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+ int size, unsigned int truesize)
+{
+ skb_fill_page_desc(skb, i, page, off, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_add_rx_frag);
+
+void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
+ unsigned int truesize)
+{
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ skb_frag_size_add(frag, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_coalesce_rx_frag);
+
+static void skb_drop_list(struct sk_buff **listp)
+{
+ kfree_skb_list(*listp);
+ *listp = NULL;
+}
+
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+ skb_drop_list(&skb_shinfo(skb)->frag_list);
}
static void skb_clone_fraglist(struct sk_buff *skb)
{
struct sk_buff *list;
- for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
+ skb_walk_frags(skb, list)
skb_get(list);
}
-void skb_release_data(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
+static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
@@ -269,25 +496,36 @@ void skb_release_data(struct sk_buff *skb)
if (skb_shinfo(skb)->nr_frags) {
int i;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
+ }
+
+ /*
+ * If skb buf is from userspace, we need to notify the caller
+ * the lower device DMA has done;
+ */
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
+
+ uarg = skb_shinfo(skb)->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, true);
}
- if (skb_shinfo(skb)->frag_list)
+ if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
- kfree(skb->head);
+ skb_free_head(skb);
}
}
/*
* Free an skbuff by memory without cleaning the state.
*/
-void kfree_skbmem(struct sk_buff *skb)
+static void kfree_skbmem(struct sk_buff *skb)
{
struct sk_buff *other;
atomic_t *fclone_ref;
- skb_release_data(skb);
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(skbuff_head_cache, skb);
@@ -311,21 +549,12 @@ void kfree_skbmem(struct sk_buff *skb)
if (atomic_dec_and_test(fclone_ref))
kmem_cache_free(skbuff_fclone_cache, other);
break;
- };
+ }
}
-/**
- * __kfree_skb - private function
- * @skb: buffer
- *
- * Free an sk_buff. Release anything attached to the buffer.
- * Clean the state. This is an internal helper function. Users should
- * always call kfree_skb
- */
-
-void __kfree_skb(struct sk_buff *skb)
+static void skb_release_head_state(struct sk_buff *skb)
{
- dst_release(skb->dst);
+ skb_dst_drop(skb);
#ifdef CONFIG_XFRM
secpath_put(skb->sp);
#endif
@@ -333,15 +562,12 @@ void __kfree_skb(struct sk_buff *skb)
WARN_ON(in_irq());
skb->destructor(skb);
}
-#ifdef CONFIG_NETFILTER
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- nf_conntrack_put_reasm(skb->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(skb->nf_bridge);
#endif
-#endif
/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
skb->tc_index = 0;
@@ -349,9 +575,268 @@ void __kfree_skb(struct sk_buff *skb)
skb->tc_verd = 0;
#endif
#endif
+}
+
+/* Free everything but the sk_buff shell. */
+static void skb_release_all(struct sk_buff *skb)
+{
+ skb_release_head_state(skb);
+ if (likely(skb->head))
+ skb_release_data(skb);
+}
+
+/**
+ * __kfree_skb - private function
+ * @skb: buffer
+ *
+ * Free an sk_buff. Release anything attached to the buffer.
+ * Clean the state. This is an internal helper function. Users should
+ * always call kfree_skb
+ */
+void __kfree_skb(struct sk_buff *skb)
+{
+ skb_release_all(skb);
kfree_skbmem(skb);
}
+EXPORT_SYMBOL(__kfree_skb);
+
+/**
+ * kfree_skb - free an sk_buff
+ * @skb: buffer to free
+ *
+ * Drop a reference to the buffer and free it if the usage count has
+ * hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return;
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ trace_kfree_skb(skb, __builtin_return_address(0));
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(kfree_skb);
+
+void kfree_skb_list(struct sk_buff *segs)
+{
+ while (segs) {
+ struct sk_buff *next = segs->next;
+
+ kfree_skb(segs);
+ segs = next;
+ }
+}
+EXPORT_SYMBOL(kfree_skb_list);
+
+/**
+ * skb_tx_error - report an sk_buff xmit error
+ * @skb: buffer that triggered an error
+ *
+ * Report xmit error if a device callback is tracking this skb.
+ * skb must be freed afterwards.
+ */
+void skb_tx_error(struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
+
+ uarg = skb_shinfo(skb)->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, false);
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ }
+}
+EXPORT_SYMBOL(skb_tx_error);
+
+/**
+ * consume_skb - free an skbuff
+ * @skb: buffer to free
+ *
+ * Drop a ref to the buffer and free it if the usage count has hit zero
+ * Functions identically to kfree_skb, but kfree_skb assumes that the frame
+ * is being dropped after a failure and notes that
+ */
+void consume_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return;
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ trace_consume_skb(skb);
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
+
+static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ new->tstamp = old->tstamp;
+ new->dev = old->dev;
+ new->transport_header = old->transport_header;
+ new->network_header = old->network_header;
+ new->mac_header = old->mac_header;
+ new->inner_protocol = old->inner_protocol;
+ new->inner_transport_header = old->inner_transport_header;
+ new->inner_network_header = old->inner_network_header;
+ new->inner_mac_header = old->inner_mac_header;
+ skb_dst_copy(new, old);
+ skb_copy_hash(new, old);
+ new->ooo_okay = old->ooo_okay;
+ new->no_fcs = old->no_fcs;
+ new->encapsulation = old->encapsulation;
+ new->encap_hdr_csum = old->encap_hdr_csum;
+ new->csum_valid = old->csum_valid;
+ new->csum_complete_sw = old->csum_complete_sw;
+#ifdef CONFIG_XFRM
+ new->sp = secpath_get(old->sp);
+#endif
+ memcpy(new->cb, old->cb, sizeof(old->cb));
+ new->csum = old->csum;
+ new->ignore_df = old->ignore_df;
+ new->pkt_type = old->pkt_type;
+ new->ip_summed = old->ip_summed;
+ skb_copy_queue_mapping(new, old);
+ new->priority = old->priority;
+#if IS_ENABLED(CONFIG_IP_VS)
+ new->ipvs_property = old->ipvs_property;
+#endif
+ new->pfmemalloc = old->pfmemalloc;
+ new->protocol = old->protocol;
+ new->mark = old->mark;
+ new->skb_iif = old->skb_iif;
+ __nf_copy(new, old);
+#ifdef CONFIG_NET_SCHED
+ new->tc_index = old->tc_index;
+#ifdef CONFIG_NET_CLS_ACT
+ new->tc_verd = old->tc_verd;
+#endif
+#endif
+ new->vlan_proto = old->vlan_proto;
+ new->vlan_tci = old->vlan_tci;
+
+ skb_copy_secmark(new, old);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ new->napi_id = old->napi_id;
+#endif
+}
+
+/*
+ * You should not add any new code to this function. Add it to
+ * __copy_skb_header above instead.
+ */
+static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+#define C(x) n->x = skb->x
+
+ n->next = n->prev = NULL;
+ n->sk = NULL;
+ __copy_skb_header(n, skb);
+
+ C(len);
+ C(data_len);
+ C(mac_len);
+ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
+ n->cloned = 1;
+ n->nohdr = 0;
+ n->destructor = NULL;
+ C(tail);
+ C(end);
+ C(head);
+ C(head_frag);
+ C(data);
+ C(truesize);
+ atomic_set(&n->users, 1);
+
+ atomic_inc(&(skb_shinfo(skb)->dataref));
+ skb->cloned = 1;
+
+ return n;
+#undef C
+}
+
+/**
+ * skb_morph - morph one skb into another
+ * @dst: the skb to receive the contents
+ * @src: the skb to supply the contents
+ *
+ * This is identical to skb_clone except that the target skb is
+ * supplied by the user.
+ *
+ * The target skb is returned upon exit.
+ */
+struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
+{
+ skb_release_all(dst);
+ return __skb_clone(dst, src);
+}
+EXPORT_SYMBOL_GPL(skb_morph);
+
+/**
+ * skb_copy_ubufs - copy userspace skb frags buffers to kernel
+ * @skb: the skb to modify
+ * @gfp_mask: allocation priority
+ *
+ * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * It will copy all frags into kernel and drop the reference
+ * to userspace pages.
+ *
+ * If this function is called from an interrupt gfp_mask() must be
+ * %GFP_ATOMIC.
+ *
+ * Returns 0 on success or a negative error code on failure
+ * to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int i;
+ int num_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page, *head = NULL;
+ struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+
+ for (i = 0; i < num_frags; i++) {
+ u8 *vaddr;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+ page = alloc_page(gfp_mask);
+ if (!page) {
+ while (head) {
+ struct page *next = (struct page *)page_private(head);
+ put_page(head);
+ head = next;
+ }
+ return -ENOMEM;
+ }
+ vaddr = kmap_atomic(skb_frag_page(f));
+ memcpy(page_address(page),
+ vaddr + f->page_offset, skb_frag_size(f));
+ kunmap_atomic(vaddr);
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ /* skb frags release userspace buffers */
+ for (i = 0; i < num_frags; i++)
+ skb_frag_unref(skb, i);
+
+ uarg->callback(uarg, false);
+
+ /* skb frags point to kernel buffers */
+ for (i = num_frags - 1; i >= 0; i--) {
+ __skb_fill_page_desc(skb, i, head, 0,
+ skb_shinfo(skb)->frags[i].size);
+ head = (struct page *)page_private(head);
+ }
+
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_copy_ubufs);
/**
* skb_clone - duplicate an sk_buff
@@ -371,6 +856,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *n;
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
+
n = skb + 1;
if (skb->fclone == SKB_FCLONE_ORIG &&
n->fclone == SKB_FCLONE_UNAVAILABLE) {
@@ -378,133 +866,51 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->fclone = SKB_FCLONE_CLONE;
atomic_inc(fclone_ref);
} else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
+
+ kmemcheck_annotate_bitfield(n, flags1);
+ kmemcheck_annotate_bitfield(n, flags2);
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
-#define C(x) n->x = skb->x
-
- n->next = n->prev = NULL;
- n->sk = NULL;
- C(tstamp);
- C(dev);
- C(h);
- C(nh);
- C(mac);
- C(dst);
- dst_clone(skb->dst);
- C(sp);
-#ifdef CONFIG_INET
- secpath_get(skb->sp);
-#endif
- memcpy(n->cb, skb->cb, sizeof(skb->cb));
- C(len);
- C(data_len);
- C(csum);
- C(local_df);
- n->cloned = 1;
- n->nohdr = 0;
- C(pkt_type);
- C(ip_summed);
- C(priority);
- C(protocol);
- n->destructor = NULL;
-#ifdef CONFIG_NETFILTER
- C(nfmark);
- C(nfct);
- nf_conntrack_get(skb->nfct);
- C(nfctinfo);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- C(nfct_reasm);
- nf_conntrack_get_reasm(skb->nfct_reasm);
-#endif
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
- C(ipvs_property);
-#endif
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- C(nfct_reasm);
- nf_conntrack_get_reasm(skb->nfct_reasm);
-#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- C(nf_bridge);
- nf_bridge_get(skb->nf_bridge);
-#endif
-#endif /*CONFIG_NETFILTER*/
-#ifdef CONFIG_NET_SCHED
- C(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
- n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
- n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
- n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
- C(input_dev);
-#endif
+ return __skb_clone(n, skb);
+}
+EXPORT_SYMBOL(skb_clone);
-#endif
- C(truesize);
- atomic_set(&n->users, 1);
- C(head);
- C(data);
- C(tail);
- C(end);
+static void skb_headers_offset_update(struct sk_buff *skb, int off)
+{
+ /* Only adjust this if it actually is csum_start rather than csum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += off;
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->transport_header += off;
+ skb->network_header += off;
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += off;
+ skb->inner_transport_header += off;
+ skb->inner_network_header += off;
+ skb->inner_mac_header += off;
+}
- atomic_inc(&(skb_shinfo(skb)->dataref));
- skb->cloned = 1;
+static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ __copy_skb_header(new, old);
- return n;
+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
- /*
- * Shift between the two data areas in bytes
- */
- unsigned long offset = new->data - old->data;
-
- new->sk = NULL;
- new->dev = old->dev;
- new->priority = old->priority;
- new->protocol = old->protocol;
- new->dst = dst_clone(old->dst);
-#ifdef CONFIG_INET
- new->sp = secpath_get(old->sp);
-#endif
- new->h.raw = old->h.raw + offset;
- new->nh.raw = old->nh.raw + offset;
- new->mac.raw = old->mac.raw + offset;
- memcpy(new->cb, old->cb, sizeof(old->cb));
- new->local_df = old->local_df;
- new->fclone = SKB_FCLONE_UNAVAILABLE;
- new->pkt_type = old->pkt_type;
- new->tstamp = old->tstamp;
- new->destructor = NULL;
-#ifdef CONFIG_NETFILTER
- new->nfmark = old->nfmark;
- new->nfct = old->nfct;
- nf_conntrack_get(old->nfct);
- new->nfctinfo = old->nfctinfo;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- new->nfct_reasm = old->nfct_reasm;
- nf_conntrack_get_reasm(old->nfct_reasm);
-#endif
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
- new->ipvs_property = old->ipvs_property;
-#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
- new->nf_bridge = old->nf_bridge;
- nf_bridge_get(old->nf_bridge);
-#endif
-#endif
-#ifdef CONFIG_NET_SCHED
-#ifdef CONFIG_NET_CLS_ACT
- new->tc_verd = old->tc_verd;
-#endif
- new->tc_index = old->tc_index;
-#endif
- atomic_set(&new->users, 1);
- skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
- skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
}
/**
@@ -526,12 +932,11 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb->data - skb->head;
- /*
- * Allocate the copy buffer
- */
- struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len,
- gfp_mask);
+ int headerlen = skb_headroom(skb);
+ unsigned int size = skb_end_offset(skb) + skb->data_len;
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+
if (!n)
return NULL;
@@ -539,8 +944,6 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
skb_reserve(n, headerlen);
/* Set the tail pointer and length */
skb_put(n, skb->len);
- n->csum = skb->csum;
- n->ip_summed = skb->ip_summed;
if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
BUG();
@@ -548,12 +951,16 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
copy_skb_header(n, skb);
return n;
}
-
+EXPORT_SYMBOL(skb_copy);
/**
- * pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
* @skb: buffer to copy
+ * @headroom: headroom of new skb
* @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
*
* Make a copy of both an &sk_buff and part of its data, located
* in header. Fragmented data remain shared. This is used when
@@ -563,39 +970,43 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
{
- /*
- * Allocate the copy buffer
- */
- struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
+ unsigned int size = skb_headlen(skb) + headroom;
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb->data - skb->head);
+ skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
- memcpy(n->data, skb->data, n->len);
- n->csum = skb->csum;
- n->ip_summed = skb->ip_summed;
+ skb_copy_from_linear_data(skb, n->data, n->len);
+ n->truesize += skb->data_len;
n->data_len = skb->data_len;
n->len = skb->len;
if (skb_shinfo(skb)->nr_frags) {
int i;
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
+ }
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
- get_page(skb_shinfo(n)->frags[i].page);
+ skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
}
- if (skb_shinfo(skb)->frag_list) {
+ if (skb_has_frag_list(skb)) {
skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
skb_clone_fraglist(n);
}
@@ -604,6 +1015,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
out:
return n;
}
+EXPORT_SYMBOL(__pskb_copy_fclone);
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -612,8 +1024,8 @@ out:
* @ntail: room to add at tail
* @gfp_mask: allocation priority
*
- * Expands (or creates identical copy, if &nhead and &ntail are zero)
- * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
+ * Expands (or creates identical copy, if @nhead and @ntail are zero)
+ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
* reference count of 1. Returns zero in the case of success or error,
* if expansion failed. In the last case, &sk_buff is not changed.
*
@@ -626,48 +1038,77 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
{
int i;
u8 *data;
- int size = nhead + (skb->end - skb->head) + ntail;
+ int size = nhead + skb_end_offset(skb) + ntail;
long off;
+ BUG_ON(nhead < 0);
+
if (skb_shared(skb))
BUG();
size = SKB_DATA_ALIGN(size);
- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
+ size = SKB_WITH_OVERHEAD(ksize(data));
/* Copy only real data... and, alas, header. This should be
- * optimized for the cases when header is void. */
- memcpy(data + nhead, skb->head, skb->tail - skb->head);
- memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
+ * optimized for the cases when header is void.
+ */
+ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- get_page(skb_shinfo(skb)->frags[i].page);
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
- if (skb_shinfo(skb)->frag_list)
- skb_clone_fraglist(skb);
+ /*
+ * if shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be since all we did is relocate the values
+ */
+ if (skb_cloned(skb)) {
+ /* copy this zero copy skb frags */
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
- skb_release_data(skb);
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+ skb_release_data(skb);
+ } else {
+ skb_free_head(skb);
+ }
off = (data + nhead) - skb->head;
skb->head = data;
- skb->end = data + size;
+ skb->head_frag = 0;
skb->data += off;
- skb->tail += off;
- skb->mac.raw += off;
- skb->h.raw += off;
- skb->nh.raw += off;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+ off = nhead;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb->tail += off;
+ skb_headers_offset_update(skb, nhead);
skb->cloned = 0;
+ skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
return 0;
+nofrags:
+ kfree(data);
nodata:
return -ENOMEM;
}
+EXPORT_SYMBOL(pskb_expand_head);
/* Make private copy of skb with writable head and some headroom */
@@ -688,7 +1129,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
}
return skb2;
}
-
+EXPORT_SYMBOL(skb_realloc_headroom);
/**
* skb_copy_expand - copy and expand sk_buff
@@ -707,9 +1148,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
*
* You must pass %GFP_ATOMIC as the allocation priority if this function
* is called from an interrupt.
- *
- * BUG ALERT: ip_summed is not copied. Why does this work? Is it used
- * only by netfilter in the cases when checksum is recalculated? --ANK
*/
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
int newheadroom, int newtailroom,
@@ -718,8 +1156,10 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask);
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
+ int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
if (!n)
@@ -730,7 +1170,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/* Set the tail pointer and length */
skb_put(n, skb->len);
- head_copy_len = skb_headroom(skb);
+ head_copy_len = oldheadroom;
head_copy_off = 0;
if (newheadroom <= head_copy_len)
head_copy_len = newheadroom;
@@ -744,8 +1184,11 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
copy_skb_header(n, skb);
+ skb_headers_offset_update(n, newheadroom - oldheadroom);
+
return n;
}
+EXPORT_SYMBOL(skb_copy_expand);
/**
* skb_pad - zero pad the tail of an skb
@@ -756,73 +1199,223 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
* filled. Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire.
*
- * May return NULL in out of memory cases.
+ * May return error in out of memory cases. The skb is freed on error.
*/
-
-struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
+
+int skb_pad(struct sk_buff *skb, int pad)
{
- struct sk_buff *nskb;
-
+ int err;
+ int ntail;
+
/* If the skbuff is non linear tailroom is always zero.. */
- if (skb_tailroom(skb) >= pad) {
+ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
memset(skb->data+skb->len, 0, pad);
- return skb;
+ return 0;
+ }
+
+ ntail = skb->data_len + pad - (skb->end - skb->tail);
+ if (likely(skb_cloned(skb) || ntail > 0)) {
+ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+ if (unlikely(err))
+ goto free_skb;
}
-
- nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
+
+ /* FIXME: The use of this function with non-linear skb's really needs
+ * to be audited.
+ */
+ err = skb_linearize(skb);
+ if (unlikely(err))
+ goto free_skb;
+
+ memset(skb->data + skb->len, 0, pad);
+ return 0;
+
+free_skb:
kfree_skb(skb);
- if (nskb)
- memset(nskb->data+nskb->len, 0, pad);
- return nskb;
-}
-
-/* Trims skb to length len. It can change skb pointers, if "realloc" is 1.
- * If realloc==0 and trimming is impossible without change of data,
- * it is BUG().
+ return err;
+}
+EXPORT_SYMBOL(skb_pad);
+
+/**
+ * pskb_put - add data to the tail of a potentially fragmented buffer
+ * @skb: start of the buffer to use
+ * @tail: tail fragment of the buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the potentially
+ * fragmented buffer. @tail must be the last fragment of @skb -- or
+ * @skb itself. If this would exceed the total buffer size the kernel
+ * will panic. A pointer to the first byte of the extra data is
+ * returned.
*/
-int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
+unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+{
+ if (tail != skb) {
+ skb->data_len += len;
+ skb->len += len;
+ }
+ return skb_put(tail, len);
+}
+EXPORT_SYMBOL_GPL(pskb_put);
+
+/**
+ * skb_put - add data to a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer. If this would
+ * exceed the total buffer size the kernel will panic. A pointer to the
+ * first byte of the extra data is returned.
+ */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
{
+ unsigned char *tmp = skb_tail_pointer(skb);
+ SKB_LINEAR_ASSERT(skb);
+ skb->tail += len;
+ skb->len += len;
+ if (unlikely(skb->tail > skb->end))
+ skb_over_panic(skb, len, __builtin_return_address(0));
+ return tmp;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ * skb_push - add data to the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer at the buffer
+ * start. If this would exceed the total buffer headroom the kernel will
+ * panic. A pointer to the first byte of the extra data is returned.
+ */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+{
+ skb->data -= len;
+ skb->len += len;
+ if (unlikely(skb->data<skb->head))
+ skb_under_panic(skb, len, __builtin_return_address(0));
+ return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ * skb_pull - remove data from the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the next data in the buffer
+ * is returned. Once the data has been pulled future pushes will overwrite
+ * the old data.
+ */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+ return skb_pull_inline(skb, len);
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ * skb_trim - remove end from a buffer
+ * @skb: buffer to alter
+ * @len: new length
+ *
+ * Cut the length of a buffer down by removing data from the tail. If
+ * the buffer is already under the length specified it is not modified.
+ * The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->len > len)
+ __skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
+
+/* Trims skb to length len. It can change skb pointers.
+ */
+
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
+{
+ struct sk_buff **fragp;
+ struct sk_buff *frag;
int offset = skb_headlen(skb);
int nfrags = skb_shinfo(skb)->nr_frags;
int i;
+ int err;
- for (i = 0; i < nfrags; i++) {
- int end = offset + skb_shinfo(skb)->frags[i].size;
- if (end > len) {
- if (skb_cloned(skb)) {
- BUG_ON(!realloc);
- if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
- return -ENOMEM;
- }
- if (len <= offset) {
- put_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->nr_frags--;
- } else {
- skb_shinfo(skb)->frags[i].size = len - offset;
- }
+ if (skb_cloned(skb) &&
+ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
+ return err;
+
+ i = 0;
+ if (offset >= len)
+ goto drop_pages;
+
+ for (; i < nfrags; i++) {
+ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
+
+drop_pages:
+ skb_shinfo(skb)->nr_frags = i;
+
+ for (; i < nfrags; i++)
+ skb_frag_unref(skb, i);
+
+ if (skb_has_frag_list(skb))
+ skb_drop_fraglist(skb);
+ goto done;
+ }
+
+ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
+ fragp = &frag->next) {
+ int end = offset + frag->len;
+
+ if (skb_shared(frag)) {
+ struct sk_buff *nfrag;
+
+ nfrag = skb_clone(frag, GFP_ATOMIC);
+ if (unlikely(!nfrag))
+ return -ENOMEM;
+
+ nfrag->next = frag->next;
+ consume_skb(frag);
+ frag = nfrag;
+ *fragp = frag;
+ }
+
+ if (end < len) {
+ offset = end;
+ continue;
}
- offset = end;
+
+ if (end > len &&
+ unlikely((err = pskb_trim(frag, len - offset))))
+ return err;
+
+ if (frag->next)
+ skb_drop_list(&frag->next);
+ break;
}
- if (offset < len) {
+done:
+ if (len > skb_headlen(skb)) {
skb->data_len -= skb->len - len;
skb->len = len;
} else {
- if (len <= skb_headlen(skb)) {
- skb->len = len;
- skb->data_len = 0;
- skb->tail = skb->data + len;
- if (skb_shinfo(skb)->frag_list && !skb_cloned(skb))
- skb_drop_fraglist(skb);
- } else {
- skb->data_len -= skb->len - len;
- skb->len = len;
- }
+ skb->len = len;
+ skb->data_len = 0;
+ skb_set_tail_pointer(skb, len);
}
return 0;
}
+EXPORT_SYMBOL(___pskb_trim);
/**
* __pskb_pull_tail - advance tail of skb header
@@ -863,21 +1456,23 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
return NULL;
}
- if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta))
+ if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
BUG();
/* Optimization: no fragments, no reasons to preestimate
* size of pulled pages. Superb.
*/
- if (!skb_shinfo(skb)->frag_list)
+ if (!skb_has_frag_list(skb))
goto pull_pages;
/* Estimate size of pulled pages. */
eat = delta;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size >= eat)
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size >= eat)
goto pull_pages;
- eat -= skb_shinfo(skb)->frags[i].size;
+ eat -= size;
}
/* If we need update frag list, we are in troubles.
@@ -916,8 +1511,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
insp = list;
}
if (!pskb_pull(list, eat)) {
- if (clone)
- kfree_skb(clone);
+ kfree_skb(clone);
return NULL;
}
break;
@@ -941,14 +1535,16 @@ pull_pages:
eat = delta;
k = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
} else {
skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
if (eat) {
skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
+ skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
eat = 0;
}
k++;
@@ -959,15 +1555,30 @@ pull_pages:
skb->tail += delta;
skb->data_len -= delta;
- return skb->tail;
+ return skb_tail_pointer(skb);
}
+EXPORT_SYMBOL(__pskb_pull_tail);
-/* Copy some data bits from skb to kernel buffer. */
-
+/**
+ * skb_copy_bits - copy bits from skb to kernel buffer
+ * @skb: source skb
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source skb to the
+ * destination buffer.
+ *
+ * CAUTION ! :
+ * If its prototype is ever changed,
+ * check arch/{*}/net/{*}.S files,
+ * since it is called from BPF assembly code.
+ */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
- int i, copy;
int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
if (offset > (int)skb->len - len)
goto fault;
@@ -976,7 +1587,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
if ((copy = start - offset) > 0) {
if (copy > len)
copy = len;
- memcpy(to, skb->data + offset, copy);
+ skb_copy_from_linear_data_offset(skb, offset, to, copy);
if ((len -= copy) == 0)
return 0;
offset += copy;
@@ -985,21 +1596,22 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ vaddr = kmap_atomic(skb_frag_page(f));
memcpy(to,
- vaddr + skb_shinfo(skb)->frags[i].page_offset+
- offset - start, copy);
- kunmap_skb_frag(vaddr);
+ vaddr + f->page_offset + offset - start,
+ copy);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
@@ -1009,35 +1621,235 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_copy_bits(list, offset - start,
- to, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- to += copy;
- }
- start = end;
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_bits(frag_iter, offset - start, to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
}
+ start = end;
}
+
if (!len)
return 0;
fault:
return -EFAULT;
}
+EXPORT_SYMBOL(skb_copy_bits);
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sock *sk)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return NULL;
+
+ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
+
+ memcpy(page_address(pfrag->page) + pfrag->offset,
+ page_address(page) + *offset, *len);
+ *offset = pfrag->offset;
+ pfrag->offset += *len;
+
+ return pfrag->page;
+}
+
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
+}
+
+/*
+ * Fill page/offset/length into spd, if it can hold more pages.
+ */
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
+ unsigned int *len, unsigned int offset,
+ bool linear,
+ struct sock *sk)
+{
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
+
+ if (linear) {
+ page = linear_to_page(page, len, &offset, sk);
+ if (!page)
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
+ spd->pages[spd->nr_pages] = page;
+ spd->partial[spd->nr_pages].len = *len;
+ spd->partial[spd->nr_pages].offset = offset;
+ spd->nr_pages++;
+
+ return false;
+}
+
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
+{
+ if (!*len)
+ return true;
+
+ /* skip this segment if already processed */
+ if (*off >= plen) {
+ *off -= plen;
+ return false;
+ }
+
+ /* ignore any bits we already processed */
+ poff += *off;
+ plen -= *off;
+ *off = 0;
+
+ do {
+ unsigned int flen = min(*len, plen);
+
+ if (spd_fill_page(spd, pipe, page, &flen, poff,
+ linear, sk))
+ return true;
+ poff += flen;
+ plen -= flen;
+ *len -= flen;
+ } while (*len && plen);
+
+ return false;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. It reports true if the
+ * pipe is full or if we already spliced the requested length.
+ */
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
+{
+ int seg;
+
+ /* map the linear part :
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
+ */
+ if (__splice_segment(virt_to_page(skb->data),
+ (unsigned long) skb->data & (PAGE_SIZE - 1),
+ skb_headlen(skb),
+ offset, len, spd,
+ skb_head_is_locked(skb),
+ sk, pipe))
+ return true;
+
+ /*
+ * then map the fragments
+ */
+ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+ if (__splice_segment(skb_frag_page(f),
+ f->page_offset, skb_frag_size(f),
+ offset, len, spd, false, sk, pipe))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list. It does NOT handle frag lists within
+ * the frag list, if such a thing exists. We'd probably need to recurse to
+ * handle that cleanly.
+ */
+int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+ struct pipe_inode_info *pipe, unsigned int tlen,
+ unsigned int flags)
+{
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages_max = MAX_SKB_FRAGS,
+ .flags = flags,
+ .ops = &nosteal_pipe_buf_ops,
+ .spd_release = sock_spd_release,
+ };
+ struct sk_buff *frag_iter;
+ struct sock *sk = skb->sk;
+ int ret = 0;
+
+ /*
+ * __skb_splice_bits() only fails if the output has no room left,
+ * so no point in going over the frag_list for the error case.
+ */
+ if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
+ goto done;
+ else if (!tlen)
+ goto done;
+
+ /*
+ * now see if we have a frag_list to map
+ */
+ skb_walk_frags(skb, frag_iter) {
+ if (!tlen)
+ break;
+ if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
+ break;
+ }
+
+done:
+ if (spd.nr_pages) {
+ /*
+ * Drop the socket lock, otherwise we have reverse
+ * locking dependencies between sk_lock and i_mutex
+ * here as compared to sendfile(). We enter here
+ * with the socket lock held, and splice_to_pipe() will
+ * grab the pipe inode lock. For sendfile() emulation,
+ * we call into ->sendpage() with the i_mutex lock held
+ * and networking will grab the socket lock.
+ */
+ release_sock(sk);
+ ret = splice_to_pipe(pipe, &spd);
+ lock_sock(sk);
+ }
+
+ return ret;
+}
/**
* skb_store_bits - store bits from kernel buffer to skb
@@ -1051,10 +1863,11 @@ fault:
* traversing fragment lists and such.
*/
-int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
+int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
- int i, copy;
int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
if (offset > (int)skb->len - len)
goto fault;
@@ -1062,7 +1875,7 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
if ((copy = start - offset) > 0) {
if (copy > len)
copy = len;
- memcpy(skb->data + offset, from, copy);
+ skb_copy_to_linear_data_offset(skb, offset, from, copy);
if ((len -= copy) == 0)
return 0;
offset += copy;
@@ -1073,19 +1886,19 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + frag->size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
memcpy(vaddr + frag->page_offset + offset - start,
from, copy);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
@@ -1095,28 +1908,24 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_store_bits(list, offset - start,
- from, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- from += copy;
- }
- start = end;
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_store_bits(frag_iter, offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
}
+ start = end;
}
if (!len)
return 0;
@@ -1124,23 +1933,22 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
fault:
return -EFAULT;
}
-
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-
-unsigned int skb_checksum(const struct sk_buff *skb, int offset,
- int len, unsigned int csum)
+__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
+ __wsum csum, const struct skb_checksum_ops *ops)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
+ struct sk_buff *frag_iter;
int pos = 0;
/* Checksum header. */
if (copy > 0) {
if (copy > len)
copy = len;
- csum = csum_partial(skb->data + offset, copy, csum);
+ csum = ops->update(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -1149,22 +1957,22 @@ unsigned int skb_checksum(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- unsigned int csum2;
+ __wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
- csum2 = csum_partial(vaddr + frag->page_offset +
- offset - start, copy, 0);
- kunmap_skb_frag(vaddr);
- csum = csum_block_add(csum, csum2, pos);
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ csum2 = ops->update(vaddr + frag->page_offset +
+ offset - start, copy, 0);
+ kunmap_atomic(vaddr);
+ csum = ops->combine(csum, csum2, pos, copy);
if (!(len -= copy))
return csum;
offset += copy;
@@ -1173,42 +1981,52 @@ unsigned int skb_checksum(const struct sk_buff *skb, int offset,
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ skb_walk_frags(skb, frag_iter) {
+ int end;
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- unsigned int csum2;
- if (copy > len)
- copy = len;
- csum2 = skb_checksum(list, offset - start,
- copy, 0);
- csum = csum_block_add(csum, csum2, pos);
- if ((len -= copy) == 0)
- return csum;
- offset += copy;
- pos += copy;
- }
- start = end;
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ if (copy > len)
+ copy = len;
+ csum2 = __skb_checksum(frag_iter, offset - start,
+ copy, 0, ops);
+ csum = ops->combine(csum, csum2, pos, copy);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos += copy;
}
+ start = end;
}
BUG_ON(len);
return csum;
}
+EXPORT_SYMBOL(__skb_checksum);
+
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+ int len, __wsum csum)
+{
+ const struct skb_checksum_ops ops = {
+ .update = csum_partial_ext,
+ .combine = csum_block_add_ext,
+ };
+
+ return __skb_checksum(skb, offset, len, csum, &ops);
+}
+EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
-unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
- u8 *to, int len, unsigned int csum)
+__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
+ u8 *to, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
+ struct sk_buff *frag_iter;
int pos = 0;
/* Copy header. */
@@ -1227,22 +2045,22 @@ unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
- BUG_TRAP(start <= offset + len);
+ WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
- unsigned int csum2;
+ __wsum csum2;
u8 *vaddr;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
csum2 = csum_partial_copy_nocheck(vaddr +
frag->page_offset +
offset - start, to,
copy, 0);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
if (!(len -= copy))
return csum;
@@ -1253,61 +2071,157 @@ unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
start = end;
}
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ skb_walk_frags(skb, frag_iter) {
+ __wsum csum2;
+ int end;
- for (; list; list = list->next) {
- unsigned int csum2;
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- csum2 = skb_copy_and_csum_bits(list,
- offset - start,
- to, copy, 0);
- csum = csum_block_add(csum, csum2, pos);
- if ((len -= copy) == 0)
- return csum;
- offset += copy;
- to += copy;
- pos += copy;
- }
- start = end;
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ csum2 = skb_copy_and_csum_bits(frag_iter,
+ offset - start,
+ to, copy, 0);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos += copy;
}
+ start = end;
}
BUG_ON(len);
return csum;
}
+EXPORT_SYMBOL(skb_copy_and_csum_bits);
+
+ /**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(from);
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @from: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to `len` bytes from `from` to `to` by creating references
+ * to the frags in the source buffer.
+ *
+ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
+ * headroom in the `to` buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
+ */
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ int ret;
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
+
+ if (hlen) {
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
- unsigned int csum;
+ __wsum csum;
long csstart;
- if (skb->ip_summed == CHECKSUM_HW)
- csstart = skb->h.raw - skb->data;
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ csstart = skb_checksum_start_offset(skb);
else
csstart = skb_headlen(skb);
BUG_ON(csstart > skb_headlen(skb));
- memcpy(to, skb->data, csstart);
+ skb_copy_from_linear_data(skb, to, csstart);
csum = 0;
if (csstart != skb->len)
csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
skb->len - csstart, 0);
- if (skb->ip_summed == CHECKSUM_HW) {
- long csstuff = csstart + skb->csum;
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ long csstuff = csstart + skb->csum_offset;
- *((unsigned short *)(to + csstuff)) = csum_fold(csum);
+ *((__sum16 *)(to + csstuff)) = csum_fold(csum);
}
}
+EXPORT_SYMBOL(skb_copy_and_csum_dev);
/**
* skb_dequeue - remove from the head of the queue
@@ -1328,6 +2242,7 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list)
spin_unlock_irqrestore(&list->lock, flags);
return result;
}
+EXPORT_SYMBOL(skb_dequeue);
/**
* skb_dequeue_tail - remove from the tail of the queue
@@ -1347,6 +2262,7 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
spin_unlock_irqrestore(&list->lock, flags);
return result;
}
+EXPORT_SYMBOL(skb_dequeue_tail);
/**
* skb_queue_purge - empty a list
@@ -1362,6 +2278,7 @@ void skb_queue_purge(struct sk_buff_head *list)
while ((skb = skb_dequeue(list)) != NULL)
kfree_skb(skb);
}
+EXPORT_SYMBOL(skb_queue_purge);
/**
* skb_queue_head - queue a buffer at the list head
@@ -1382,6 +2299,7 @@ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
__skb_queue_head(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
+EXPORT_SYMBOL(skb_queue_head);
/**
* skb_queue_tail - queue a buffer at the list tail
@@ -1402,6 +2320,7 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
__skb_queue_tail(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
+EXPORT_SYMBOL(skb_queue_tail);
/**
* skb_unlink - remove a buffer from a list
@@ -1421,6 +2340,7 @@ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
__skb_unlink(skb, list);
spin_unlock_irqrestore(&list->lock, flags);
}
+EXPORT_SYMBOL(skb_unlink);
/**
* skb_append - append a buffer
@@ -1437,10 +2357,10 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head
unsigned long flags;
spin_lock_irqsave(&list->lock, flags);
- __skb_append(old, newsk, list);
+ __skb_queue_after(list, old, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
-
+EXPORT_SYMBOL(skb_append);
/**
* skb_insert - insert a buffer
@@ -1462,19 +2382,7 @@ void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head
__skb_insert(newsk, old->prev, old, list);
spin_unlock_irqrestore(&list->lock, flags);
}
-
-#if 0
-/*
- * Tune the memory allocator for a new MTU size.
- */
-void skb_add_mtu(int mtu)
-{
- /* Must match allocation in alloc_skb */
- mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
-
- kmem_add_cache_size(mtu);
-}
-#endif
+EXPORT_SYMBOL(skb_insert);
static inline void skb_split_inside_header(struct sk_buff *skb,
struct sk_buff* skb1,
@@ -1482,8 +2390,8 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
{
int i;
- memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len);
-
+ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
+ pos - len);
/* And move data appendix as is. */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
@@ -1494,7 +2402,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb1->len += skb1->data_len;
skb->data_len = 0;
skb->len = len;
- skb->tail = skb->data + len;
+ skb_set_tail_pointer(skb, len);
}
static inline void skb_split_no_header(struct sk_buff *skb,
@@ -1510,7 +2418,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
skb->data_len = len - pos;
for (i = 0; i < nfrags; i++) {
- int size = skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + size > len) {
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -1524,10 +2432,10 @@ static inline void skb_split_no_header(struct sk_buff *skb,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
- get_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_ref(skb, i);
skb_shinfo(skb1)->frags[0].page_offset += len - pos;
- skb_shinfo(skb1)->frags[0].size -= len - pos;
- skb_shinfo(skb)->frags[i].size = len - pos;
+ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
}
k++;
@@ -1548,11 +2456,156 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
+ skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
skb_split_no_header(skb, skb1, len, pos);
}
+EXPORT_SYMBOL(skb_split);
+
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * Caller cannot keep skb_shinfo related pointers past calling here!
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+ return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from skb to tgt. Returns number bytes shifted.
+ * It's up to caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+ int from, to, merge, todo;
+ struct skb_frag_struct *fragfrom, *fragto;
+
+ BUG_ON(shiftlen > skb->len);
+ BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ todo = shiftlen;
+ from = 0;
+ to = skb_shinfo(tgt)->nr_frags;
+ fragfrom = &skb_shinfo(skb)->frags[from];
+
+ /* Actual merge is delayed until the point when we know we can
+ * commit all, so that we don't have to undo partial changes
+ */
+ if (!to ||
+ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ fragfrom->page_offset)) {
+ merge = -1;
+ } else {
+ merge = to - 1;
+
+ todo -= skb_frag_size(fragfrom);
+ if (todo < 0) {
+ if (skb_prepare_for_shift(skb) ||
+ skb_prepare_for_shift(tgt))
+ return 0;
+
+ /* All previous frag pointers might be stale! */
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, shiftlen);
+ skb_frag_size_sub(fragfrom, shiftlen);
+ fragfrom->page_offset += shiftlen;
+
+ goto onlymerged;
+ }
+
+ from++;
+ }
+
+ /* Skip full, not-fitting skb to avoid expensive operations */
+ if ((shiftlen == skb->len) &&
+ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+ return 0;
+
+ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+ return 0;
+
+ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+ if (to == MAX_SKB_FRAGS)
+ return 0;
+
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[to];
+
+ if (todo >= skb_frag_size(fragfrom)) {
+ *fragto = *fragfrom;
+ todo -= skb_frag_size(fragfrom);
+ from++;
+ to++;
+
+ } else {
+ __skb_frag_ref(fragfrom);
+ fragto->page = fragfrom->page;
+ fragto->page_offset = fragfrom->page_offset;
+ skb_frag_size_set(fragto, todo);
+
+ fragfrom->page_offset += todo;
+ skb_frag_size_sub(fragfrom, todo);
+ todo = 0;
+
+ to++;
+ break;
+ }
+ }
+
+ /* Ready to "commit" this state change to tgt */
+ skb_shinfo(tgt)->nr_frags = to;
+
+ if (merge >= 0) {
+ fragfrom = &skb_shinfo(skb)->frags[0];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, skb_frag_size(fragfrom));
+ __skb_frag_unref(fragfrom);
+ }
+
+ /* Reposition in the original skb */
+ to = 0;
+ while (from < skb_shinfo(skb)->nr_frags)
+ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+ skb_shinfo(skb)->nr_frags = to;
+
+ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+ /* Most likely the tgt won't ever need its checksum anymore, skb on
+ * the other hand might need it if it needs to be resent
+ */
+ tgt->ip_summed = CHECKSUM_PARTIAL;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ /* Yak, is it really working this way? Some helper please? */
+ skb->len -= shiftlen;
+ skb->data_len -= shiftlen;
+ skb->truesize -= shiftlen;
+ tgt->len += shiftlen;
+ tgt->data_len += shiftlen;
+ tgt->truesize += shiftlen;
+
+ return shiftlen;
+}
/**
* skb_prepare_seq_read - Prepare a sequential read of skb data
@@ -1573,6 +2626,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
}
+EXPORT_SYMBOL(skb_prepare_seq_read);
/**
* skb_seq_read - Sequentially read skb data
@@ -1580,22 +2634,22 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
* @data: destination pointer for data to be returned
* @st: state variable
*
- * Reads a block of skb data at &consumed relative to the
+ * Reads a block of skb data at @consumed relative to the
* lower offset specified to skb_prepare_seq_read(). Assigns
- * the head of the data block to &data and returns the length
+ * the head of the data block to @data and returns the length
* of the block or 0 if the end of the skb data or the upper
* offset has been reached.
*
* The caller is not required to consume all of the data
- * returned, i.e. &consumed is typically set to the number
+ * returned, i.e. @consumed is typically set to the number
* of bytes already consumed and the next call to
* skb_seq_read() will return the remaining part of the block.
*
- * Note: The size of each block of data returned can be arbitary,
+ * Note 1: The size of each block of data returned can be arbitrary,
* this limitation is the cost for zerocopy seqeuental
* reads of potentially non linear data.
*
- * Note: Fragment lists within fragments are not implemented
+ * Note 2: Fragment lists within fragments are not implemented
* at the moment, state->root_skb could be replaced with
* a stack for this purpose.
*/
@@ -1605,14 +2659,19 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
unsigned int block_limit, abs_offset = consumed + st->lower_offset;
skb_frag_t *frag;
- if (unlikely(abs_offset >= st->upper_offset))
+ if (unlikely(abs_offset >= st->upper_offset)) {
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
return 0;
+ }
next_skb:
- block_limit = skb_headlen(st->cur_skb);
+ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
- if (abs_offset < block_limit) {
- *data = st->cur_skb->data + abs_offset;
+ if (abs_offset < block_limit && !st->frag_data) {
+ *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
return block_limit - abs_offset;
}
@@ -1621,11 +2680,11 @@ next_skb:
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = frag->size + st->stepped_offset;
+ block_limit = skb_frag_size(frag) + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_skb_frag(frag);
+ st->frag_data = kmap_atomic(skb_frag_page(frag));
*data = (u8 *) st->frag_data + frag->page_offset +
(abs_offset - st->stepped_offset);
@@ -1634,26 +2693,32 @@ next_skb:
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
st->frag_idx++;
- st->stepped_offset += frag->size;
+ st->stepped_offset += skb_frag_size(frag);
}
- if (st->cur_skb->next) {
- st->cur_skb = st->cur_skb->next;
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
+ st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
st->frag_idx = 0;
goto next_skb;
- } else if (st->root_skb == st->cur_skb &&
- skb_shinfo(st->root_skb)->frag_list) {
- st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+ } else if (st->cur_skb->next) {
+ st->cur_skb = st->cur_skb->next;
+ st->frag_idx = 0;
goto next_skb;
}
return 0;
}
+EXPORT_SYMBOL(skb_seq_read);
/**
* skb_abort_seq_read - Abort a sequential read of skb data
@@ -1665,8 +2730,9 @@ next_skb:
void skb_abort_seq_read(struct skb_seq_state *st)
{
if (st->frag_data)
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
}
+EXPORT_SYMBOL(skb_abort_seq_read);
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
@@ -1699,16 +2765,20 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
unsigned int to, struct ts_config *config,
struct ts_state *state)
{
+ unsigned int ret;
+
config->get_next_block = skb_ts_get_next_block;
config->finish = skb_ts_finish;
skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
- return textsearch_find(config, state);
+ ret = textsearch_find(config, state);
+ return (ret <= to - from ? ret : UINT_MAX);
}
+EXPORT_SYMBOL(skb_find_text);
/**
- * skb_append_datato_frags: - append the user data to a skb
+ * skb_append_datato_frags - append the user data to a skb
* @sk: sock structure
* @skb: skb structure to be appened with user data.
* @getfrag: call back function to be used for getting the user data
@@ -1723,52 +2793,37 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
int len, int odd, struct sk_buff *skb),
void *from, int length)
{
- int frg_cnt = 0;
- skb_frag_t *frag = NULL;
- struct page *page = NULL;
- int copy, left;
+ int frg_cnt = skb_shinfo(skb)->nr_frags;
+ int copy;
int offset = 0;
int ret;
+ struct page_frag *pfrag = &current->task_frag;
do {
/* Return error if we don't have space for new frag */
- frg_cnt = skb_shinfo(skb)->nr_frags;
if (frg_cnt >= MAX_SKB_FRAGS)
- return -EFAULT;
+ return -EMSGSIZE;
- /* allocate a new page for next frag */
- page = alloc_pages(sk->sk_allocation, 0);
-
- /* If alloc_page fails just return failure and caller will
- * free previous allocated pages by doing kfree_skb()
- */
- if (page == NULL)
+ if (!sk_page_frag_refill(sk, pfrag))
return -ENOMEM;
- /* initialize the next frag */
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
- skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
- skb->truesize += PAGE_SIZE;
- atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
-
- /* get the new initialized frag */
- frg_cnt = skb_shinfo(skb)->nr_frags;
- frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
-
/* copy the user data to page */
- left = PAGE_SIZE - frag->page_offset;
- copy = (length > left)? left : length;
+ copy = min_t(int, length, pfrag->size - pfrag->offset);
- ret = getfrag(from, (page_address(frag->page) +
- frag->page_offset + frag->size),
- offset, copy, 0, skb);
+ ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
+ offset, copy, 0, skb);
if (ret < 0)
return -EFAULT;
/* copy was successful so update the size parameters */
- sk->sk_sndmsg_off += copy;
- frag->size += copy;
+ skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
+ copy);
+ frg_cnt++;
+ pfrag->offset += copy;
+ get_page(pfrag->page);
+
+ skb->truesize += copy;
+ atomic_add(copy, &sk->sk_wmem_alloc);
skb->len += copy;
skb->data_len += copy;
offset += copy;
@@ -1778,56 +2833,1129 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
return 0;
}
+EXPORT_SYMBOL(skb_append_datato_frags);
+
+/**
+ * skb_pull_rcsum - pull skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pulled
+ *
+ * This function performs an skb_pull on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_pull unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ */
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+ BUG_ON(len > skb->len);
+ skb->len -= len;
+ BUG_ON(skb->len < skb->data_len);
+ skb_postpull_rcsum(skb, skb->data, len);
+ return skb->data += len;
+}
+EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+
+/**
+ * skb_segment - Perform protocol segmentation on skb.
+ * @head_skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function performs segmentation on the given skb. It returns
+ * a pointer to the first in a list of new skbs for the segments.
+ * In case of error it returns ERR_PTR(err).
+ */
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = NULL;
+ struct sk_buff *tail = NULL;
+ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+ skb_frag_t *frag = skb_shinfo(head_skb)->frags;
+ unsigned int mss = skb_shinfo(head_skb)->gso_size;
+ unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
+ struct sk_buff *frag_skb = head_skb;
+ unsigned int offset = doffset;
+ unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int headroom;
+ unsigned int len;
+ __be16 proto;
+ bool csum;
+ int sg = !!(features & NETIF_F_SG);
+ int nfrags = skb_shinfo(head_skb)->nr_frags;
+ int err = -ENOMEM;
+ int i = 0;
+ int pos;
+ int dummy;
+
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, &dummy);
+ if (unlikely(!proto))
+ return ERR_PTR(-EINVAL);
+
+ csum = !head_skb->encap_hdr_csum &&
+ !!can_checksum_protocol(features, proto);
+
+ headroom = skb_headroom(head_skb);
+ pos = skb_headlen(head_skb);
+
+ do {
+ struct sk_buff *nskb;
+ skb_frag_t *nskb_frag;
+ int hsize;
+ int size;
+
+ len = head_skb->len - offset;
+ if (len > mss)
+ len = mss;
+
+ hsize = skb_headlen(head_skb) - offset;
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
+ if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ (skb_headlen(list_skb) == len || sg)) {
+ BUG_ON(skb_headlen(list_skb) > len);
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ pos += skb_headlen(list_skb);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ size = skb_frag_size(frag);
+ if (pos + size > offset + len)
+ break;
+
+ i++;
+ pos += size;
+ frag++;
+ }
+
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ list_skb = list_skb->next;
+
+ if (unlikely(!nskb))
+ goto err;
+
+ if (unlikely(pskb_trim(nskb, len))) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ hsize = skb_end_offset(nskb);
+ if (skb_cow_head(nskb, doffset + headroom)) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ nskb->truesize += skb_end_offset(nskb) - hsize;
+ skb_release_head_state(nskb);
+ __skb_push(nskb, doffset);
+ } else {
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
+ NUMA_NO_NODE);
+
+ if (unlikely(!nskb))
+ goto err;
+
+ skb_reserve(nskb, headroom);
+ __skb_put(nskb, doffset);
+ }
+
+ if (segs)
+ tail->next = nskb;
+ else
+ segs = nskb;
+ tail = nskb;
+
+ __copy_skb_header(nskb, head_skb);
+ nskb->mac_len = head_skb->mac_len;
+
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
+
+ if (nskb->len == len + doffset)
+ goto perform_csum_check;
+
+ if (!sg) {
+ nskb->ip_summed = CHECKSUM_NONE;
+ nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ continue;
+ }
+
+ nskb_frag = skb_shinfo(nskb)->frags;
+
+ skb_copy_from_linear_data_offset(head_skb, offset,
+ skb_put(nskb, hsize), hsize);
+
+ skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
+
+ while (pos < offset + len) {
+ if (i >= nfrags) {
+ BUG_ON(skb_headlen(list_skb));
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+
+ BUG_ON(!nfrags);
+
+ list_skb = list_skb->next;
+ }
+
+ if (unlikely(skb_shinfo(nskb)->nr_frags >=
+ MAX_SKB_FRAGS)) {
+ net_warn_ratelimited(
+ "skb_segment: too many frags: %u %u\n",
+ pos, mss);
+ goto err;
+ }
+
+ if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
+ goto err;
+
+ *nskb_frag = *frag;
+ __skb_frag_ref(nskb_frag);
+ size = skb_frag_size(nskb_frag);
+
+ if (pos < offset) {
+ nskb_frag->page_offset += offset - pos;
+ skb_frag_size_sub(nskb_frag, offset - pos);
+ }
+
+ skb_shinfo(nskb)->nr_frags++;
+
+ if (pos + size <= offset + len) {
+ i++;
+ frag++;
+ pos += size;
+ } else {
+ skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
+ goto skip_fraglist;
+ }
+
+ nskb_frag++;
+ }
+
+skip_fraglist:
+ nskb->data_len = len - hsize;
+ nskb->len += nskb->data_len;
+ nskb->truesize += nskb->data_len;
+
+perform_csum_check:
+ if (!csum) {
+ nskb->csum = skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ }
+ } while ((offset += len) < head_skb->len);
+
+ return segs;
+
+err:
+ kfree_skb_list(segs);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(skb_segment);
+
+int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ struct sk_buff *nskb, *lp, *p = *head;
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ unsigned int headroom;
+
+ if (unlikely(p->len + len >= 65536))
+ return -E2BIG;
+
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ frag->page_offset += offset;
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ delta_truesize = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
+ skb->truesize -= skb->data_len;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ frag->page.p = page;
+ frag->page_offset = first_offset;
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+ if (pinfo->frag_list)
+ goto merge;
+ if (skb_gro_len(p) != pinfo->gso_size)
+ return -E2BIG;
+
+ headroom = skb_headroom(p);
+ nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
+ if (unlikely(!nskb))
+ return -ENOMEM;
+
+ __copy_skb_header(nskb, p);
+ nskb->mac_len = p->mac_len;
+
+ skb_reserve(nskb, headroom);
+ __skb_put(nskb, skb_gro_offset(p));
+
+ skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
+ skb_set_network_header(nskb, skb_network_offset(p));
+ skb_set_transport_header(nskb, skb_transport_offset(p));
+
+ __skb_pull(p, skb_gro_offset(p));
+ memcpy(skb_mac_header(nskb), skb_mac_header(p),
+ p->data - skb_mac_header(p));
+
+ skb_shinfo(nskb)->frag_list = p;
+ skb_shinfo(nskb)->gso_size = pinfo->gso_size;
+ pinfo->gso_size = 0;
+ skb_header_release(p);
+ NAPI_GRO_CB(nskb)->last = p;
+
+ nskb->data_len += p->len;
+ nskb->truesize += p->truesize;
+ nskb->len += p->len;
+
+ *head = nskb;
+ nskb->next = p->next;
+ p->next = NULL;
+
+ p = nskb;
+
+merge:
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skbinfo->frags[0].page_offset += eat;
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_gro_receive);
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
sizeof(struct sk_buff),
0,
- SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!skbuff_head_cache)
- panic("cannot create skbuff cache");
-
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
(2*sizeof(struct sk_buff)) +
sizeof(atomic_t),
0,
- SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!skbuff_fclone_cache)
- panic("cannot create skbuff cache");
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
}
-EXPORT_SYMBOL(___pskb_trim);
-EXPORT_SYMBOL(__kfree_skb);
-EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(__alloc_skb);
-EXPORT_SYMBOL(pskb_copy);
-EXPORT_SYMBOL(pskb_expand_head);
-EXPORT_SYMBOL(skb_checksum);
-EXPORT_SYMBOL(skb_clone);
-EXPORT_SYMBOL(skb_clone_fraglist);
-EXPORT_SYMBOL(skb_copy);
-EXPORT_SYMBOL(skb_copy_and_csum_bits);
-EXPORT_SYMBOL(skb_copy_and_csum_dev);
-EXPORT_SYMBOL(skb_copy_bits);
-EXPORT_SYMBOL(skb_copy_expand);
-EXPORT_SYMBOL(skb_over_panic);
-EXPORT_SYMBOL(skb_pad);
-EXPORT_SYMBOL(skb_realloc_headroom);
-EXPORT_SYMBOL(skb_under_panic);
-EXPORT_SYMBOL(skb_dequeue);
-EXPORT_SYMBOL(skb_dequeue_tail);
-EXPORT_SYMBOL(skb_insert);
-EXPORT_SYMBOL(skb_queue_purge);
-EXPORT_SYMBOL(skb_queue_head);
-EXPORT_SYMBOL(skb_queue_tail);
-EXPORT_SYMBOL(skb_unlink);
-EXPORT_SYMBOL(skb_append);
-EXPORT_SYMBOL(skb_split);
-EXPORT_SYMBOL(skb_prepare_seq_read);
-EXPORT_SYMBOL(skb_seq_read);
-EXPORT_SYMBOL(skb_abort_seq_read);
-EXPORT_SYMBOL(skb_find_text);
-EXPORT_SYMBOL(skb_append_datato_frags);
+/**
+ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ * @skb: Socket buffer containing the buffers to be mapped
+ * @sg: The scatter-gather list to map into
+ * @offset: The offset into the buffer's contents to start mapping
+ * @len: Length of buffer space to be mapped
+ *
+ * Fill the specified scatter-gather list with mappings/pointers into a
+ * region of the buffer space attached to a socket buffer.
+ */
+static int
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int elt = 0;
+
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ sg_set_buf(sg, skb->data + offset, copy);
+ elt++;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (copy > len)
+ copy = len;
+ sg_set_page(&sg[elt], skb_frag_page(frag), copy,
+ frag->page_offset+offset-start);
+ elt++;
+ if (!(len -= copy))
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+ copy);
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+ return elt;
+}
+
+/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
+ * sglist without mark the sg which contain last skb data as the end.
+ * So the caller can mannipulate sg list as will when padding new data after
+ * the first call without calling sg_unmark_end to expend sg list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * is more preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
+
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int nsg = __skb_to_sgvec(skb, sg, offset, len);
+
+ sg_mark_end(&sg[nsg - 1]);
+
+ return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
+/**
+ * skb_cow_data - Check that a socket buffer's data buffers are writable
+ * @skb: The socket buffer to check.
+ * @tailbits: Amount of trailing space to be added
+ * @trailer: Returned pointer to the skb where the @tailbits space begins
+ *
+ * Make sure that the data buffers attached to a socket buffer are
+ * writable. If they are not, private copies are made of the data buffers
+ * and the socket buffer is set to use these instead.
+ *
+ * If @tailbits is given, make sure that there is space to write @tailbits
+ * bytes of data beyond current end of socket buffer. @trailer will be
+ * set to point to the skb in which this space begins.
+ *
+ * The number of scatterlist elements required to completely map the
+ * COW'd and extended socket buffer will be returned.
+ */
+int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
+{
+ int copyflag;
+ int elt;
+ struct sk_buff *skb1, **skb_p;
+
+ /* If skb is cloned or its head is paged, reallocate
+ * head pulling out all the pages (pages are considered not writable
+ * at the moment even if they are anonymous).
+ */
+ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
+ __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ /* Easy case. Most of packets will go this way. */
+ if (!skb_has_frag_list(skb)) {
+ /* A little of trouble, not enough of space for trailer.
+ * This should not happen, when stack is tuned to generate
+ * good frames. OK, on miss we reallocate and reserve even more
+ * space, 128 bytes is fair. */
+
+ if (skb_tailroom(skb) < tailbits &&
+ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Voila! */
+ *trailer = skb;
+ return 1;
+ }
+
+ /* Misery. We are in troubles, going to mincer fragments... */
+
+ elt = 1;
+ skb_p = &skb_shinfo(skb)->frag_list;
+ copyflag = 0;
+
+ while ((skb1 = *skb_p) != NULL) {
+ int ntail = 0;
+
+ /* The fragment is partially pulled by someone,
+ * this can happen on input. Copy it and everything
+ * after it. */
+
+ if (skb_shared(skb1))
+ copyflag = 1;
+
+ /* If the skb is the last, worry about trailer. */
+
+ if (skb1->next == NULL && tailbits) {
+ if (skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1) ||
+ skb_tailroom(skb1) < tailbits)
+ ntail = tailbits + 128;
+ }
+
+ if (copyflag ||
+ skb_cloned(skb1) ||
+ ntail ||
+ skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1)) {
+ struct sk_buff *skb2;
+
+ /* Fuck, we are miserable poor guys... */
+ if (ntail == 0)
+ skb2 = skb_copy(skb1, GFP_ATOMIC);
+ else
+ skb2 = skb_copy_expand(skb1,
+ skb_headroom(skb1),
+ ntail,
+ GFP_ATOMIC);
+ if (unlikely(skb2 == NULL))
+ return -ENOMEM;
+
+ if (skb1->sk)
+ skb_set_owner_w(skb2, skb1->sk);
+
+ /* Looking around. Are we still alive?
+ * OK, link new skb, drop old one */
+
+ skb2->next = skb1->next;
+ *skb_p = skb2;
+ kfree_skb(skb1);
+ skb1 = skb2;
+ }
+ elt++;
+ *trailer = skb1;
+ skb_p = &skb1->next;
+ }
+
+ return elt;
+}
+EXPORT_SYMBOL_GPL(skb_cow_data);
+
+static void sock_rmem_free(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+/*
+ * Note: We dont mem charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)sk->sk_rcvbuf)
+ return -ENOMEM;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_rmem_free;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+ /* before exiting rcu section, make sure dst is refcounted */
+ skb_dst_force(skb);
+
+ skb_queue_tail(&sk->sk_error_queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = orig_skb->sk;
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ int err;
+
+ if (!sk)
+ return;
+
+ if (hwtstamps) {
+ *skb_hwtstamps(orig_skb) =
+ *hwtstamps;
+ } else {
+ /*
+ * no hardware time stamps available,
+ * so keep the shared tx_flags and only
+ * store software time stamp
+ */
+ orig_skb->tstamp = ktime_get_real();
+ }
+
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+
+ err = sock_queue_err_skb(sk, skb);
+
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err;
+
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ err = sock_queue_err_skb(sk, skb);
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
+
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops.
+ *
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{
+ if (unlikely(start > skb_headlen(skb)) ||
+ unlikely((int)start + off > skb_headlen(skb) - 2)) {
+ net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
+ start, off, skb_headlen(skb));
+ return false;
+ }
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + start;
+ skb->csum_offset = off;
+ skb_set_transport_header(skb, start);
+ return true;
+}
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);
+
+static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
+ unsigned int max)
+{
+ if (skb_headlen(skb) >= len)
+ return 0;
+
+ /* If we need to pullup then pullup to the max, so we
+ * won't need to do it again.
+ */
+ if (max > skb->len)
+ max = skb->len;
+
+ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ if (skb_headlen(skb) < len)
+ return -EPROTO;
+
+ return 0;
+}
+
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ switch (proto) {
+ int err;
+
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * maximally sized IP and TCP or UDP headers.
+ */
+#define MAX_IP_HDR_LEN 128
+
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
+{
+ unsigned int off;
+ bool fragment;
+ __sum16 *csum;
+ int err;
+
+ fragment = false;
+
+ err = skb_maybe_pull_tail(skb,
+ sizeof(struct iphdr),
+ MAX_IP_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
+ fragment = true;
+
+ off = ip_hdrlen(skb);
+
+ err = -EPROTO;
+
+ if (fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * an IPv6 header, all options, and a maximal TCP or UDP header.
+ */
+#define MAX_IPV6_HDR_LEN 256
+
+#define OPT_HDR(type, skb, off) \
+ (type *)(skb_network_header(skb) + (off))
+
+static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+ u8 nexthdr;
+ unsigned int off;
+ unsigned int len;
+ bool fragment;
+ bool done;
+ __sum16 *csum;
+
+ fragment = false;
+ done = false;
+
+ off = sizeof(struct ipv6hdr);
+
+ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ while (off <= len && !done) {
+ switch (nexthdr) {
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING: {
+ struct ipv6_opt_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ipv6_opt_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_optlen(hp);
+ break;
+ }
+ case IPPROTO_AH: {
+ struct ip_auth_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ip_auth_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ip_auth_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_authlen(hp);
+ break;
+ }
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct frag_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct frag_hdr, skb, off);
+
+ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
+ fragment = true;
+
+ nexthdr = hp->nexthdr;
+ off += sizeof(struct frag_hdr);
+ break;
+ }
+ default:
+ done = true;
+ break;
+ }
+ }
+
+ err = -EPROTO;
+
+ if (!done || fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/**
+ * skb_checksum_setup - set up partial checksum offset
+ * @skb: the skb to set up
+ * @recalculate: if true the pseudo-header checksum will be recalculated
+ */
+int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ err = skb_checksum_setup_ipv4(skb, recalculate);
+ break;
+
+ case htons(ETH_P_IPV6):
+ err = skb_checksum_setup_ipv6(skb, recalculate);
+ break;
+
+ default:
+ err = -EPROTO;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
+
+void __skb_warn_lro_forwarding(const struct sk_buff *skb)
+{
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
+}
+EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen) {
+ skb_release_head_state(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+ } else {
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(kfree_skb_partial);
+
+/**
+ * skb_try_coalesce - try to merge skb to prior one
+ * @to: prior buffer
+ * @from: buffer to add
+ * @fragstolen: pointer to boolean
+ * @delta_truesize: how much more was allocated than was requested
+ */
+bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
+ bool *fragstolen, int *delta_truesize)
+{
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (skb_cloned(to))
+ return false;
+
+ if (len <= skb_tailroom(to)) {
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ *delta_truesize = 0;
+ return true;
+ }
+
+ if (skb_has_frag_list(to) || skb_has_frag_list(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+ skb_shinfo(from)->frags,
+ skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+ if (!skb_cloned(from))
+ skb_shinfo(from)->nr_frags = 0;
+
+ /* if the skb is not cloned this does nothing
+ * since we set nr_frags to 0.
+ */
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+ skb_frag_ref(from, i);
+
+ to->truesize += delta;
+ to->len += len;
+ to->data_len += len;
+
+ *delta_truesize = delta;
+ return true;
+}
+EXPORT_SYMBOL(skb_try_coalesce);
+
+/**
+ * skb_scrub_packet - scrub an skb
+ *
+ * @skb: buffer to clean
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * into/from a tunnel. Some information have to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean a skb before injecting it in
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
+ */
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ if (xnet)
+ skb_orphan(skb);
+ skb->tstamp.tv64 = 0;
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ skb->mark = 0;
+ secpath_reset(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+}
+EXPORT_SYMBOL_GPL(skb_scrub_packet);
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ return tcp_hdrlen(skb) + shinfo->gso_size;
+
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return shinfo->gso_size;
+}
+EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e00811d44b..026e01f7027 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -7,8 +7,6 @@
* handler for protocols to use and generic option handler.
*
*
- * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
@@ -34,7 +32,7 @@
* Alan Cox : TCP ack handling is buggy, the DESTROY timer
* was buggy. Put a remove_sock() in the handler
* for memory when we hit 0. Also altered the timer
- * code. The ACK stuff can wait and needs major
+ * code. The ACK stuff can wait and needs major
* TCP layer surgery.
* Alan Cox : Fixed TCP ack bug, removed remove sock
* and fixed timer/inet_bh race.
@@ -91,9 +89,11 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
-#include <linux/config.h>
#include <linux/errno.h>
+#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
@@ -112,42 +112,273 @@
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/user_namespace.h>
+#include <linux/static_key.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
+#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
+#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
#include <linux/filter.h>
+#include <trace/events/sock.h>
+
#ifdef CONFIG_INET
#include <net/tcp.h>
#endif
+#include <net/busy_poll.h>
+
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in the user
+ * namespace @user_ns.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capbility to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in all user
+ * namespaces.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socke was created
+ * and the current process has the capability @cap over the network namespace
+ * the socket is a member of.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
+
+
+#ifdef CONFIG_MEMCG_KMEM
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ struct proto *proto;
+ int ret = 0;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry(proto, &proto_list, node) {
+ if (proto->init_cgroup) {
+ ret = proto->init_cgroup(memcg, ss);
+ if (ret)
+ goto out;
+ }
+ }
+
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+out:
+ list_for_each_entry_continue_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
+{
+ struct proto *proto;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+}
+#endif
+
+/*
+ * Each address family might have different locking rules, so we have
+ * one slock key per address family:
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+
+#if defined(CONFIG_MEMCG_KMEM)
+struct static_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+#endif
+
+/*
+ * Make lock validator output more readable. (we pre-construct these
+ * strings build-time, so that runtime initialization of socket
+ * locks is fast):
+ */
+static const char *const af_family_key_strings[AF_MAX+1] = {
+ "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
+ "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
+ "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
+ "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
+ "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
+ "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
+ "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
+ "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
+ "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
+ "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
+ "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
+ "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
+ "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
+};
+static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+ "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
+ "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
+ "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
+ "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
+ "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
+ "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
+ "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
+ "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
+ "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
+ "slock-27" , "slock-28" , "slock-AF_CAN" ,
+ "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
+ "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
+ "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
+};
+static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+ "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
+ "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
+ "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
+ "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
+ "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
+ "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
+ "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
+ "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
+ "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
+ "clock-27" , "clock-28" , "clock-AF_CAN" ,
+ "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
+ "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
+ "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
+};
+
+/*
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+
/* Take into consideration the size of the struct sk_buff overhead in the
* determination of these values, since that is non-constant across
* platforms. This makes socket queueing behavior and performance
* not depend upon such differences.
*/
#define _SK_MEM_PACKETS 256
-#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
+#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max = SK_WMEM_MAX;
-__u32 sysctl_rmem_max = SK_RMEM_MAX;
-__u32 sysctl_wmem_default = SK_WMEM_MAX;
-__u32 sysctl_rmem_default = SK_RMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+
+/* Maximal space eaten by iovec or ancillary data plus some space */
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+EXPORT_SYMBOL(sysctl_optmem_max);
+
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_key_slow_inc(&memalloc_socks);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_key_slow_dec(&memalloc_socks);
+
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. However, if SOCK_MEMALLOC is cleared while
+ * it has rmem allocations there is a risk that the user of the
+ * socket cannot make forward progress due to exceeding the rmem
+ * limits. By rights, sk_clear_memalloc() should only be called
+ * on sockets being torn down but warn and reset the accounting if
+ * that assumption breaks.
+ */
+ if (WARN_ON(sk->sk_forward_alloc))
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned long pflags = current->flags;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ current->flags |= PF_MEMALLOC;
+ ret = sk->sk_backlog_rcv(sk, skb);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
-/* Maximal space eaten by iovec or ancilliary data plus some space */
-int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
@@ -157,7 +388,20 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
return -EINVAL;
if (copy_from_user(&tv, optval, sizeof(tv)))
return -EFAULT;
+ if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+ return -EDOM;
+
+ if (tv.tv_sec < 0) {
+ static int warned __read_mostly;
+ *timeo_p = 0;
+ if (warned < 10 && net_ratelimit()) {
+ warned++;
+ pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
+ __func__, current->comm, task_pid_nr(current));
+ }
+ return 0;
+ }
*timeo_p = MAX_SCHEDULE_TIMEOUT;
if (tv.tv_sec == 0 && tv.tv_usec == 0)
return 0;
@@ -170,22 +414,250 @@ static void sock_warn_obsolete_bsdism(const char *name)
{
static int warned;
static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- printk(KERN_WARNING "process `%s' is using obsolete "
- "%s SO_BSDCOMPAT\n", warncomm, name);
+ if (strcmp(warncomm, current->comm) && warned < 5) {
+ strcpy(warncomm, current->comm);
+ pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
+ warncomm, name);
warned++;
}
}
-static void sock_disable_timestamp(struct sock *sk)
-{
- if (sock_flag(sk, SOCK_TIMESTAMP)) {
- sock_reset_flag(sk, SOCK_TIMESTAMP);
- net_disable_timestamp();
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
+{
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+ net_disable_timestamp();
+ }
+}
+
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ int skb_len;
+ unsigned long flags;
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+ atomic_inc(&sk->sk_drops);
+ trace_sock_rcvqueue_full(sk, skb);
+ return -ENOMEM;
+ }
+
+ err = sk_filter(sk, skb);
+ if (err)
+ return err;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+ atomic_inc(&sk->sk_drops);
+ return -ENOBUFS;
+ }
+
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+
+ /* Cache the SKB length before we tack it onto the receive
+ * queue. Once it is added it no longer belongs to us and
+ * may be freed by other threads of control pulling packets
+ * from the queue.
+ */
+ skb_len = skb->len;
+
+ /* we escape from rcu protected region, make sure we dont leak
+ * a norefcounted dst
+ */
+ skb_dst_force(skb);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb->dropcount = atomic_read(&sk->sk_drops);
+ __skb_queue_tail(list, skb);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return 0;
+}
+EXPORT_SYMBOL(sock_queue_rcv_skb);
+
+int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+{
+ int rc = NET_RX_SUCCESS;
+
+ if (sk_filter(sk, skb))
+ goto discard_and_relse;
+
+ skb->dev = NULL;
+
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ atomic_inc(&sk->sk_drops);
+ goto discard_and_relse;
+ }
+ if (nested)
+ bh_lock_sock_nested(sk);
+ else
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ /*
+ * trylock + unlock semantics:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+ rc = sk_backlog_rcv(sk, skb);
+
+ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+ } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ bh_unlock_sock(sk);
+ atomic_inc(&sk->sk_drops);
+ goto discard_and_relse;
+ }
+
+ bh_unlock_sock(sk);
+out:
+ sock_put(sk);
+ return rc;
+discard_and_relse:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(sk_receive_skb);
+
+struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ sk_tx_queue_clear(sk);
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL(__sk_dst_check);
+
+struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ sk_dst_reset(sk);
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL(sk_dst_check);
+
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+ int index;
+
+ /* Sorry... */
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out;
+
+ ret = -EINVAL;
+ if (optlen < 0)
+ goto out;
+
+ /* Bind this socket to a particular device like "eth0",
+ * as specified in the passed interface name. If the
+ * name is "" or the option length is zero the socket
+ * is not bound.
+ */
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
+ memset(devname, 0, sizeof(devname));
+
+ ret = -EFAULT;
+ if (copy_from_user(devname, optval, optlen))
+ goto out;
+
+ index = 0;
+ if (devname[0] != '\0') {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, devname);
+ if (dev)
+ index = dev->ifindex;
+ rcu_read_unlock();
+ ret = -ENODEV;
+ if (!dev)
+ goto out;
+ }
+
+ lock_sock(sk);
+ sk->sk_bound_dev_if = index;
+ sk_dst_reset(sk);
+ release_sock(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+ int __user *optlen, int len)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+
+ if (sk->sk_bound_dev_if == 0) {
+ len = 0;
+ goto zero;
}
+
+ ret = -EINVAL;
+ if (len < IFNAMSIZ)
+ goto out;
+
+ ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ if (ret)
+ goto out;
+
+ len = strlen(devname) + 1;
+
+ ret = -EFAULT;
+ if (copy_to_user(optval, devname, len))
+ goto out;
+
+zero:
+ ret = -EFAULT;
+ if (put_user(len, optlen))
+ goto out;
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
}
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+ if (valbool)
+ sock_set_flag(sk, bit);
+ else
+ sock_reset_flag(sk, bit);
+}
/*
* This is meant for all protocols to use and covers goings on
@@ -193,482 +665,677 @@ static void sock_disable_timestamp(struct sock *sk)
*/
int sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
- struct sock *sk=sock->sk;
- struct sk_filter *filter;
+ struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
int ret = 0;
-
+
/*
* Options without arguments
*/
-#ifdef SO_DONTLINGER /* Compatibility item... */
- if (optname == SO_DONTLINGER) {
- lock_sock(sk);
- sock_reset_flag(sk, SOCK_LINGER);
- release_sock(sk);
- return 0;
- }
-#endif
-
- if(optlen<sizeof(int))
- return(-EINVAL);
-
+ if (optname == SO_BINDTODEVICE)
+ return sock_setbindtodevice(sk, optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
if (get_user(val, (int __user *)optval))
return -EFAULT;
-
- valbool = val?1:0;
+
+ valbool = val ? 1 : 0;
lock_sock(sk);
- switch(optname)
- {
- case SO_DEBUG:
- if(val && !capable(CAP_NET_ADMIN))
- {
- ret = -EACCES;
- }
- else if (valbool)
- sock_set_flag(sk, SOCK_DBG);
- else
- sock_reset_flag(sk, SOCK_DBG);
- break;
- case SO_REUSEADDR:
- sk->sk_reuse = valbool;
- break;
- case SO_TYPE:
- case SO_ERROR:
- ret = -ENOPROTOOPT;
- break;
- case SO_DONTROUTE:
- if (valbool)
- sock_set_flag(sk, SOCK_LOCALROUTE);
- else
- sock_reset_flag(sk, SOCK_LOCALROUTE);
- break;
- case SO_BROADCAST:
- sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
- break;
- case SO_SNDBUF:
- /* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_wmem_max)
- val = sysctl_wmem_max;
+ switch (optname) {
+ case SO_DEBUG:
+ if (val && !capable(CAP_NET_ADMIN))
+ ret = -EACCES;
+ else
+ sock_valbool_flag(sk, SOCK_DBG, valbool);
+ break;
+ case SO_REUSEADDR:
+ sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ break;
+ case SO_REUSEPORT:
+ sk->sk_reuseport = valbool;
+ break;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ ret = -ENOPROTOOPT;
+ break;
+ case SO_DONTROUTE:
+ sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ break;
+ case SO_BROADCAST:
+ sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+ break;
+ case SO_SNDBUF:
+ /* Don't error on this BSD doesn't and if you think
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- if ((val * 2) < SOCK_MIN_SNDBUF)
- sk->sk_sndbuf = SOCK_MIN_SNDBUF;
- else
- sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ /* Wake up sending tasks if we upped the value. */
+ sk->sk_write_space(sk);
+ break;
- /*
- * Wake up sending tasks if we
- * upped the value.
- */
- sk->sk_write_space(sk);
+ case SO_SNDBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
break;
+ }
+ goto set_sndbuf;
- case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- break;
- }
- goto set_sndbuf;
-
- case SO_RCVBUF:
- /* Don't error on this BSD doesn't and if you think
- about it this is right. Otherwise apps have to
- play 'guess the biggest size' games. RCVBUF/SNDBUF
- are treated in BSD as hints */
-
- if (val > sysctl_rmem_max)
- val = sysctl_rmem_max;
+ case SO_RCVBUF:
+ /* Don't error on this BSD doesn't and if you think
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /* FIXME: is this lower bound the right one? */
- if ((val * 2) < SOCK_MIN_RCVBUF)
- sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
- else
- sk->sk_rcvbuf = val * 2;
- break;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ /*
+ * We double it on the way in to account for
+ * "struct sk_buff" etc. overhead. Applications
+ * assume that the SO_RCVBUF setting they make will
+ * allow that much actual data to be received on that
+ * socket.
+ *
+ * Applications are unaware that "struct sk_buff" and
+ * other overheads allocate from the receive buffer
+ * during socket buffer allocation.
+ *
+ * And after considering the possible alternatives,
+ * returning the value we actually used in getsockopt
+ * is the most desirable behavior.
+ */
+ sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
+ break;
- case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- break;
- }
- goto set_rcvbuf;
+ case SO_RCVBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_rcvbuf;
- case SO_KEEPALIVE:
+ case SO_KEEPALIVE:
#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP)
- tcp_set_keepalive(sk, valbool);
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
+ tcp_set_keepalive(sk, valbool);
#endif
- sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
- break;
-
- case SO_OOBINLINE:
- sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
- break;
-
- case SO_NO_CHECK:
- sk->sk_no_check = valbool;
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ break;
+
+ case SO_OOBINLINE:
+ sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+ break;
+
+ case SO_NO_CHECK:
+ sk->sk_no_check_tx = valbool;
+ break;
+
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sk->sk_priority = val;
+ else
+ ret = -EPERM;
+ break;
+
+ case SO_LINGER:
+ if (optlen < sizeof(ling)) {
+ ret = -EINVAL; /* 1003.1g */
break;
-
- case SO_PRIORITY:
- if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
- sk->sk_priority = val;
- else
- ret = -EPERM;
+ }
+ if (copy_from_user(&ling, optval, sizeof(ling))) {
+ ret = -EFAULT;
break;
-
- case SO_LINGER:
- if(optlen<sizeof(ling)) {
- ret = -EINVAL; /* 1003.1g */
- break;
- }
- if (copy_from_user(&ling,optval,sizeof(ling))) {
- ret = -EFAULT;
- break;
- }
- if (!ling.l_onoff)
- sock_reset_flag(sk, SOCK_LINGER);
- else {
+ }
+ if (!ling.l_onoff)
+ sock_reset_flag(sk, SOCK_LINGER);
+ else {
#if (BITS_PER_LONG == 32)
- if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
- sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
- else
+ if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ else
#endif
- sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
- sock_set_flag(sk, SOCK_LINGER);
- }
- break;
-
- case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
- break;
-
- case SO_PASSCRED:
- if (valbool)
- set_bit(SOCK_PASSCRED, &sock->flags);
+ sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+ sock_set_flag(sk, SOCK_LINGER);
+ }
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("setsockopt");
+ break;
+
+ case SO_PASSCRED:
+ if (valbool)
+ set_bit(SOCK_PASSCRED, &sock->flags);
+ else
+ clear_bit(SOCK_PASSCRED, &sock->flags);
+ break;
+
+ case SO_TIMESTAMP:
+ case SO_TIMESTAMPNS:
+ if (valbool) {
+ if (optname == SO_TIMESTAMP)
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
else
- clear_bit(SOCK_PASSCRED, &sock->flags);
- break;
-
- case SO_TIMESTAMP:
- if (valbool) {
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk);
- } else
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- break;
-
- case SO_RCVLOWAT:
- if (val < 0)
- val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
- break;
-
- case SO_RCVTIMEO:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
- break;
+ sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+ break;
- case SO_SNDTIMEO:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ case SO_TIMESTAMPING:
+ if (val & ~SOF_TIMESTAMPING_MASK) {
+ ret = -EINVAL;
break;
-
-#ifdef CONFIG_NETDEVICES
- case SO_BINDTODEVICE:
- {
- char devname[IFNAMSIZ];
-
- /* Sorry... */
- if (!capable(CAP_NET_RAW)) {
- ret = -EPERM;
+ }
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
+ val & SOF_TIMESTAMPING_TX_HARDWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
+ val & SOF_TIMESTAMPING_TX_SOFTWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
+ val & SOF_TIMESTAMPING_RX_HARDWARE);
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
+ val & SOF_TIMESTAMPING_SOFTWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
+ val & SOF_TIMESTAMPING_SYS_HARDWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
+ val & SOF_TIMESTAMPING_RAW_HARDWARE);
+ break;
+
+ case SO_RCVLOWAT:
+ if (val < 0)
+ val = INT_MAX;
+ sk->sk_rcvlowat = val ? : 1;
+ break;
+
+ case SO_RCVTIMEO:
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ break;
+
+ case SO_SNDTIMEO:
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ break;
+
+ case SO_ATTACH_FILTER:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
break;
- }
- /* Bind this socket to a particular device like "eth0",
- * as specified in the passed interface name. If the
- * name is "" or the option length is zero the socket
- * is not bound.
- */
-
- if (!valbool) {
- sk->sk_bound_dev_if = 0;
- } else {
- if (optlen > IFNAMSIZ)
- optlen = IFNAMSIZ;
- if (copy_from_user(devname, optval, optlen)) {
- ret = -EFAULT;
- break;
- }
+ ret = sk_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_DETACH_FILTER:
+ ret = sk_detach_filter(sk);
+ break;
+
+ case SO_LOCK_FILTER:
+ if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+ ret = -EPERM;
+ else
+ sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+ break;
+
+ case SO_PASSSEC:
+ if (valbool)
+ set_bit(SOCK_PASSSEC, &sock->flags);
+ else
+ clear_bit(SOCK_PASSSEC, &sock->flags);
+ break;
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ sk->sk_mark = val;
+ break;
- /* Remove any cached route for this socket. */
- sk_dst_reset(sk);
-
- if (devname[0] == '\0') {
- sk->sk_bound_dev_if = 0;
- } else {
- struct net_device *dev = dev_get_by_name(devname);
- if (!dev) {
- ret = -ENODEV;
- break;
- }
- sk->sk_bound_dev_if = dev->ifindex;
- dev_put(dev);
- }
- }
- break;
+ /* We implement the SO_SNDLOWAT etc to
+ not be settable (1003.1g 5.3) */
+ case SO_RXQ_OVFL:
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
+ break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
+ case SO_PEEK_OFF:
+ if (sock->ops->set_peek_off)
+ ret = sock->ops->set_peek_off(sk, val);
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_NOFCS:
+ sock_valbool_flag(sk, SOCK_NOFCS, valbool);
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ /* allow unprivileged users to decrease the value */
+ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else {
+ if (val < 0)
+ ret = -EINVAL;
+ else
+ sk->sk_ll_usec = val;
}
+ break;
#endif
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
- case SO_ATTACH_FILTER:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
-
- ret = sk_attach_filter(&fprog, sk);
- }
- break;
-
- case SO_DETACH_FILTER:
- spin_lock_bh(&sk->sk_lock.slock);
- filter = sk->sk_filter;
- if (filter) {
- sk->sk_filter = NULL;
- spin_unlock_bh(&sk->sk_lock.slock);
- sk_filter_release(sk, filter);
- break;
- }
- spin_unlock_bh(&sk->sk_lock.slock);
- ret = -ENONET;
- break;
-
- /* We implement the SO_SNDLOWAT etc to
- not be settable (1003.1g 5.3) */
- default:
- ret = -ENOPROTOOPT;
- break;
- }
+ default:
+ ret = -ENOPROTOOPT;
+ break;
+ }
release_sock(sk);
return ret;
}
+EXPORT_SYMBOL(sock_setsockopt);
+
+static void cred_to_ucred(struct pid *pid, const struct cred *cred,
+ struct ucred *ucred)
+{
+ ucred->pid = pid_vnr(pid);
+ ucred->uid = ucred->gid = -1;
+ if (cred) {
+ struct user_namespace *current_ns = current_user_ns();
+
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
+ }
+}
int sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
-
- union
- {
- int val;
- struct linger ling;
+
+ union {
+ int val;
+ struct linger ling;
struct timeval tm;
} v;
-
- unsigned int lv = sizeof(int);
+
+ int lv = sizeof(int);
int len;
-
- if(get_user(len,optlen))
- return -EFAULT;
- if(len < 0)
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
return -EINVAL;
-
- switch(optname)
- {
- case SO_DEBUG:
- v.val = sock_flag(sk, SOCK_DBG);
- break;
-
- case SO_DONTROUTE:
- v.val = sock_flag(sk, SOCK_LOCALROUTE);
- break;
-
- case SO_BROADCAST:
- v.val = !!sock_flag(sk, SOCK_BROADCAST);
- break;
- case SO_SNDBUF:
- v.val = sk->sk_sndbuf;
- break;
-
- case SO_RCVBUF:
- v.val = sk->sk_rcvbuf;
- break;
+ memset(&v, 0, sizeof(v));
+
+ switch (optname) {
+ case SO_DEBUG:
+ v.val = sock_flag(sk, SOCK_DBG);
+ break;
+
+ case SO_DONTROUTE:
+ v.val = sock_flag(sk, SOCK_LOCALROUTE);
+ break;
+
+ case SO_BROADCAST:
+ v.val = sock_flag(sk, SOCK_BROADCAST);
+ break;
+
+ case SO_SNDBUF:
+ v.val = sk->sk_sndbuf;
+ break;
+
+ case SO_RCVBUF:
+ v.val = sk->sk_rcvbuf;
+ break;
+
+ case SO_REUSEADDR:
+ v.val = sk->sk_reuse;
+ break;
+
+ case SO_REUSEPORT:
+ v.val = sk->sk_reuseport;
+ break;
+
+ case SO_KEEPALIVE:
+ v.val = sock_flag(sk, SOCK_KEEPOPEN);
+ break;
+
+ case SO_TYPE:
+ v.val = sk->sk_type;
+ break;
+
+ case SO_PROTOCOL:
+ v.val = sk->sk_protocol;
+ break;
+
+ case SO_DOMAIN:
+ v.val = sk->sk_family;
+ break;
+
+ case SO_ERROR:
+ v.val = -sock_error(sk);
+ if (v.val == 0)
+ v.val = xchg(&sk->sk_err_soft, 0);
+ break;
+
+ case SO_OOBINLINE:
+ v.val = sock_flag(sk, SOCK_URGINLINE);
+ break;
+
+ case SO_NO_CHECK:
+ v.val = sk->sk_no_check_tx;
+ break;
+
+ case SO_PRIORITY:
+ v.val = sk->sk_priority;
+ break;
+
+ case SO_LINGER:
+ lv = sizeof(v.ling);
+ v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
+ v.ling.l_linger = sk->sk_lingertime / HZ;
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("getsockopt");
+ break;
+
+ case SO_TIMESTAMP:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPNS:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPING:
+ v.val = 0;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
+ v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+ v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
+ v.val |= SOF_TIMESTAMPING_SOFTWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+ break;
+
+ case SO_RCVTIMEO:
+ lv = sizeof(struct timeval);
+ if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ lv = sizeof(struct timeval);
+ if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+ }
+ break;
- case SO_REUSEADDR:
- v.val = sk->sk_reuse;
- break;
+ case SO_RCVLOWAT:
+ v.val = sk->sk_rcvlowat;
+ break;
- case SO_KEEPALIVE:
- v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
- break;
+ case SO_SNDLOWAT:
+ v.val = 1;
+ break;
- case SO_TYPE:
- v.val = sk->sk_type;
- break;
+ case SO_PASSCRED:
+ v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ break;
- case SO_ERROR:
- v.val = -sock_error(sk);
- if(v.val==0)
- v.val = xchg(&sk->sk_err_soft, 0);
- break;
+ case SO_PEERCRED:
+ {
+ struct ucred peercred;
+ if (len > sizeof(peercred))
+ len = sizeof(peercred);
+ cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+ if (copy_to_user(optval, &peercred, len))
+ return -EFAULT;
+ goto lenout;
+ }
- case SO_OOBINLINE:
- v.val = !!sock_flag(sk, SOCK_URGINLINE);
- break;
-
- case SO_NO_CHECK:
- v.val = sk->sk_no_check;
- break;
+ case SO_PEERNAME:
+ {
+ char address[128];
+
+ if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+ return -ENOTCONN;
+ if (lv < len)
+ return -EINVAL;
+ if (copy_to_user(optval, address, len))
+ return -EFAULT;
+ goto lenout;
+ }
- case SO_PRIORITY:
- v.val = sk->sk_priority;
- break;
-
- case SO_LINGER:
- lv = sizeof(v.ling);
- v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
- v.ling.l_linger = sk->sk_lingertime / HZ;
- break;
-
- case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
- break;
+ /* Dubious BSD thing... Probably nobody even uses it, but
+ * the UNIX standard wants it for whatever reason... -DaveM
+ */
+ case SO_ACCEPTCONN:
+ v.val = sk->sk_state == TCP_LISTEN;
+ break;
- case SO_TIMESTAMP:
- v.val = sock_flag(sk, SOCK_RCVTSTAMP);
- break;
+ case SO_PASSSEC:
+ v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ break;
- case SO_RCVTIMEO:
- lv=sizeof(struct timeval);
- if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
- }
- break;
+ case SO_PEERSEC:
+ return security_socket_getpeersec_stream(sock, optval, optlen, len);
- case SO_SNDTIMEO:
- lv=sizeof(struct timeval);
- if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
- }
- break;
+ case SO_MARK:
+ v.val = sk->sk_mark;
+ break;
- case SO_RCVLOWAT:
- v.val = sk->sk_rcvlowat;
- break;
+ case SO_RXQ_OVFL:
+ v.val = sock_flag(sk, SOCK_RXQ_OVFL);
+ break;
- case SO_SNDLOWAT:
- v.val=1;
- break;
+ case SO_WIFI_STATUS:
+ v.val = sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
- case SO_PASSCRED:
- v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
- break;
+ case SO_PEEK_OFF:
+ if (!sock->ops->set_peek_off)
+ return -EOPNOTSUPP;
- case SO_PEERCRED:
- if (len > sizeof(sk->sk_peercred))
- len = sizeof(sk->sk_peercred);
- if (copy_to_user(optval, &sk->sk_peercred, len))
- return -EFAULT;
- goto lenout;
-
- case SO_PEERNAME:
- {
- char address[128];
-
- if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
- return -ENOTCONN;
- if (lv < len)
- return -EINVAL;
- if (copy_to_user(optval, address, len))
- return -EFAULT;
- goto lenout;
- }
+ v.val = sk->sk_peek_off;
+ break;
+ case SO_NOFCS:
+ v.val = sock_flag(sk, SOCK_NOFCS);
+ break;
- /* Dubious BSD thing... Probably nobody even uses it, but
- * the UNIX standard wants it for whatever reason... -DaveM
- */
- case SO_ACCEPTCONN:
- v.val = sk->sk_state == TCP_LISTEN;
- break;
+ case SO_BINDTODEVICE:
+ return sock_getbindtodevice(sk, optval, optlen, len);
+
+ case SO_GET_FILTER:
+ len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+
+ case SO_LOCK_FILTER:
+ v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+ break;
+
+ case SO_BPF_EXTENSIONS:
+ v.val = bpf_tell_extensions();
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
- case SO_PEERSEC:
- return security_socket_getpeersec(sock, optval, optlen, len);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ v.val = sk->sk_ll_usec;
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ v.val = sk->sk_max_pacing_rate;
+ break;
- default:
- return(-ENOPROTOOPT);
+ default:
+ return -ENOPROTOOPT;
}
+
if (len > lv)
len = lv;
if (copy_to_user(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
- return -EFAULT;
- return 0;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ return 0;
}
-/**
- * sk_alloc - All socket objects are allocated here
- * @family: protocol family
- * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
- * @prot: struct proto associated with this new sock instance
- * @zero_it: if we should zero the newly allocated sock
+/*
+ * Initialize an sk_lock.
+ *
+ * (We also register the sk_lock with the lock validator.)
*/
-struct sock *sk_alloc(int family, gfp_t priority,
- struct proto *prot, int zero_it)
+static inline void sock_lock_init(struct sock *sk)
{
- struct sock *sk = NULL;
- kmem_cache_t *slab = prot->slab;
+ sock_lock_init_class_and_name(sk,
+ af_family_slock_key_strings[sk->sk_family],
+ af_family_slock_keys + sk->sk_family,
+ af_family_key_strings[sk->sk_family],
+ af_family_keys + sk->sk_family);
+}
- if (slab != NULL)
- sk = kmem_cache_alloc(slab, priority);
- else
- sk = kmalloc(prot->obj_size, priority);
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
+ */
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+ void *sptr = nsk->sk_security;
+#endif
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
- if (sk) {
- if (zero_it) {
- memset(sk, 0, prot->obj_size);
- sk->sk_family = family;
- /*
- * See comment in struct sock definition to understand
- * why we need sk_prot_creator -acme
- */
- sk->sk_prot = sk->sk_prot_creator = prot;
- sock_lock_init(sk);
+ memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+#ifdef CONFIG_SECURITY_NETWORK
+ nsk->sk_security = sptr;
+ security_sk_clone(osk, nsk);
+#endif
+}
+
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+ unsigned long nulls1, nulls2;
+
+ nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+ nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+ if (nulls1 > nulls2)
+ swap(nulls1, nulls2);
+
+ if (nulls1 != 0)
+ memset((char *)sk, 0, nulls1);
+ memset((char *)sk + nulls1 + sizeof(void *), 0,
+ nulls2 - nulls1 - sizeof(void *));
+ memset((char *)sk + nulls2 + sizeof(void *), 0,
+ size - nulls2 - sizeof(void *));
+}
+EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ int family)
+{
+ struct sock *sk;
+ struct kmem_cache *slab;
+
+ slab = prot->slab;
+ if (slab != NULL) {
+ sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+ if (!sk)
+ return sk;
+ if (priority & __GFP_ZERO) {
+ if (prot->clear_sk)
+ prot->clear_sk(sk, prot->obj_size);
+ else
+ sk_prot_clear_nulls(sk, prot->obj_size);
}
-
+ } else
+ sk = kmalloc(prot->obj_size, priority);
+
+ if (sk != NULL) {
+ kmemcheck_annotate_bitfield(sk, flags);
+
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
- goto out_free;
+ goto out_free_sec;
+ sk_tx_queue_clear(sk);
}
+
return sk;
+out_free_sec:
+ security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
@@ -677,68 +1344,183 @@ out_free:
return NULL;
}
-void sk_free(struct sock *sk)
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+ struct kmem_cache *slab;
+ struct module *owner;
+
+ owner = prot->owner;
+ slab = prot->slab;
+
+ security_sk_free(sk);
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ module_put(owner);
+}
+
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+void sock_update_netprioidx(struct sock *sk)
+{
+ if (in_interrupt())
+ return;
+
+ sk->sk_cgrp_prioidx = task_netprioidx(current);
+}
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
+#endif
+
+/**
+ * sk_alloc - All socket objects are allocated here
+ * @net: the applicable net namespace
+ * @family: protocol family
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @prot: struct proto associated with this new sock instance
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+ struct proto *prot)
+{
+ struct sock *sk;
+
+ sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+ if (sk) {
+ sk->sk_family = family;
+ /*
+ * See comment in struct sock definition to understand
+ * why we need sk_prot_creator -acme
+ */
+ sk->sk_prot = sk->sk_prot_creator = prot;
+ sock_lock_init(sk);
+ sock_net_set(sk, get_net(net));
+ atomic_set(&sk->sk_wmem_alloc, 1);
+
+ sock_update_classid(sk);
+ sock_update_netprioidx(sk);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL(sk_alloc);
+
+static void __sk_free(struct sock *sk)
{
struct sk_filter *filter;
- struct module *owner = sk->sk_prot_creator->owner;
if (sk->sk_destruct)
sk->sk_destruct(sk);
- filter = sk->sk_filter;
+ filter = rcu_dereference_check(sk->sk_filter,
+ atomic_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
- sk_filter_release(sk, filter);
- sk->sk_filter = NULL;
+ sk_filter_uncharge(sk, filter);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk);
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
if (atomic_read(&sk->sk_omem_alloc))
- printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
- __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
+ pr_debug("%s: optmem leakage (%d bytes) detected\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
- security_sk_free(sk);
- if (sk->sk_prot_creator->slab != NULL)
- kmem_cache_free(sk->sk_prot_creator->slab, sk);
- else
- kfree(sk);
- module_put(owner);
+ if (sk->sk_peer_cred)
+ put_cred(sk->sk_peer_cred);
+ put_pid(sk->sk_peer_pid);
+ put_net(sock_net(sk));
+ sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+void sk_free(struct sock *sk)
+{
+ /*
+ * We subtract one from sk_wmem_alloc and can know if
+ * some packets are still in some tx queue.
+ * If not null, sock_wfree() will call __sk_free(sk) later
+ */
+ if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sk_free);
+
+/*
+ * Last sock_put should drop reference to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking reference to stopping namespace
+ * is not an option.
+ * Take reference to a socket to remove it from hash _alive_ and after that
+ * destroy it in the context of init_net.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+ if (sk == NULL || sk->sk_socket == NULL)
+ return;
+
+ sock_hold(sk);
+ sock_release(sk->sk_socket);
+ release_net(sock_net(sk));
+ sock_net_set(sk, get_net(&init_net));
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
+static void sk_update_clone(const struct sock *sk, struct sock *newsk)
+{
+ if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+ sock_update_memcg(newsk);
}
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
- struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+ struct sock *newsk;
+ newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
if (newsk != NULL) {
struct sk_filter *filter;
- memcpy(newsk, sk, sk->sk_prot->obj_size);
+ sock_copy(newsk, sk);
/* SANITY */
+ get_net(sock_net(newsk));
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
atomic_set(&newsk->sk_rmem_alloc, 0);
- atomic_set(&newsk->sk_wmem_alloc, 0);
+ /*
+ * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
+ */
+ atomic_set(&newsk->sk_wmem_alloc, 1);
atomic_set(&newsk->sk_omem_alloc, 0);
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+ skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
- rwlock_init(&newsk->sk_dst_lock);
+ spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
+ lockdep_set_class_and_name(&newsk->sk_callback_lock,
+ af_callback_keys + newsk->sk_family,
+ af_family_clock_key_strings[newsk->sk_family]);
newsk->sk_dst_cache = NULL;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
newsk->sk_send_head = NULL;
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
sock_reset_flag(newsk, SOCK_DONE);
skb_queue_head_init(&newsk->sk_error_queue);
- filter = newsk->sk_filter;
+ filter = rcu_dereference_protected(newsk->sk_filter, 1);
if (filter != NULL)
sk_filter_charge(newsk, filter);
@@ -746,6 +1528,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
sk_free(newsk);
newsk = NULL;
goto out;
@@ -753,6 +1536,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
newsk->sk_err = 0;
newsk->sk_priority = 0;
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.txt for details)
+ */
+ smp_wmb();
atomic_set(&newsk->sk_refcnt, 2);
/*
@@ -767,80 +1555,138 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
* to be taken into account in all callers. -acme
*/
sk_refcnt_debug_inc(newsk);
- newsk->sk_socket = NULL;
- newsk->sk_sleep = NULL;
+ sk_set_socket(newsk, NULL);
+ newsk->sk_wq = NULL;
+
+ sk_update_clone(sk, newsk);
if (newsk->sk_prot->sockets_allocated)
- atomic_inc(newsk->sk_prot->sockets_allocated);
+ sk_sockets_allocated_inc(newsk);
+
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
}
out:
return newsk;
}
-
-EXPORT_SYMBOL_GPL(sk_clone);
-
-void __init sk_init(void)
-{
- if (num_physpages <= 4096) {
- sysctl_wmem_max = 32767;
- sysctl_rmem_max = 32767;
- sysctl_wmem_default = 32767;
- sysctl_rmem_default = 32767;
- } else if (num_physpages >= 131072) {
- sysctl_wmem_max = 131071;
- sysctl_rmem_max = 131071;
+EXPORT_SYMBOL_GPL(sk_clone_lock);
+
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+ __sk_dst_set(sk, dst);
+ sk->sk_route_caps = dst->dev->features;
+ if (sk->sk_route_caps & NETIF_F_GSO)
+ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+ sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (sk_can_gso(sk)) {
+ if (dst->header_len) {
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ } else {
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = dst->dev->gso_max_size;
+ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ }
}
}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
/*
* Simple resource managers for sockets.
*/
-/*
- * Write buffer destructor automatically called from kfree_skb.
+/*
+ * Write buffer destructor automatically called from kfree_skb.
*/
void sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
- /* In case it might be waiting for more memory. */
- atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
- if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
+ if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ /*
+ * Keep a reference on sk_wmem_alloc, this will be released
+ * after sk_write_space() call
+ */
+ atomic_sub(len - 1, &sk->sk_wmem_alloc);
sk->sk_write_space(sk);
- sock_put(sk);
+ len = 1;
+ }
+ /*
+ * if sk_wmem_alloc reaches 0, we must finish what sk_free()
+ * could not do because of in-flight packets
+ */
+ if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sock_wfree);
+
+void skb_orphan_partial(struct sk_buff *skb)
+{
+ /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
+ * so we do not completely orphan skb, but transfert all
+ * accounted bytes but one, to avoid unexpected reorders.
+ */
+ if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+ || skb->destructor == tcp_wfree
+#endif
+ ) {
+ atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
+ skb->truesize = 1;
+ } else {
+ skb_orphan(skb);
+ }
}
+EXPORT_SYMBOL(skb_orphan_partial);
-/*
- * Read buffer destructor automatically called from kfree_skb.
+/*
+ * Read buffer destructor automatically called from kfree_skb.
*/
void sock_rfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ atomic_sub(len, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, len);
}
+EXPORT_SYMBOL(sock_rfree);
+void sock_edemux(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+#ifdef CONFIG_INET
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put(inet_twsk(sk));
+ else
+#endif
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sock_edemux);
-int sock_i_uid(struct sock *sk)
+kuid_t sock_i_uid(struct sock *sk)
{
- int uid;
+ kuid_t uid;
- read_lock(&sk->sk_callback_lock);
- uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
- read_unlock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
+ read_unlock_bh(&sk->sk_callback_lock);
return uid;
}
+EXPORT_SYMBOL(sock_i_uid);
unsigned long sock_i_ino(struct sock *sk)
{
unsigned long ino;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
return ino;
}
+EXPORT_SYMBOL(sock_i_ino);
/*
* Allocate a skb from the socket's send buffer.
@@ -849,7 +1695,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- struct sk_buff * skb = alloc_skb(size, priority);
+ struct sk_buff *skb = alloc_skb(size, priority);
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -857,33 +1703,18 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
}
return NULL;
}
+EXPORT_SYMBOL(sock_wmalloc);
/*
- * Allocate a skb from the socket's receive buffer.
- */
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
- gfp_t priority)
-{
- if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
- struct sk_buff *skb = alloc_skb(size, priority);
- if (skb) {
- skb_set_owner_r(skb, sk);
- return skb;
- }
- }
- return NULL;
-}
-
-/*
* Allocate a memory block from the socket's option memory buffer.
- */
+ */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned)size <= sysctl_optmem_max &&
+ if ((unsigned int)size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
- * might sleep.
+ * might sleep.
*/
atomic_add(size, &sk->sk_omem_alloc);
mem = kmalloc(size, priority);
@@ -893,6 +1724,7 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
}
return NULL;
}
+EXPORT_SYMBOL(sock_kmalloc);
/*
* Free an option memory block.
@@ -902,11 +1734,12 @@ void sock_kfree_s(struct sock *sk, void *mem, int size)
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
}
+EXPORT_SYMBOL(sock_kfree_s);
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think, these locks should be removed for datagram sockets.
*/
-static long sock_wait_for_wmem(struct sock * sk, long timeo)
+static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
@@ -917,7 +1750,7 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
if (signal_pending(current))
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
@@ -926,7 +1759,7 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
break;
timeo = schedule_timeout(timeo);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return timeo;
}
@@ -935,22 +1768,25 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
* Generic send/receive buffer handlers
*/
-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
- unsigned long header_len,
- unsigned long data_len,
- int noblock, int *errcode)
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+ unsigned long data_len, int noblock,
+ int *errcode, int max_page_order)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ unsigned long chunk;
gfp_t gfp_mask;
long timeo;
int err;
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ struct page *page;
+ int i;
- gfp_mask = sk->sk_allocation;
- if (gfp_mask & __GFP_WAIT)
- gfp_mask |= __GFP_REPEAT;
+ err = -EMSGSIZE;
+ if (npages > MAX_SKB_FRAGS)
+ goto failure;
timeo = sock_sndtimeo(sk, noblock);
- while (1) {
+ while (!skb) {
err = sock_error(sk);
if (err != 0)
goto failure;
@@ -959,54 +1795,54 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- skb = alloc_skb(header_len, sk->sk_allocation);
- if (skb) {
- int npages;
- int i;
-
- /* No pages, we're done... */
- if (!data_len)
- break;
-
- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- skb->truesize += data_len;
- skb_shinfo(skb)->nr_frags = npages;
- for (i = 0; i < npages; i++) {
- struct page *page;
- skb_frag_t *frag;
-
- page = alloc_pages(sk->sk_allocation, 0);
- if (!page) {
- err = -ENOBUFS;
- skb_shinfo(skb)->nr_frags = i;
- kfree_skb(skb);
- goto failure;
- }
-
- frag = &skb_shinfo(skb)->frags[i];
- frag->page = page;
- frag->page_offset = 0;
- frag->size = (data_len >= PAGE_SIZE ?
- PAGE_SIZE :
- data_len);
- data_len -= PAGE_SIZE;
- }
+ if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
+ goto failure;
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
+ continue;
+ }
- /* Full success... */
- break;
- }
- err = -ENOBUFS;
+ err = -ENOBUFS;
+ gfp_mask = sk->sk_allocation;
+ if (gfp_mask & __GFP_WAIT)
+ gfp_mask |= __GFP_REPEAT;
+
+ skb = alloc_skb(header_len, gfp_mask);
+ if (!skb)
goto failure;
+
+ skb->truesize += data_len;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages(sk->sk_allocation |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ }
+ order--;
+ }
+ page = alloc_page(sk->sk_allocation);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
}
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = -EAGAIN;
- if (!timeo)
- goto failure;
- if (signal_pending(current))
- goto interrupted;
- timeo = sock_wait_for_wmem(sk, timeo);
}
skb_set_owner_w(skb, sk);
@@ -1015,33 +1851,96 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
interrupted:
err = sock_intr_errno(timeo);
failure:
+ kfree_skb(skb);
*errcode = err;
return NULL;
}
+EXPORT_SYMBOL(sock_alloc_send_pskb);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
int noblock, int *errcode)
{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
+}
+EXPORT_SYMBOL(sock_alloc_send_skb);
+
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+
+/**
+ * skb_page_frag_refill - check that a page_frag contains enough room
+ * @sz: minimum size of the fragment we want to get
+ * @pfrag: pointer to page_frag
+ * @prio: priority for memory allocation
+ *
+ * Note: While this allocator tries to use high order pages, there is
+ * no guarantee that allocations succeed. Therefore, @sz MUST be
+ * less or equal than PAGE_SIZE.
+ */
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+{
+ int order;
+
+ if (pfrag->page) {
+ if (atomic_read(&pfrag->page->_count) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ order = SKB_FRAG_PAGE_ORDER;
+ do {
+ gfp_t gfp = prio;
+
+ if (order)
+ gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+ pfrag->page = alloc_pages(gfp, order);
+ if (likely(pfrag->page)) {
+ pfrag->offset = 0;
+ pfrag->size = PAGE_SIZE << order;
+ return true;
+ }
+ } while (--order >= 0);
+
+ return false;
+}
+EXPORT_SYMBOL(skb_page_frag_refill);
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+ return true;
+
+ sk_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
}
+EXPORT_SYMBOL(sk_page_frag_refill);
static void __lock_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
{
DEFINE_WAIT(wait);
- for(;;) {
+ for (;;) {
prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_bh(&sk->sk_lock.slock);
schedule();
spin_lock_bh(&sk->sk_lock.slock);
- if(!sock_owned_by_user(sk))
+ if (!sock_owned_by_user(sk))
break;
}
finish_wait(&sk->sk_lock.wq, &wait);
}
static void __release_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb = sk->sk_backlog.head;
@@ -1052,8 +1951,10 @@ static void __release_sock(struct sock *sk)
do {
struct sk_buff *next = skb->next;
+ prefetch(next);
+ WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
- sk->sk_backlog_rcv(sk, skb);
+ sk_backlog_rcv(sk, skb);
/*
* We are in process context here with softirqs
@@ -1067,7 +1968,13 @@ static void __release_sock(struct sock *sk)
} while (skb != NULL);
bh_lock_sock(sk);
- } while((skb = sk->sk_backlog.head) != NULL);
+ } while ((skb = sk->sk_backlog.head) != NULL);
+
+ /*
+ * Doing the zeroing here guarantee we can not loop forever
+ * while a wild producer attempts to flood us.
+ */
+ sk->sk_backlog.len = 0;
}
/**
@@ -1085,16 +1992,120 @@ int sk_wait_data(struct sock *sk, long *timeo)
int rc;
DEFINE_WAIT(wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return rc;
}
-
EXPORT_SYMBOL(sk_wait_data);
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ struct proto *prot = sk->sk_prot;
+ int amt = sk_mem_pages(size);
+ long allocated;
+ int parent_status = UNDER_LIMIT;
+
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+
+ allocated = sk_memory_allocated_add(sk, amt, &parent_status);
+
+ /* Under limit. */
+ if (parent_status == UNDER_LIMIT &&
+ allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
+ return 1;
+ }
+
+ /* Under pressure. (we or our parents) */
+ if ((parent_status > SOFT_LIMIT) ||
+ allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
+
+ /* Over hard limit (we or our parents) */
+ if ((parent_status == OVER_LIMIT) ||
+ (allocated > sk_prot_mem_limits(sk, 2)))
+ goto suppress_allocation;
+
+ /* guarantee minimum buffer size under pressure */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+ return 1;
+
+ } else { /* SK_MEM_SEND */
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+ return 1;
+ } else if (atomic_read(&sk->sk_wmem_alloc) <
+ prot->sysctl_wmem[0])
+ return 1;
+ }
+
+ if (sk_has_memory_pressure(sk)) {
+ int alloc;
+
+ if (!sk_under_memory_pressure(sk))
+ return 1;
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+ * In this case we cannot block, so that we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ return 1;
+ }
+
+ trace_sock_exceed_buf_limit(sk, prot, allocated);
+
+ /* Alas. Undo changes. */
+ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+
+ sk_memory_allocated_sub(sk, amt);
+
+ return 0;
+}
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ * __sk_reclaim - reclaim memory_allocated
+ * @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+ sk_memory_allocated_sub(sk,
+ sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
+ sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
+}
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
/*
* Set of default routines for initialising struct proto_ops when
* the protocol does not support a particular function. In certain
@@ -1106,78 +2117,92 @@ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_bind);
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
int len, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_connect);
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_socketpair);
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_accept);
-int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
int *len, int peer)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_getname);
-unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
+unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
return 0;
}
+EXPORT_SYMBOL(sock_no_poll);
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_ioctl);
int sock_no_listen(struct socket *sock, int backlog)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_listen);
int sock_no_shutdown(struct socket *sock, int how)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_shutdown);
int sock_no_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_setsockopt);
int sock_no_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_getsockopt);
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
size_t len)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_sendmsg);
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
size_t len, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_recvmsg);
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
/* Mirror missing mmap method error code */
return -ENODEV;
}
+EXPORT_SYMBOL(sock_no_mmap);
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
@@ -1191,6 +2216,7 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz
kunmap(page);
return res;
}
+EXPORT_SYMBOL(sock_no_sendpage);
/*
* Default Socket Callbacks
@@ -1198,47 +2224,61 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz
static void sock_def_wakeup(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible_all(sk->sk_sleep);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
}
static void sock_def_error_report(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- sk_wake_async(sk,0,POLL_ERR);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, POLLERR);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+static void sock_def_readable(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- sk_wake_async(sk,1,POLL_IN);
- read_unlock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+ POLLRDNORM | POLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
}
static void sock_def_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
+
+ rcu_read_lock();
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
- sk_wake_async(sk, 2, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
- read_unlock(&sk->sk_callback_lock);
+ rcu_read_unlock();
}
static void sock_def_destruct(struct sock *sk)
@@ -1250,8 +2290,9 @@ void sk_send_sigurg(struct sock *sk)
{
if (sk->sk_socket && sk->sk_socket->file)
if (send_sigurg(&sk->sk_socket->file->f_owner))
- sk_wake_async(sk, 3, POLL_PRI);
+ sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
+EXPORT_SYMBOL(sk_send_sigurg);
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
unsigned long expires)
@@ -1259,15 +2300,13 @@ void sk_reset_timer(struct sock *sk, struct timer_list* timer,
if (!mod_timer(timer, expires))
sock_hold(sk);
}
-
EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (timer_pending(timer) && del_timer(timer))
+ if (del_timer(timer))
__sock_put(sk);
}
-
EXPORT_SYMBOL(sk_stop_timer);
void sock_init_data(struct socket *sock, struct sock *sk)
@@ -1275,29 +2314,34 @@ void sock_init_data(struct socket *sock, struct sock *sk)
skb_queue_head_init(&sk->sk_receive_queue);
skb_queue_head_init(&sk->sk_write_queue);
skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+ skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
sk->sk_send_head = NULL;
init_timer(&sk->sk_timer);
-
+
sk->sk_allocation = GFP_KERNEL;
sk->sk_rcvbuf = sysctl_rmem_default;
sk->sk_sndbuf = sysctl_wmem_default;
sk->sk_state = TCP_CLOSE;
- sk->sk_socket = sock;
+ sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
- if(sock)
- {
+ if (sock) {
sk->sk_type = sock->type;
- sk->sk_sleep = &sock->wait;
+ sk->sk_wq = sock->wq;
sock->sk = sk;
} else
- sk->sk_sleep = NULL;
+ sk->sk_wq = NULL;
- rwlock_init(&sk->sk_dst_lock);
+ spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
@@ -1305,68 +2349,202 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ sk->sk_peek_off = -1;
- sk->sk_peercred.pid = 0;
- sk->sk_peercred.uid = -1;
- sk->sk_peercred.gid = -1;
+ sk->sk_peer_pid = NULL;
+ sk->sk_peer_cred = NULL;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
- sk->sk_stamp.tv_sec = -1L;
- sk->sk_stamp.tv_usec = -1L;
+ sk->sk_stamp = ktime_set(-1L, 0);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ sk->sk_napi_id = 0;
+ sk->sk_ll_usec = sysctl_net_busy_read;
+#endif
+ sk->sk_max_pacing_rate = ~0U;
+ sk->sk_pacing_rate = ~0U;
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.txt for details)
+ */
+ smp_wmb();
atomic_set(&sk->sk_refcnt, 1);
+ atomic_set(&sk->sk_drops, 0);
}
+EXPORT_SYMBOL(sock_init_data);
-void fastcall lock_sock(struct sock *sk)
+void lock_sock_nested(struct sock *sk, int subclass)
{
might_sleep();
- spin_lock_bh(&(sk->sk_lock.slock));
- if (sk->sk_lock.owner)
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_lock.owned)
__lock_sock(sk);
- sk->sk_lock.owner = (void *)1;
- spin_unlock_bh(&(sk->sk_lock.slock));
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+ local_bh_enable();
}
+EXPORT_SYMBOL(lock_sock_nested);
-EXPORT_SYMBOL(lock_sock);
-
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
{
- spin_lock_bh(&(sk->sk_lock.slock));
+ /*
+ * The sk_lock has mutex_unlock() semantics:
+ */
+ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+
+ spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
- sk->sk_lock.owner = NULL;
- if (waitqueue_active(&(sk->sk_lock.wq)))
- wake_up(&(sk->sk_lock.wq));
- spin_unlock_bh(&(sk->sk_lock.slock));
+
+ /* Warning : release_cb() might need to release sk ownership,
+ * ie call sock_release_ownership(sk) before us.
+ */
+ if (sk->sk_prot->release_cb)
+ sk->sk_prot->release_cb(sk);
+
+ sock_release_ownership(sk);
+ if (waitqueue_active(&sk->sk_lock.wq))
+ wake_up(&sk->sk_lock.wq);
+ spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small section, where process wont block
+ * return false if fast path is taken
+ * sk_lock.slock locked, owned = 0, BH disabled
+ * return true if slow path is taken
+ * sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+bool lock_sock_fast(struct sock *sk)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+
+ if (!sk->sk_lock.owned)
+ /*
+ * Note : We must disable BH
+ */
+ return false;
+
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+ local_bh_enable();
+ return true;
+}
+EXPORT_SYMBOL(lock_sock_fast);
+
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
-{
+{
+ struct timeval tv;
if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk);
- if (sk->sk_stamp.tv_sec == -1)
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ tv = ktime_to_timeval(sk->sk_stamp);
+ if (tv.tv_sec == -1)
return -ENOENT;
- if (sk->sk_stamp.tv_sec == 0)
- do_gettimeofday(&sk->sk_stamp);
- return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
- -EFAULT : 0;
-}
+ if (tv.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ tv = ktime_to_timeval(sk->sk_stamp);
+ }
+ return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
+}
EXPORT_SYMBOL(sock_get_timestamp);
-void sock_enable_timestamp(struct sock *sk)
-{
- if (!sock_flag(sk, SOCK_TIMESTAMP)) {
- sock_set_flag(sk, SOCK_TIMESTAMP);
- net_enable_timestamp();
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+ struct timespec ts;
+ if (!sock_flag(sk, SOCK_TIMESTAMP))
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ ts = ktime_to_timespec(sk->sk_stamp);
+ if (ts.tv_sec == -1)
+ return -ENOENT;
+ if (ts.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ ts = ktime_to_timespec(sk->sk_stamp);
+ }
+ return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
+
+void sock_enable_timestamp(struct sock *sk, int flag)
+{
+ if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
+ sock_set_flag(sk, flag);
+ /*
+ * we just set one of the two flags which require net
+ * time stamping, but time stamping might have been on
+ * already because of the other one
+ */
+ if (!(previous_flags & SK_FLAGS_TIMESTAMP))
+ net_enable_timestamp();
+ }
+}
+
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+ int level, int type)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb, *skb2;
+ int copied, err;
+
+ err = -EAGAIN;
+ skb = skb_dequeue(&sk->sk_error_queue);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
}
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+ put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+ /* Reset and regenerate socket error */
+ spin_lock_bh(&sk->sk_error_queue.lock);
+ sk->sk_err = 0;
+ if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+ sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+ spin_unlock_bh(&sk->sk_error_queue.lock);
+ sk->sk_error_report(sk);
+ } else
+ spin_unlock_bh(&sk->sk_error_queue.lock);
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
}
-EXPORT_SYMBOL(sock_enable_timestamp);
+EXPORT_SYMBOL(sock_recv_errqueue);
/*
* Get a socket option on an socket.
@@ -1382,9 +2560,22 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(sock_common_getsockopt);
+#ifdef CONFIG_COMPAT
+int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->compat_getsockopt != NULL)
+ return sk->sk_prot->compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
@@ -1398,22 +2589,34 @@ int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
msg->msg_namelen = addr_len;
return err;
}
-
EXPORT_SYMBOL(sock_common_recvmsg);
/*
* Set socket options on an inet socket.
*/
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(sock_common_setsockopt);
+#ifdef CONFIG_COMPAT
+int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->compat_setsockopt != NULL)
+ return sk->sk_prot->compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
+
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
@@ -1446,96 +2649,194 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
sock_put(sk);
}
-
EXPORT_SYMBOL(sk_common_release);
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR 64 /* should be enough for the first time */
+struct prot_inuse {
+ int val[PROTO_INUSE_NR];
+};
-int proto_register(struct proto *prot, int alloc_slab)
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+ __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+static int __net_init sock_inuse_init_net(struct net *net)
+{
+ net->core.inuse = alloc_percpu(struct prot_inuse);
+ return net->core.inuse ? 0 : -ENOMEM;
+}
+
+static void __net_exit sock_inuse_exit_net(struct net *net)
+{
+ free_percpu(net->core.inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+ .init = sock_inuse_init_net,
+ .exit = sock_inuse_exit_net,
+};
+
+static __init int net_inuse_init(void)
+{
+ if (register_pernet_subsys(&net_inuse_ops))
+ panic("Cannot initialize net inuse counters");
+
+ return 0;
+}
+
+core_initcall(net_inuse_init);
+#else
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+ __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += per_cpu(prot_inuse, cpu).val[idx];
+
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+static void assign_proto_idx(struct proto *prot)
+{
+ prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ pr_err("PROTO_INUSE_NR exhausted\n");
+ return;
+ }
+
+ set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+static void release_proto_idx(struct proto *prot)
+{
+ if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+static inline void release_proto_idx(struct proto *prot)
{
- char *request_sock_slab_name = NULL;
- char *timewait_sock_slab_name;
- int rc = -ENOBUFS;
+}
+#endif
+int proto_register(struct proto *prot, int alloc_slab)
+{
if (alloc_slab) {
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ SLAB_HWCACHE_ALIGN | prot->slab_flags,
+ NULL);
if (prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
- prot->name);
+ pr_crit("%s: Can't create sock SLAB cache!\n",
+ prot->name);
goto out;
}
if (prot->rsk_prot != NULL) {
- static const char mask[] = "request_sock_%s";
-
- request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
- if (request_sock_slab_name == NULL)
+ prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
+ if (prot->rsk_prot->slab_name == NULL)
goto out_free_sock_slab;
- sprintf(request_sock_slab_name, mask, prot->name);
- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+ prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
prot->rsk_prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ SLAB_HWCACHE_ALIGN, NULL);
if (prot->rsk_prot->slab == NULL) {
- printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
- prot->name);
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
goto out_free_request_sock_slab_name;
}
}
if (prot->twsk_prot != NULL) {
- static const char mask[] = "tw_sock_%s";
+ prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
- timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
-
- if (timewait_sock_slab_name == NULL)
+ if (prot->twsk_prot->twsk_slab_name == NULL)
goto out_free_request_sock_slab;
- sprintf(timewait_sock_slab_name, mask, prot->name);
prot->twsk_prot->twsk_slab =
- kmem_cache_create(timewait_sock_slab_name,
+ kmem_cache_create(prot->twsk_prot->twsk_slab_name,
prot->twsk_prot->twsk_obj_size,
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ prot->slab_flags,
+ NULL);
if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
}
}
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list);
- write_unlock(&proto_list_lock);
- rc = 0;
-out:
- return rc;
+ assign_proto_idx(prot);
+ mutex_unlock(&proto_list_mutex);
+ return 0;
+
out_free_timewait_sock_slab_name:
- kfree(timewait_sock_slab_name);
+ kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
if (prot->rsk_prot && prot->rsk_prot->slab) {
kmem_cache_destroy(prot->rsk_prot->slab);
prot->rsk_prot->slab = NULL;
}
out_free_request_sock_slab_name:
- kfree(request_sock_slab_name);
+ if (prot->rsk_prot)
+ kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
- goto out;
+out:
+ return -ENOBUFS;
}
-
EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
+ release_proto_idx(prot);
list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
@@ -1543,86 +2844,63 @@ void proto_unregister(struct proto *prot)
}
if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
- const char *name = kmem_cache_name(prot->rsk_prot->slab);
-
kmem_cache_destroy(prot->rsk_prot->slab);
- kfree(name);
+ kfree(prot->rsk_prot->slab_name);
prot->rsk_prot->slab = NULL;
}
if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
-
kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(name);
+ kfree(prot->twsk_prot->twsk_slab_name);
prot->twsk_prot->twsk_slab = NULL;
}
}
-
EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
-static inline struct proto *__proto_head(void)
-{
- return list_entry(proto_list.next, struct proto, node);
-}
-
-static inline struct proto *proto_head(void)
-{
- return list_empty(&proto_list) ? NULL : __proto_head();
-}
-
-static inline struct proto *proto_next(struct proto *proto)
-{
- return proto->node.next == &proto_list ? NULL :
- list_entry(proto->node.next, struct proto, node);
-}
-
-static inline struct proto *proto_get_idx(loff_t pos)
-{
- struct proto *proto;
- loff_t i = 0;
-
- list_for_each_entry(proto, &proto_list, node)
- if (i++ == pos)
- goto out;
-
- proto = NULL;
-out:
- return proto;
-}
-
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(proto_list_mutex)
{
- read_lock(&proto_list_lock);
- return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ mutex_lock(&proto_list_mutex);
+ return seq_list_start_head(&proto_list, *pos);
}
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
+ return seq_list_next(v, &proto_list, pos);
}
static void proto_seq_stop(struct seq_file *seq, void *v)
+ __releases(proto_list_mutex)
{
- read_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
return method == NULL ? 'n' : 'y';
}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
- seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
+
+ seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
- proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
- proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
- proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ sock_prot_inuse_get(seq_file_net(seq), proto),
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
proto->max_header,
proto->slab == NULL ? "no" : "yes",
module_name(proto->owner),
@@ -1649,7 +2927,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
static int proto_seq_show(struct seq_file *seq, void *v)
{
- if (v == SEQ_START_TOKEN)
+ if (v == &proto_list)
seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
"protocol",
"size",
@@ -1661,11 +2939,11 @@ static int proto_seq_show(struct seq_file *seq, void *v)
"module",
"cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
else
- proto_seq_printf(seq, v);
+ proto_seq_printf(seq, list_entry(v, struct proto, node));
return 0;
}
-static struct seq_operations proto_seq_ops = {
+static const struct seq_operations proto_seq_ops = {
.start = proto_seq_start,
.next = proto_seq_next,
.stop = proto_seq_stop,
@@ -1674,57 +2952,42 @@ static struct seq_operations proto_seq_ops = {
static int proto_seq_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &proto_seq_ops);
+ return seq_open_net(inode, file, &proto_seq_ops,
+ sizeof(struct seq_net_private));
}
-static struct file_operations proto_seq_fops = {
+static const struct file_operations proto_seq_fops = {
.owner = THIS_MODULE,
.open = proto_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
+};
+
+static __net_init int proto_init_net(struct net *net)
+{
+ if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
+{
+ remove_proc_entry("protocols", net->proc_net);
+}
+
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+ .init = proto_init_net,
+ .exit = proto_exit_net,
};
static int __init proto_init(void)
{
- /* register /proc/net/protocols */
- return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+ return register_pernet_subsys(&proto_net_ops);
}
subsys_initcall(proto_init);
#endif /* PROC_FS */
-
-EXPORT_SYMBOL(sk_alloc);
-EXPORT_SYMBOL(sk_free);
-EXPORT_SYMBOL(sk_send_sigurg);
-EXPORT_SYMBOL(sock_alloc_send_skb);
-EXPORT_SYMBOL(sock_init_data);
-EXPORT_SYMBOL(sock_kfree_s);
-EXPORT_SYMBOL(sock_kmalloc);
-EXPORT_SYMBOL(sock_no_accept);
-EXPORT_SYMBOL(sock_no_bind);
-EXPORT_SYMBOL(sock_no_connect);
-EXPORT_SYMBOL(sock_no_getname);
-EXPORT_SYMBOL(sock_no_getsockopt);
-EXPORT_SYMBOL(sock_no_ioctl);
-EXPORT_SYMBOL(sock_no_listen);
-EXPORT_SYMBOL(sock_no_mmap);
-EXPORT_SYMBOL(sock_no_poll);
-EXPORT_SYMBOL(sock_no_recvmsg);
-EXPORT_SYMBOL(sock_no_sendmsg);
-EXPORT_SYMBOL(sock_no_sendpage);
-EXPORT_SYMBOL(sock_no_setsockopt);
-EXPORT_SYMBOL(sock_no_shutdown);
-EXPORT_SYMBOL(sock_no_socketpair);
-EXPORT_SYMBOL(sock_rfree);
-EXPORT_SYMBOL(sock_setsockopt);
-EXPORT_SYMBOL(sock_wfree);
-EXPORT_SYMBOL(sock_wmalloc);
-EXPORT_SYMBOL(sock_i_uid);
-EXPORT_SYMBOL(sock_i_ino);
-EXPORT_SYMBOL(sysctl_optmem_max);
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_rmem_max);
-EXPORT_SYMBOL(sysctl_wmem_max);
-#endif
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 00000000000..a4216a4c957
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,231 @@
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <net/sock.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
+static DEFINE_MUTEX(sock_diag_table_mutex);
+
+int sock_diag_check_cookie(void *sk, __u32 *cookie)
+{
+ if ((cookie[0] != INET_DIAG_NOCOOKIE ||
+ cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
+ return -ESTALE;
+ else
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(void *sk, __u32 *cookie)
+{
+ cookie[0] = (u32)(unsigned long)sk;
+ cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ u32 mem[SK_MEMINFO_VARS];
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+
+ return nla_put(skb, attrtype, sizeof(mem), &mem);
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
+ struct sk_buff *skb, int attrtype)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ struct nlattr *attr;
+ unsigned int flen;
+ int err = 0;
+
+ if (!may_report_filterinfo) {
+ nla_reserve(skb, attrtype, 0);
+ return 0;
+ }
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (!filter)
+ goto out;
+
+ fprog = filter->orig_prog;
+ flen = sk_filter_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
+ if (attr == NULL) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(nla_data(attr), fprog->filter, flen);
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(sock_diag_put_filterinfo);
+
+void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = fn;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(const struct sock_diag_handler *hndl)
+{
+ int err = 0;
+
+ if (hndl->family >= AF_MAX)
+ return -EINVAL;
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (sock_diag_handlers[hndl->family])
+ err = -EBUSY;
+ else
+ sock_diag_handlers[hndl->family] = hndl;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(const struct sock_diag_handler *hnld)
+{
+ int family = hnld->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ mutex_lock(&sock_diag_table_mutex);
+ BUG_ON(sock_diag_handlers[family] != hnld);
+ sock_diag_handlers[family] = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = nlmsg_data(nlh);
+ const struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ if (req->sdiag_family >= AF_MAX)
+ return -EINVAL;
+
+ if (sock_diag_handlers[req->sdiag_family] == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, req->sdiag_family);
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[req->sdiag_family];
+ if (hndl == NULL)
+ err = -ENOENT;
+ else
+ err = hndl->dump(skb, nlh);
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ case DCCPDIAG_GETSOCK:
+ if (inet_rcv_compat == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (inet_rcv_compat != NULL)
+ ret = inet_rcv_compat(skb, nlh);
+ else
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ return __sock_diag_rcv_msg(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&sock_diag_mutex);
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+ mutex_unlock(&sock_diag_mutex);
+}
+
+static int __net_init diag_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = sock_diag_rcv,
+ };
+
+ net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
+ return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->diag_nlsk);
+ net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+ .init = diag_net_init,
+ .exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+ return register_pernet_subsys(&diag_net_ops);
+}
+
+static void __exit sock_diag_exit(void)
+{
+ unregister_pernet_subsys(&diag_net_ops);
+}
+
+module_init(sock_diag_init);
+module_exit(sock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
diff --git a/net/core/stream.c b/net/core/stream.c
index 35e25259fd9..301c05f2606 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -9,7 +9,7 @@
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* (from old tcp.c code)
- * Alan Cox <alan@redhat.com> (Borrowed comments 8-))
+ * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
*/
#include <linux/module.h>
@@ -28,17 +28,21 @@
void sk_stream_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
+ struct socket_wq *wq;
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
+ if (sk_stream_is_writeable(sk) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
- sock_wake_async(sock, 2, POLL_OUT);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
}
}
-
EXPORT_SYMBOL(sk_stream_write_space);
/**
@@ -65,18 +69,17 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
- !((1 << sk->sk_state) &
+ !((1 << sk->sk_state) &
~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
}
-
EXPORT_SYMBOL(sk_stream_wait_connect);
/**
@@ -95,16 +98,15 @@ void sk_stream_wait_close(struct sock *sk, long timeout)
DEFINE_WAIT(wait);
do {
- prepare_to_wait(sk->sk_sleep, &wait,
+ prepare_to_wait(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
}
}
-
EXPORT_SYMBOL(sk_stream_wait_close);
/**
@@ -120,12 +122,12 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
DEFINE_WAIT(wait);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
while (1) {
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
@@ -139,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, !sk->sk_err &&
- !(sk->sk_shutdown & SEND_SHUTDOWN) &&
- sk_stream_memory_free(sk) &&
- vm_wait);
+ sk_wait_event(sk, &current_timeo, sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) &&
+ !vm_wait));
sk->sk_write_pending--;
if (vm_wait) {
@@ -156,7 +158,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return err;
do_error:
@@ -169,19 +171,8 @@ do_interrupted:
err = sock_intr_errno(*timeo_p);
goto out;
}
-
EXPORT_SYMBOL(sk_stream_wait_memory);
-void sk_stream_rfree(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
-
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- sk->sk_forward_alloc += skb->truesize;
-}
-
-EXPORT_SYMBOL(sk_stream_rfree);
-
int sk_stream_error(struct sock *sk, int flags, int err)
{
if (err == -EPIPE)
@@ -190,81 +181,8 @@ int sk_stream_error(struct sock *sk, int flags, int err)
send_sig(SIGPIPE, current, 0);
return err;
}
-
EXPORT_SYMBOL(sk_stream_error);
-void __sk_stream_mem_reclaim(struct sock *sk)
-{
- if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) {
- atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM,
- sk->sk_prot->memory_allocated);
- sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1;
- if (*sk->sk_prot->memory_pressure &&
- (atomic_read(sk->sk_prot->memory_allocated) <
- sk->sk_prot->sysctl_mem[0]))
- *sk->sk_prot->memory_pressure = 0;
- }
-}
-
-EXPORT_SYMBOL(__sk_stream_mem_reclaim);
-
-int sk_stream_mem_schedule(struct sock *sk, int size, int kind)
-{
- int amt = sk_stream_pages(size);
-
- sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM;
- atomic_add(amt, sk->sk_prot->memory_allocated);
-
- /* Under limit. */
- if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) {
- if (*sk->sk_prot->memory_pressure)
- *sk->sk_prot->memory_pressure = 0;
- return 1;
- }
-
- /* Over hard limit. */
- if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) {
- sk->sk_prot->enter_memory_pressure();
- goto suppress_allocation;
- }
-
- /* Under pressure. */
- if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1])
- sk->sk_prot->enter_memory_pressure();
-
- if (kind) {
- if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0])
- return 1;
- } else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0])
- return 1;
-
- if (!*sk->sk_prot->memory_pressure ||
- sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) *
- sk_stream_pages(sk->sk_wmem_queued +
- atomic_read(&sk->sk_rmem_alloc) +
- sk->sk_forward_alloc))
- return 1;
-
-suppress_allocation:
-
- if (!kind) {
- sk_stream_moderate_sndbuf(sk);
-
- /* Fail only if socket is _under_ its sndbuf.
- * In this case we cannot block, so that we have to fail.
- */
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
- return 1;
- }
-
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM;
- atomic_sub(amt, sk->sk_prot->memory_allocated);
- return 0;
-}
-
-EXPORT_SYMBOL(sk_stream_mem_schedule);
-
void sk_stream_kill_queues(struct sock *sk)
{
/* First the read buffer. */
@@ -274,18 +192,17 @@ void sk_stream_kill_queues(struct sock *sk)
__skb_queue_purge(&sk->sk_error_queue);
/* Next, the write queue. */
- BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
+ WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
- BUG_TRAP(!sk->sk_wmem_queued);
- BUG_TRAP(!sk->sk_forward_alloc);
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
/* It is _impossible_ for the backlog to contain anything
* when we get here. All user references to this socket
* have gone away, only the net layer knows can touch it.
*/
}
-
EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2f278c8e474..cf9cd13509a 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -7,128 +7,428 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kmemleak.h>
+
+#include <net/ip.h>
#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/pkt_sched.h>
+
+static int zero = 0;
+static int one = 1;
+static int ushort_max = USHRT_MAX;
+
+#ifdef CONFIG_RPS
+static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int orig_size, size;
+ int ret, i;
+ struct ctl_table tmp = {
+ .data = &size,
+ .maxlen = sizeof(size),
+ .mode = table->mode
+ };
+ struct rps_sock_flow_table *orig_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+
+ mutex_lock(&sock_flow_mutex);
+
+ orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+ lockdep_is_held(&sock_flow_mutex));
+ size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+ if (write) {
+ if (size) {
+ if (size > 1<<30) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table =
+ vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!sock_table) {
+ mutex_unlock(&sock_flow_mutex);
+ return -ENOMEM;
+ }
+
+ sock_table->mask = size - 1;
+ } else
+ sock_table = orig_sock_table;
+
+ for (i = 0; i < size; i++)
+ sock_table->ents[i] = RPS_NO_CPU;
+ } else
+ sock_table = NULL;
+
+ if (sock_table != orig_sock_table) {
+ rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ if (sock_table)
+ static_key_slow_inc(&rps_needed);
+ if (orig_sock_table) {
+ static_key_slow_dec(&rps_needed);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
+ }
+ }
+
+ mutex_unlock(&sock_flow_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_RPS */
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sd_flow_limit *cur;
+ struct softnet_data *sd;
+ cpumask_var_t mask;
+ int i, len, ret = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (write) {
+ ret = cpumask_parse_user(buffer, *lenp, mask);
+ if (ret)
+ goto done;
+
+ mutex_lock(&flow_limit_update_mutex);
+ len = sizeof(*cur) + netdev_flow_limit_table_len;
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ cur = rcu_dereference_protected(sd->flow_limit,
+ lockdep_is_held(&flow_limit_update_mutex));
+ if (cur && !cpumask_test_cpu(i, mask)) {
+ RCU_INIT_POINTER(sd->flow_limit, NULL);
+ synchronize_rcu();
+ kfree(cur);
+ } else if (!cur && cpumask_test_cpu(i, mask)) {
+ cur = kzalloc_node(len, GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cur) {
+ /* not unwinding previous changes */
+ ret = -ENOMEM;
+ goto write_unlock;
+ }
+ cur->num_buckets = netdev_flow_limit_table_len;
+ rcu_assign_pointer(sd->flow_limit, cur);
+ }
+ }
+write_unlock:
+ mutex_unlock(&flow_limit_update_mutex);
+ } else {
+ char kbuf[128];
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ goto done;
+ }
+
+ cpumask_clear(mask);
+ rcu_read_lock();
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ if (rcu_dereference(sd->flow_limit))
+ cpumask_set_cpu(i, mask);
+ }
+ rcu_read_unlock();
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = cpumask_scnprintf(kbuf, len, mask);
+ if (!len) {
+ *lenp = 0;
+ goto done;
+ }
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ if (copy_to_user(buffer, kbuf, len)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
-#ifdef CONFIG_SYSCTL
+done:
+ free_cpumask_var(mask);
+ return ret;
+}
-extern int netdev_max_backlog;
-extern int weight_p;
+static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old, *ptr;
+ int ret;
-extern __u32 sysctl_wmem_max;
-extern __u32 sysctl_rmem_max;
+ mutex_lock(&flow_limit_update_mutex);
-extern int sysctl_core_destroy_delay;
+ ptr = table->data;
+ old = *ptr;
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write && !is_power_of_2(*ptr)) {
+ *ptr = old;
+ ret = -EINVAL;
+ }
-#ifdef CONFIG_NET_DIVERT
-extern char sysctl_divert_version[];
-#endif /* CONFIG_NET_DIVERT */
+ mutex_unlock(&flow_limit_update_mutex);
+ return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
-ctl_table core_table[] = {
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char id[IFNAMSIZ];
+ struct ctl_table tbl = {
+ .data = id,
+ .maxlen = IFNAMSIZ,
+ };
+ int ret;
+
+ qdisc_get_default(id, IFNAMSIZ);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = qdisc_set_default(id);
+ return ret;
+}
+#endif
+
+static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
- .ctl_name = NET_CORE_WMEM_MAX,
.procname = "wmem_max",
.data = &sysctl_wmem_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
- .ctl_name = NET_CORE_RMEM_MAX,
.procname = "rmem_max",
.data = &sysctl_rmem_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
- .ctl_name = NET_CORE_WMEM_DEFAULT,
.procname = "wmem_default",
.data = &sysctl_wmem_default,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
- .ctl_name = NET_CORE_RMEM_DEFAULT,
.procname = "rmem_default",
.data = &sysctl_rmem_default,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
- .ctl_name = NET_CORE_DEV_WEIGHT,
.procname = "dev_weight",
.data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_CORE_MAX_BACKLOG,
.procname = "netdev_max_backlog",
.data = &netdev_max_backlog,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_BPF_JIT
+ {
+ .procname = "bpf_jit_enable",
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "netdev_tstamp_prequeue",
+ .data = &netdev_tstamp_prequeue,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
},
{
- .ctl_name = NET_CORE_MSG_COST,
.procname = "message_cost",
- .data = &net_msg_cost,
+ .data = &net_ratelimit_state.interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = proc_dointvec_jiffies,
},
{
- .ctl_name = NET_CORE_MSG_BURST,
.procname = "message_burst",
- .data = &net_msg_burst,
+ .data = &net_ratelimit_state.burst,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_CORE_OPTMEM_MAX,
.procname = "optmem_max",
.data = &sysctl_optmem_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
},
-#ifdef CONFIG_NET_DIVERT
+#ifdef CONFIG_RPS
{
- .ctl_name = NET_CORE_DIVERT_VERSION,
- .procname = "divert_version",
- .data = (void *)sysctl_divert_version,
- .maxlen = 32,
- .mode = 0444,
- .proc_handler = &proc_dostring
+ .procname = "rps_sock_flow_entries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rps_sock_flow_sysctl
},
-#endif /* CONFIG_NET_DIVERT */
-#endif /* CONFIG_NET */
+#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
{
- .ctl_name = NET_CORE_SOMAXCONN,
- .procname = "somaxconn",
- .data = &sysctl_somaxconn,
+ .procname = "flow_limit_cpu_bitmap",
+ .mode = 0644,
+ .proc_handler = flow_limit_cpu_sysctl
+ },
+ {
+ .procname = "flow_limit_table_len",
+ .data = &netdev_flow_limit_table_len,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = flow_limit_table_len_sysctl
+ },
+#endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ {
+ .procname = "busy_poll",
+ .data = &sysctl_net_busy_poll,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "busy_read",
+ .data = &sysctl_net_busy_read,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+#ifdef CONFIG_NET_SCHED
+ {
+ .procname = "default_qdisc",
+ .mode = 0644,
+ .maxlen = IFNAMSIZ,
+ .proc_handler = set_default_qdisc
},
+#endif
+#endif /* CONFIG_NET */
{
- .ctl_name = NET_CORE_BUDGET,
.procname = "netdev_budget",
.data = &netdev_budget,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "warnings",
+ .data = &net_msg_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
},
- { .ctl_name = 0 }
+ { }
};
-#endif
+static struct ctl_table netns_core_table[] = {
+ {
+ .procname = "somaxconn",
+ .data = &init_net.core.sysctl_somaxconn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &ushort_max,
+ .proc_handler = proc_dointvec_minmax
+ },
+ { }
+};
+
+static __net_init int sysctl_core_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ net->core.sysctl_somaxconn = SOMAXCONN;
+
+ tbl = netns_core_table;
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+
+ tbl[0].data = &net->core.sysctl_somaxconn;
+
+ /* Don't export any sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ tbl[0].procname = NULL;
+ }
+ }
+
+ net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
+ if (net->core.sysctl_hdr == NULL)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (tbl != netns_core_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_core_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = net->core.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->core.sysctl_hdr);
+ BUG_ON(tbl == netns_core_table);
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_core_ops = {
+ .init = sysctl_core_net_init,
+ .exit = sysctl_core_net_exit,
+};
+
+static __init int sysctl_core_init(void)
+{
+ register_net_sysctl(&init_net, "net/core", net_core_table);
+ return register_pernet_subsys(&sysctl_core_ops);
+}
+
+fs_initcall(sysctl_core_init);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 00000000000..6521dfd8b7c
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,134 @@
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+
+static unsigned int classify(const struct sk_buff *skb)
+{
+ if (likely(skb->dev && skb->dev->phydev &&
+ skb->dev->phydev->drv))
+ return ptp_classify_raw(skb);
+ else
+ return PTP_CLASS_NONE;
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+ struct phy_device *phydev;
+ struct sk_buff *clone;
+ struct sock *sk = skb->sk;
+ unsigned int type;
+
+ if (!sk)
+ return;
+
+ type = classify(skb);
+
+ switch (type) {
+ case PTP_CLASS_V1_IPV4:
+ case PTP_CLASS_V1_IPV6:
+ case PTP_CLASS_V2_IPV4:
+ case PTP_CLASS_V2_IPV6:
+ case PTP_CLASS_V2_L2:
+ case PTP_CLASS_V2_VLAN:
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->txtstamp)) {
+ if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ return;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return;
+ }
+
+ clone->sk = sk;
+ phydev->drv->txtstamp(phydev, clone, type);
+ }
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err;
+
+ if (!hwtstamps) {
+ sock_put(sk);
+ kfree_skb(skb);
+ return;
+ }
+
+ *skb_hwtstamps(skb) = *hwtstamps;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ skb->sk = NULL;
+
+ err = sock_queue_err_skb(sk, skb);
+
+ sock_put(sk);
+ if (err)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+ struct phy_device *phydev;
+ unsigned int type;
+
+ if (skb_headroom(skb) < ETH_HLEN)
+ return false;
+ __skb_push(skb, ETH_HLEN);
+
+ type = classify(skb);
+
+ __skb_pull(skb, ETH_HLEN);
+
+ switch (type) {
+ case PTP_CLASS_V1_IPV4:
+ case PTP_CLASS_V1_IPV6:
+ case PTP_CLASS_V2_IPV4:
+ case PTP_CLASS_V2_IPV6:
+ case PTP_CLASS_V2_L2:
+ case PTP_CLASS_V2_VLAN:
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->rxtstamp))
+ return phydev->drv->rxtstamp(phydev, skb, type);
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 00000000000..8c3203c585b
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,77 @@
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/tso.h>
+
+/* Calculate expected number of TX descriptors */
+int tso_count_descs(struct sk_buff *skb)
+{
+ /* The Marvell Way */
+ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+EXPORT_SYMBOL(tso_count_descs);
+
+void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ iph = (struct iphdr *)(hdr + mac_hdr_len);
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
+ tcph->seq = htonl(tso->tcp_seq);
+ tso->ip_id++;
+
+ if (!is_last) {
+ /* Clear all special flags for not last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size;
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+void tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->next_frag_idx = 0;
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_start);
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 00000000000..1b5fefdb819
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/export.h>
+#include <net/tcp.h>
+#include <net/netdma.h>
+
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+EXPORT_SYMBOL(sysctl_tcp_dma_copybreak);
+
+/**
+ * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ * @skb - buffer to copy
+ * @offset - offset in the buffer to start copying from
+ * @iovec - io vector to copy to
+ * @len - amount of data to copy from buffer to iovec
+ * @pinned_list - locked iovec buffer data
+ *
+ * Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+ struct sk_buff *skb, int offset, struct iovec *to,
+ size_t len, struct dma_pinned_list *pinned_list)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ dma_cookie_t cookie = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+ skb->data + offset, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ struct page *page = skb_frag_page(frag);
+
+ if (copy > len)
+ copy = len;
+
+ cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+ frag->page_offset + offset - start, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_skb_copy_datagram_iovec(chan, frag_iter,
+ offset - start,
+ to, copy,
+ pinned_list);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+ start = end;
+ }
+
+end:
+ if (!len) {
+ skb->dma_cookie = cookie;
+ return cookie;
+ }
+
+fault:
+ return -EFAULT;
+}
diff --git a/net/core/utils.c b/net/core/utils.c
index ac1d1fcf867..eed34338736 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -3,7 +3,8 @@
*
* Authors:
* net_random Alan Cox
- * net_ratelimit Andy Kleen
+ * net_ratelimit Andi Kleen
+ * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
*
* Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
@@ -16,145 +17,34 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/random.h>
#include <linux/percpu.h>
#include <linux/init.h>
+#include <linux/ratelimit.h>
+
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
#include <asm/byteorder.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
-/*
- This is a maximally equidistributed combined Tausworthe generator
- based on code from GNU Scientific Library 1.5 (30 Jun 2004)
-
- x_n = (s1_n ^ s2_n ^ s3_n)
-
- s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19))
- s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25))
- s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11))
-
- The period of this generator is about 2^88.
-
- From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe
- Generators", Mathematics of Computation, 65, 213 (1996), 203--213.
-
- This is available on the net from L'Ecuyer's home page,
+int net_msg_warn __read_mostly = 1;
+EXPORT_SYMBOL(net_msg_warn);
- http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps
- ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps
-
- There is an erratum in the paper "Tables of Maximally
- Equidistributed Combined LFSR Generators", Mathematics of
- Computation, 68, 225 (1999), 261--269:
- http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps
-
- ... the k_j most significant bits of z_j must be non-
- zero, for each j. (Note: this restriction also applies to the
- computer code given in [4], but was mistakenly not mentioned in
- that paper.)
-
- This affects the seeding procedure by imposing the requirement
- s1 > 1, s2 > 7, s3 > 15.
-
-*/
-struct nrnd_state {
- u32 s1, s2, s3;
-};
-
-static DEFINE_PER_CPU(struct nrnd_state, net_rand_state);
-
-static u32 __net_random(struct nrnd_state *state)
-{
-#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
-
- state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12);
- state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4);
- state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17);
-
- return (state->s1 ^ state->s2 ^ state->s3);
-}
-
-static void __net_srandom(struct nrnd_state *state, unsigned long s)
-{
- if (s == 0)
- s = 1; /* default seed is 1 */
-
-#define LCG(n) (69069 * n)
- state->s1 = LCG(s);
- state->s2 = LCG(state->s1);
- state->s3 = LCG(state->s2);
-
- /* "warm it up" */
- __net_random(state);
- __net_random(state);
- __net_random(state);
- __net_random(state);
- __net_random(state);
- __net_random(state);
-}
-
-
-unsigned long net_random(void)
-{
- unsigned long r;
- struct nrnd_state *state = &get_cpu_var(net_rand_state);
- r = __net_random(state);
- put_cpu_var(state);
- return r;
-}
-
-
-void net_srandom(unsigned long entropy)
-{
- struct nrnd_state *state = &get_cpu_var(net_rand_state);
- __net_srandom(state, state->s1^entropy);
- put_cpu_var(state);
-}
-
-void __init net_random_init(void)
-{
- int i;
-
- for (i = 0; i < NR_CPUS; i++) {
- struct nrnd_state *state = &per_cpu(net_rand_state,i);
- __net_srandom(state, i+jiffies);
- }
-}
-
-static int net_random_reseed(void)
-{
- int i;
- unsigned long seed[NR_CPUS];
-
- get_random_bytes(seed, sizeof(seed));
- for (i = 0; i < NR_CPUS; i++) {
- struct nrnd_state *state = &per_cpu(net_rand_state,i);
- __net_srandom(state, seed[i]);
- }
- return 0;
-}
-late_initcall(net_random_reseed);
-
-int net_msg_cost = 5*HZ;
-int net_msg_burst = 10;
-
-/*
+DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
+/*
* All net warning printk()s should be guarded by this function.
- */
+ */
int net_ratelimit(void)
{
- return __printk_ratelimit(net_msg_cost, net_msg_burst);
+ return __ratelimit(&net_ratelimit_state);
}
-
-EXPORT_SYMBOL(net_random);
EXPORT_SYMBOL(net_ratelimit);
-EXPORT_SYMBOL(net_srandom);
/*
* Convert an ASCII string to binary IP.
@@ -169,14 +59,11 @@ __be32 in_aton(const char *str)
int i;
l = 0;
- for (i = 0; i < 4; i++)
- {
+ for (i = 0; i < 4; i++) {
l <<= 8;
- if (*str != '\0')
- {
+ if (*str != '\0') {
val = 0;
- while (*str != '\0' && *str != '.' && *str != '\n')
- {
+ while (*str != '\0' && *str != '.' && *str != '\n') {
val *= 10;
val += *str - '0';
str++;
@@ -186,7 +73,317 @@ __be32 in_aton(const char *str)
str++;
}
}
- return(htonl(l));
+ return htonl(l);
}
-
EXPORT_SYMBOL(in_aton);
+
+#define IN6PTON_XDIGIT 0x00010000
+#define IN6PTON_DIGIT 0x00020000
+#define IN6PTON_COLON_MASK 0x00700000
+#define IN6PTON_COLON_1 0x00100000 /* single : requested */
+#define IN6PTON_COLON_2 0x00200000 /* second : requested */
+#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */
+#define IN6PTON_DOT 0x00800000 /* . */
+#define IN6PTON_DELIM 0x10000000
+#define IN6PTON_NULL 0x20000000 /* first/tail */
+#define IN6PTON_UNKNOWN 0x40000000
+
+static inline int xdigit2bin(char c, int delim)
+{
+ int val;
+
+ if (c == delim || c == '\0')
+ return IN6PTON_DELIM;
+ if (c == ':')
+ return IN6PTON_COLON_MASK;
+ if (c == '.')
+ return IN6PTON_DOT;
+
+ val = hex_to_bin(c);
+ if (val >= 0)
+ return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
+ if (delim == -1)
+ return IN6PTON_DELIM;
+ return IN6PTON_UNKNOWN;
+}
+
+/**
+ * in4_pton - convert an IPv4 address from literal to binary representation
+ * @src: the start of the IPv4 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[4] array) representation of the IPv4 address
+ * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
+int in4_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{
+ const char *s;
+ u8 *d;
+ u8 dbuf[4];
+ int ret = 0;
+ int i;
+ int w = 0;
+
+ if (srclen < 0)
+ srclen = strlen(src);
+ s = src;
+ d = dbuf;
+ i = 0;
+ while(1) {
+ int c;
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
+ goto out;
+ }
+ if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ if (w == 0)
+ goto out;
+ *d++ = w & 0xff;
+ w = 0;
+ i++;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ if (i != 4)
+ goto out;
+ break;
+ }
+ goto cont;
+ }
+ w = (w * 10) + c;
+ if ((w & 0xffff) > 255) {
+ goto out;
+ }
+cont:
+ if (i >= 4)
+ goto out;
+ s++;
+ srclen--;
+ }
+ ret = 1;
+ memcpy(dst, dbuf, sizeof(dbuf));
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+EXPORT_SYMBOL(in4_pton);
+
+/**
+ * in6_pton - convert an IPv6 address from literal to binary representation
+ * @src: the start of the IPv6 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[16] array) representation of the IPv6 address
+ * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
+int in6_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{
+ const char *s, *tok = NULL;
+ u8 *d, *dc = NULL;
+ u8 dbuf[16];
+ int ret = 0;
+ int i;
+ int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;
+ int w = 0;
+
+ memset(dbuf, 0, sizeof(dbuf));
+
+ s = src;
+ d = dbuf;
+ if (srclen < 0)
+ srclen = strlen(src);
+
+ while (1) {
+ int c;
+
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & state))
+ goto out;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ /* process one 16-bit word */
+ if (!(state & IN6PTON_NULL)) {
+ *d++ = (w >> 8) & 0xff;
+ *d++ = w & 0xff;
+ }
+ w = 0;
+ if (c & IN6PTON_DELIM) {
+ /* We've processed last word */
+ break;
+ }
+ /*
+ * COLON_1 => XDIGIT
+ * COLON_2 => XDIGIT|DELIM
+ * COLON_1_2 => COLON_2
+ */
+ switch (state & IN6PTON_COLON_MASK) {
+ case IN6PTON_COLON_2:
+ dc = d;
+ state = IN6PTON_XDIGIT | IN6PTON_DELIM;
+ if (dc - dbuf >= sizeof(dbuf))
+ state |= IN6PTON_NULL;
+ break;
+ case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
+ state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
+ break;
+ case IN6PTON_COLON_1:
+ state = IN6PTON_XDIGIT;
+ break;
+ case IN6PTON_COLON_1_2:
+ state = IN6PTON_COLON_2;
+ break;
+ default:
+ state = 0;
+ }
+ tok = s + 1;
+ goto cont;
+ }
+
+ if (c & IN6PTON_DOT) {
+ ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
+ if (ret > 0) {
+ d += 4;
+ break;
+ }
+ goto out;
+ }
+
+ w = (w << 4) | (0xff & c);
+ state = IN6PTON_COLON_1 | IN6PTON_DELIM;
+ if (!(w & 0xf000)) {
+ state |= IN6PTON_XDIGIT;
+ }
+ if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
+ state |= IN6PTON_COLON_1_2;
+ state &= ~IN6PTON_DELIM;
+ }
+ if (d + 2 >= dbuf + sizeof(dbuf)) {
+ state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
+ }
+cont:
+ if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
+ d + 4 == dbuf + sizeof(dbuf)) {
+ state |= IN6PTON_DOT;
+ }
+ if (d >= dbuf + sizeof(dbuf)) {
+ state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
+ }
+ s++;
+ srclen--;
+ }
+
+ i = 15; d--;
+
+ if (dc) {
+ while(d >= dc)
+ dst[i--] = *d--;
+ while(i >= dc - dbuf)
+ dst[i--] = 0;
+ while(i >= 0)
+ dst[i--] = *d--;
+ } else
+ memcpy(dst, dbuf, sizeof(dbuf));
+
+ ret = 1;
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+EXPORT_SYMBOL(in6_pton);
+
+void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+ __be32 from, __be32 to, int pseudohdr)
+{
+ __be32 diff[] = { ~from, to };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial(diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace4);
+
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ int pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial(diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
+struct __net_random_once_work {
+ struct work_struct work;
+ struct static_key *key;
+};
+
+static void __net_random_once_deferred(struct work_struct *w)
+{
+ struct __net_random_once_work *work =
+ container_of(w, struct __net_random_once_work, work);
+ BUG_ON(!static_key_enabled(work->key));
+ static_key_slow_dec(work->key);
+ kfree(work);
+}
+
+static void __net_random_once_disable_jump(struct static_key *key)
+{
+ struct __net_random_once_work *w;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return;
+
+ INIT_WORK(&w->work, __net_random_once_deferred);
+ w->key = key;
+ schedule_work(&w->work);
+}
+
+bool __net_get_random_once(void *buf, int nbytes, bool *done,
+ struct static_key *once_key)
+{
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&lock, flags);
+ if (*done) {
+ spin_unlock_irqrestore(&lock, flags);
+ return false;
+ }
+
+ get_random_bytes(buf, nbytes);
+ *done = true;
+ spin_unlock_irqrestore(&lock, flags);
+
+ __net_random_once_disable_jump(once_key);
+
+ return true;
+}
+EXPORT_SYMBOL(__net_get_random_once);
diff --git a/net/core/wireless.c b/net/core/wireless.c
deleted file mode 100644
index 2add7ed609e..00000000000
--- a/net/core/wireless.c
+++ /dev/null
@@ -1,1542 +0,0 @@
-/*
- * This file implement the Wireless Extensions APIs.
- *
- * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
- *
- * (As all part of the Linux kernel, this file is GPL)
- */
-
-/************************** DOCUMENTATION **************************/
-/*
- * API definition :
- * --------------
- * See <linux/wireless.h> for details of the APIs and the rest.
- *
- * History :
- * -------
- *
- * v1 - 5.12.01 - Jean II
- * o Created this file.
- *
- * v2 - 13.12.01 - Jean II
- * o Move /proc/net/wireless stuff from net/core/dev.c to here
- * o Make Wireless Extension IOCTLs go through here
- * o Added iw_handler handling ;-)
- * o Added standard ioctl description
- * o Initial dumb commit strategy based on orinoco.c
- *
- * v3 - 19.12.01 - Jean II
- * o Make sure we don't go out of standard_ioctl[] in ioctl_standard_call
- * o Add event dispatcher function
- * o Add event description
- * o Propagate events as rtnetlink IFLA_WIRELESS option
- * o Generate event on selected SET requests
- *
- * v4 - 18.04.02 - Jean II
- * o Fix stupid off by one in iw_ioctl_description : IW_ESSID_MAX_SIZE + 1
- *
- * v5 - 21.06.02 - Jean II
- * o Add IW_PRIV_TYPE_ADDR in priv_type_size (+cleanup)
- * o Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes
- * o Add IWEVCUSTOM for driver specific event/scanning token
- * o Turn on WE_STRICT_WRITE by default + kernel warning
- * o Fix WE_STRICT_WRITE in ioctl_export_private() (32 => iw_num)
- * o Fix off-by-one in test (extra_size <= IFNAMSIZ)
- *
- * v6 - 9.01.03 - Jean II
- * o Add common spy support : iw_handler_set_spy(), wireless_spy_update()
- * o Add enhanced spy support : iw_handler_set_thrspy() and event.
- * o Add WIRELESS_EXT version display in /proc/net/wireless
- *
- * v6 - 18.06.04 - Jean II
- * o Change get_spydata() method for added safety
- * o Remove spy #ifdef, they are always on -> cleaner code
- * o Allow any size GET request if user specifies length > max
- * and if request has IW_DESCR_FLAG_NOMAX flag or is SIOCGIWPRIV
- * o Start migrating get_wireless_stats to struct iw_handler_def
- * o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus
- * Based on patch from Pavel Roskin <proski@gnu.org> :
- * o Fix kernel data leak to user space in private handler handling
- *
- * v7 - 18.3.05 - Jean II
- * o Remove (struct iw_point *)->pointer from events and streams
- * o Remove spy_offset from struct iw_handler_def
- * o Start deprecating dev->get_wireless_stats, output a warning
- * o If IW_QUAL_DBM is set, show dBm values in /proc/net/wireless
- * o Don't loose INVALID/DBM flags when clearing UPDATED flags (iwstats)
- */
-
-/***************************** INCLUDES *****************************/
-
-#include <linux/config.h> /* Not needed ??? */
-#include <linux/module.h>
-#include <linux/types.h> /* off_t */
-#include <linux/netdevice.h> /* struct ifreq, dev_get_by_name() */
-#include <linux/proc_fs.h>
-#include <linux/rtnetlink.h> /* rtnetlink stuff */
-#include <linux/seq_file.h>
-#include <linux/init.h> /* for __init */
-#include <linux/if_arp.h> /* ARPHRD_ETHER */
-#include <linux/etherdevice.h> /* compare_ether_addr */
-
-#include <linux/wireless.h> /* Pretty obvious */
-#include <net/iw_handler.h> /* New driver API */
-
-#include <asm/uaccess.h> /* copy_to_user() */
-
-/**************************** CONSTANTS ****************************/
-
-/* Debugging stuff */
-#undef WE_IOCTL_DEBUG /* Debug IOCTL API */
-#undef WE_EVENT_DEBUG /* Debug Event dispatcher */
-#undef WE_SPY_DEBUG /* Debug enhanced spy support */
-
-/* Options */
-#define WE_EVENT_NETLINK /* Propagate events using rtnetlink */
-#define WE_SET_EVENT /* Generate an event on some set commands */
-
-/************************* GLOBAL VARIABLES *************************/
-/*
- * You should not use global variables, because of re-entrancy.
- * On our case, it's only const, so it's OK...
- */
-/*
- * Meta-data about all the standard Wireless Extension request we
- * know about.
- */
-static const struct iw_ioctl_description standard_ioctl[] = {
- [SIOCSIWCOMMIT - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWNAME - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_CHAR,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWNWID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWNWID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWFREQ - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_FREQ,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWFREQ - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_FREQ,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWMODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_UINT,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWMODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_UINT,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWSENS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWSENS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWRANGE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWRANGE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_range),
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWPRIV - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWPRIV - SIOCIWFIRST] = { /* (handled directly by us) */
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCSIWSTATS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_NULL,
- },
- [SIOCGIWSTATS - SIOCIWFIRST] = { /* (handled directly by us) */
- .header_type = IW_HEADER_TYPE_NULL,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct sockaddr),
- .max_tokens = IW_MAX_SPY,
- },
- [SIOCGIWSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct sockaddr) +
- sizeof(struct iw_quality),
- .max_tokens = IW_MAX_SPY,
- },
- [SIOCSIWTHRSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct iw_thrspy),
- .min_tokens = 1,
- .max_tokens = 1,
- },
- [SIOCGIWTHRSPY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct iw_thrspy),
- .min_tokens = 1,
- .max_tokens = 1,
- },
- [SIOCSIWAP - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [SIOCGIWAP - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWMLME - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_mlme),
- .max_tokens = sizeof(struct iw_mlme),
- },
- [SIOCGIWAPLIST - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = sizeof(struct sockaddr) +
- sizeof(struct iw_quality),
- .max_tokens = IW_MAX_AP,
- .flags = IW_DESCR_FLAG_NOMAX,
- },
- [SIOCSIWSCAN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = 0,
- .max_tokens = sizeof(struct iw_scan_req),
- },
- [SIOCGIWSCAN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_SCAN_MAX_DATA,
- .flags = IW_DESCR_FLAG_NOMAX,
- },
- [SIOCSIWESSID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE + 1,
- .flags = IW_DESCR_FLAG_EVENT,
- },
- [SIOCGIWESSID - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE + 1,
- .flags = IW_DESCR_FLAG_DUMP,
- },
- [SIOCSIWNICKN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE + 1,
- },
- [SIOCGIWNICKN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ESSID_MAX_SIZE + 1,
- },
- [SIOCSIWRATE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWRATE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWRTS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWRTS - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWFRAG - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWFRAG - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWTXPOW - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWTXPOW - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWRETRY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWRETRY - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWENCODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ENCODING_TOKEN_MAX,
- .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT,
- },
- [SIOCGIWENCODE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_ENCODING_TOKEN_MAX,
- .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT,
- },
- [SIOCSIWPOWER - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWPOWER - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWGENIE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [SIOCGIWGENIE - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [SIOCSIWAUTH - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCGIWAUTH - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
- },
- [SIOCSIWENCODEEXT - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_encode_ext),
- .max_tokens = sizeof(struct iw_encode_ext) +
- IW_ENCODING_TOKEN_MAX,
- },
- [SIOCGIWENCODEEXT - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_encode_ext),
- .max_tokens = sizeof(struct iw_encode_ext) +
- IW_ENCODING_TOKEN_MAX,
- },
- [SIOCSIWPMKSA - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .min_tokens = sizeof(struct iw_pmksa),
- .max_tokens = sizeof(struct iw_pmksa),
- },
-};
-static const int standard_ioctl_num = (sizeof(standard_ioctl) /
- sizeof(struct iw_ioctl_description));
-
-/*
- * Meta-data about all the additional standard Wireless Extension events
- * we know about.
- */
-static const struct iw_ioctl_description standard_event[] = {
- [IWEVTXDROP - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [IWEVQUAL - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_QUAL,
- },
- [IWEVCUSTOM - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_CUSTOM_MAX,
- },
- [IWEVREGISTERED - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [IWEVEXPIRED - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_ADDR,
- },
- [IWEVGENIE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [IWEVMICHAELMICFAILURE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_michaelmicfailure),
- },
- [IWEVASSOCREQIE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [IWEVASSOCRESPIE - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = IW_GENERIC_IE_MAX,
- },
- [IWEVPMKIDCAND - IWEVFIRST] = {
- .header_type = IW_HEADER_TYPE_POINT,
- .token_size = 1,
- .max_tokens = sizeof(struct iw_pmkid_cand),
- },
-};
-static const int standard_event_num = (sizeof(standard_event) /
- sizeof(struct iw_ioctl_description));
-
-/* Size (in bytes) of the various private data types */
-static const char iw_priv_type_size[] = {
- 0, /* IW_PRIV_TYPE_NONE */
- 1, /* IW_PRIV_TYPE_BYTE */
- 1, /* IW_PRIV_TYPE_CHAR */
- 0, /* Not defined */
- sizeof(__u32), /* IW_PRIV_TYPE_INT */
- sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */
- sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */
- 0, /* Not defined */
-};
-
-/* Size (in bytes) of various events */
-static const int event_type_size[] = {
- IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */
- 0,
- IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */
- 0,
- IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */
- IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */
- IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */
- 0,
- IW_EV_POINT_LEN, /* Without variable payload */
- IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */
- IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */
-};
-
-/************************ COMMON SUBROUTINES ************************/
-/*
- * Stuff that may be used in various place or doesn't fit in one
- * of the section below.
- */
-
-/* ---------------------------------------------------------------- */
-/*
- * Return the driver handler associated with a specific Wireless Extension.
- * Called from various place, so make sure it remains efficient.
- */
-static inline iw_handler get_handler(struct net_device *dev,
- unsigned int cmd)
-{
- /* Don't "optimise" the following variable, it will crash */
- unsigned int index; /* *MUST* be unsigned */
-
- /* Check if we have some wireless handlers defined */
- if(dev->wireless_handlers == NULL)
- return NULL;
-
- /* Try as a standard command */
- index = cmd - SIOCIWFIRST;
- if(index < dev->wireless_handlers->num_standard)
- return dev->wireless_handlers->standard[index];
-
- /* Try as a private command */
- index = cmd - SIOCIWFIRSTPRIV;
- if(index < dev->wireless_handlers->num_private)
- return dev->wireless_handlers->private[index];
-
- /* Not found */
- return NULL;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Get statistics out of the driver
- */
-static inline struct iw_statistics *get_wireless_stats(struct net_device *dev)
-{
- /* New location */
- if((dev->wireless_handlers != NULL) &&
- (dev->wireless_handlers->get_wireless_stats != NULL))
- return dev->wireless_handlers->get_wireless_stats(dev);
-
- /* Old location, field to be removed in next WE */
- if(dev->get_wireless_stats) {
- static int printed_message;
-
- if (!printed_message++)
- printk(KERN_DEBUG "%s (WE) : Driver using old /proc/net/wireless support, please fix driver !\n",
- dev->name);
-
- return dev->get_wireless_stats(dev);
- }
-
- /* Not found */
- return (struct iw_statistics *) NULL;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Call the commit handler in the driver
- * (if exist and if conditions are right)
- *
- * Note : our current commit strategy is currently pretty dumb,
- * but we will be able to improve on that...
- * The goal is to try to agreagate as many changes as possible
- * before doing the commit. Drivers that will define a commit handler
- * are usually those that need a reset after changing parameters, so
- * we want to minimise the number of reset.
- * A cool idea is to use a timer : at each "set" command, we re-set the
- * timer, when the timer eventually fires, we call the driver.
- * Hopefully, more on that later.
- *
- * Also, I'm waiting to see how many people will complain about the
- * netif_running(dev) test. I'm open on that one...
- * Hopefully, the driver will remember to do a commit in "open()" ;-)
- */
-static inline int call_commit_handler(struct net_device * dev)
-{
- if((netif_running(dev)) &&
- (dev->wireless_handlers->standard[0] != NULL)) {
- /* Call the commit handler on the driver */
- return dev->wireless_handlers->standard[0](dev, NULL,
- NULL, NULL);
- } else
- return 0; /* Command completed successfully */
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Calculate size of private arguments
- */
-static inline int get_priv_size(__u16 args)
-{
- int num = args & IW_PRIV_SIZE_MASK;
- int type = (args & IW_PRIV_TYPE_MASK) >> 12;
-
- return num * iw_priv_type_size[type];
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Re-calculate the size of private arguments
- */
-static inline int adjust_priv_size(__u16 args,
- union iwreq_data * wrqu)
-{
- int num = wrqu->data.length;
- int max = args & IW_PRIV_SIZE_MASK;
- int type = (args & IW_PRIV_TYPE_MASK) >> 12;
-
- /* Make sure the driver doesn't goof up */
- if (max < num)
- num = max;
-
- return num * iw_priv_type_size[type];
-}
-
-
-/******************** /proc/net/wireless SUPPORT ********************/
-/*
- * The /proc/net/wireless file is a human readable user-space interface
- * exporting various wireless specific statistics from the wireless devices.
- * This is the most popular part of the Wireless Extensions ;-)
- *
- * This interface is a pure clone of /proc/net/dev (in net/core/dev.c).
- * The content of the file is basically the content of "struct iw_statistics".
- */
-
-#ifdef CONFIG_PROC_FS
-
-/* ---------------------------------------------------------------- */
-/*
- * Print one entry (line) of /proc/net/wireless
- */
-static __inline__ void wireless_seq_printf_stats(struct seq_file *seq,
- struct net_device *dev)
-{
- /* Get stats from the driver */
- struct iw_statistics *stats = get_wireless_stats(dev);
-
- if (stats) {
- seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d "
- "%6d %6d %6d\n",
- dev->name, stats->status, stats->qual.qual,
- stats->qual.updated & IW_QUAL_QUAL_UPDATED
- ? '.' : ' ',
- ((__s32) stats->qual.level) -
- ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
- stats->qual.updated & IW_QUAL_LEVEL_UPDATED
- ? '.' : ' ',
- ((__s32) stats->qual.noise) -
- ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
- stats->qual.updated & IW_QUAL_NOISE_UPDATED
- ? '.' : ' ',
- stats->discard.nwid, stats->discard.code,
- stats->discard.fragment, stats->discard.retries,
- stats->discard.misc, stats->miss.beacon);
- stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
- }
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Print info for /proc/net/wireless (print all entries)
- */
-static int wireless_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN)
- seq_printf(seq, "Inter-| sta-| Quality | Discarded "
- "packets | Missed | WE\n"
- " face | tus | link level noise | nwid "
- "crypt frag retry misc | beacon | %d\n",
- WIRELESS_EXT);
- else
- wireless_seq_printf_stats(seq, v);
- return 0;
-}
-
-static struct seq_operations wireless_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = wireless_seq_show,
-};
-
-static int wireless_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &wireless_seq_ops);
-}
-
-static struct file_operations wireless_seq_fops = {
- .owner = THIS_MODULE,
- .open = wireless_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-int __init wireless_proc_init(void)
-{
- /* Create /proc/net/wireless entry */
- if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops))
- return -ENOMEM;
-
- return 0;
-}
-#endif /* CONFIG_PROC_FS */
-
-/************************** IOCTL SUPPORT **************************/
-/*
- * The original user space API to configure all those Wireless Extensions
- * is through IOCTLs.
- * In there, we check if we need to call the new driver API (iw_handler)
- * or just call the driver ioctl handler.
- */
-
-/* ---------------------------------------------------------------- */
-/*
- * Allow programatic access to /proc/net/wireless even if /proc
- * doesn't exist... Also more efficient...
- */
-static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr)
-{
- /* Get stats from the driver */
- struct iw_statistics *stats;
-
- stats = get_wireless_stats(dev);
- if (stats != (struct iw_statistics *) NULL) {
- struct iwreq * wrq = (struct iwreq *)ifr;
-
- /* Copy statistics to the user buffer */
- if(copy_to_user(wrq->u.data.pointer, stats,
- sizeof(struct iw_statistics)))
- return -EFAULT;
-
- /* Check if we need to clear the updated flag */
- if(wrq->u.data.flags != 0)
- stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
- return 0;
- } else
- return -EOPNOTSUPP;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Export the driver private handler definition
- * They will be picked up by tools like iwpriv...
- */
-static inline int ioctl_export_private(struct net_device * dev,
- struct ifreq * ifr)
-{
- struct iwreq * iwr = (struct iwreq *) ifr;
-
- /* Check if the driver has something to export */
- if((dev->wireless_handlers->num_private_args == 0) ||
- (dev->wireless_handlers->private_args == NULL))
- return -EOPNOTSUPP;
-
- /* Check NULL pointer */
- if(iwr->u.data.pointer == NULL)
- return -EFAULT;
-
- /* Check if there is enough buffer up there */
- if(iwr->u.data.length < dev->wireless_handlers->num_private_args) {
- /* User space can't know in advance how large the buffer
- * needs to be. Give it a hint, so that we can support
- * any size buffer we want somewhat efficiently... */
- iwr->u.data.length = dev->wireless_handlers->num_private_args;
- return -E2BIG;
- }
-
- /* Set the number of available ioctls. */
- iwr->u.data.length = dev->wireless_handlers->num_private_args;
-
- /* Copy structure to the user buffer. */
- if (copy_to_user(iwr->u.data.pointer,
- dev->wireless_handlers->private_args,
- sizeof(struct iw_priv_args) * iwr->u.data.length))
- return -EFAULT;
-
- return 0;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a standard Wireless Extension handler.
- * We do various checks and also take care of moving data between
- * user space and kernel space.
- */
-static inline int ioctl_standard_call(struct net_device * dev,
- struct ifreq * ifr,
- unsigned int cmd,
- iw_handler handler)
-{
- struct iwreq * iwr = (struct iwreq *) ifr;
- const struct iw_ioctl_description * descr;
- struct iw_request_info info;
- int ret = -EINVAL;
-
- /* Get the description of the IOCTL */
- if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
- return -EOPNOTSUPP;
- descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n",
- ifr->ifr_name, cmd);
- printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
-#endif /* WE_IOCTL_DEBUG */
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have a pointer to user space data or not */
- if(descr->header_type != IW_HEADER_TYPE_POINT) {
-
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, &(iwr->u), NULL);
-
-#ifdef WE_SET_EVENT
- /* Generate an event to notify listeners of the change */
- if((descr->flags & IW_DESCR_FLAG_EVENT) &&
- ((ret == 0) || (ret == -EIWCOMMIT)))
- wireless_send_event(dev, cmd, &(iwr->u), NULL);
-#endif /* WE_SET_EVENT */
- } else {
- char * extra;
- int extra_size;
- int user_length = 0;
- int err;
-
- /* Calculate space needed by arguments. Always allocate
- * for max space. Easier, and won't last long... */
- extra_size = descr->max_tokens * descr->token_size;
-
- /* Check what user space is giving us */
- if(IW_IS_SET(cmd)) {
- /* Check NULL pointer */
- if((iwr->u.data.pointer == NULL) &&
- (iwr->u.data.length != 0))
- return -EFAULT;
- /* Check if number of token fits within bounds */
- if(iwr->u.data.length > descr->max_tokens)
- return -E2BIG;
- if(iwr->u.data.length < descr->min_tokens)
- return -EINVAL;
- } else {
- /* Check NULL pointer */
- if(iwr->u.data.pointer == NULL)
- return -EFAULT;
- /* Save user space buffer size for checking */
- user_length = iwr->u.data.length;
-
- /* Don't check if user_length > max to allow forward
- * compatibility. The test user_length < min is
- * implied by the test at the end. */
-
- /* Support for very large requests */
- if((descr->flags & IW_DESCR_FLAG_NOMAX) &&
- (user_length > descr->max_tokens)) {
- /* Allow userspace to GET more than max so
- * we can support any size GET requests.
- * There is still a limit : -ENOMEM. */
- extra_size = user_length * descr->token_size;
- /* Note : user_length is originally a __u16,
- * and token_size is controlled by us,
- * so extra_size won't get negative and
- * won't overflow... */
- }
- }
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n",
- dev->name, extra_size);
-#endif /* WE_IOCTL_DEBUG */
-
- /* Create the kernel buffer */
- extra = kmalloc(extra_size, GFP_KERNEL);
- if (extra == NULL) {
- return -ENOMEM;
- }
-
- /* If it is a SET, get all the extra data in here */
- if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
- err = copy_from_user(extra, iwr->u.data.pointer,
- iwr->u.data.length *
- descr->token_size);
- if (err) {
- kfree(extra);
- return -EFAULT;
- }
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Got %d bytes\n",
- dev->name,
- iwr->u.data.length * descr->token_size);
-#endif /* WE_IOCTL_DEBUG */
- }
-
- /* Call the handler */
- ret = handler(dev, &info, &(iwr->u), extra);
-
- /* If we have something to return to the user */
- if (!ret && IW_IS_GET(cmd)) {
- /* Check if there is enough buffer up there */
- if(user_length < iwr->u.data.length) {
- kfree(extra);
- return -E2BIG;
- }
-
- err = copy_to_user(iwr->u.data.pointer, extra,
- iwr->u.data.length *
- descr->token_size);
- if (err)
- ret = -EFAULT;
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n",
- dev->name,
- iwr->u.data.length * descr->token_size);
-#endif /* WE_IOCTL_DEBUG */
- }
-
-#ifdef WE_SET_EVENT
- /* Generate an event to notify listeners of the change */
- if((descr->flags & IW_DESCR_FLAG_EVENT) &&
- ((ret == 0) || (ret == -EIWCOMMIT))) {
- if(descr->flags & IW_DESCR_FLAG_RESTRICT)
- /* If the event is restricted, don't
- * export the payload */
- wireless_send_event(dev, cmd, &(iwr->u), NULL);
- else
- wireless_send_event(dev, cmd, &(iwr->u),
- extra);
- }
-#endif /* WE_SET_EVENT */
-
- /* Cleanup - I told you it wasn't that long ;-) */
- kfree(extra);
- }
-
- /* Call commit handler if needed and defined */
- if(ret == -EIWCOMMIT)
- ret = call_commit_handler(dev);
-
- /* Here, we will generate the appropriate event if needed */
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Wrapper to call a private Wireless Extension handler.
- * We do various checks and also take care of moving data between
- * user space and kernel space.
- * It's not as nice and slimline as the standard wrapper. The cause
- * is struct iw_priv_args, which was not really designed for the
- * job we are going here.
- *
- * IMPORTANT : This function prevent to set and get data on the same
- * IOCTL and enforce the SET/GET convention. Not doing it would be
- * far too hairy...
- * If you need to set and get data at the same time, please don't use
- * a iw_handler but process it in your ioctl handler (i.e. use the
- * old driver API).
- */
-static inline int ioctl_private_call(struct net_device * dev,
- struct ifreq * ifr,
- unsigned int cmd,
- iw_handler handler)
-{
- struct iwreq * iwr = (struct iwreq *) ifr;
- const struct iw_priv_args * descr = NULL;
- struct iw_request_info info;
- int extra_size = 0;
- int i;
- int ret = -EINVAL;
-
- /* Get the description of the IOCTL */
- for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
- if(cmd == dev->wireless_handlers->private_args[i].cmd) {
- descr = &(dev->wireless_handlers->private_args[i]);
- break;
- }
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n",
- ifr->ifr_name, cmd);
- if(descr) {
- printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n",
- dev->name, descr->name,
- descr->set_args, descr->get_args);
- }
-#endif /* WE_IOCTL_DEBUG */
-
- /* Compute the size of the set/get arguments */
- if(descr != NULL) {
- if(IW_IS_SET(cmd)) {
- int offset = 0; /* For sub-ioctls */
- /* Check for sub-ioctl handler */
- if(descr->name[0] == '\0')
- /* Reserve one int for sub-ioctl index */
- offset = sizeof(__u32);
-
- /* Size of set arguments */
- extra_size = get_priv_size(descr->set_args);
-
- /* Does it fits in iwr ? */
- if((descr->set_args & IW_PRIV_SIZE_FIXED) &&
- ((extra_size + offset) <= IFNAMSIZ))
- extra_size = 0;
- } else {
- /* Size of get arguments */
- extra_size = get_priv_size(descr->get_args);
-
- /* Does it fits in iwr ? */
- if((descr->get_args & IW_PRIV_SIZE_FIXED) &&
- (extra_size <= IFNAMSIZ))
- extra_size = 0;
- }
- }
-
- /* Prepare the call */
- info.cmd = cmd;
- info.flags = 0;
-
- /* Check if we have a pointer to user space data or not. */
- if(extra_size == 0) {
- /* No extra arguments. Trivial to handle */
- ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u));
- } else {
- char * extra;
- int err;
-
- /* Check what user space is giving us */
- if(IW_IS_SET(cmd)) {
- /* Check NULL pointer */
- if((iwr->u.data.pointer == NULL) &&
- (iwr->u.data.length != 0))
- return -EFAULT;
-
- /* Does it fits within bounds ? */
- if(iwr->u.data.length > (descr->set_args &
- IW_PRIV_SIZE_MASK))
- return -E2BIG;
- } else {
- /* Check NULL pointer */
- if(iwr->u.data.pointer == NULL)
- return -EFAULT;
- }
-
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n",
- dev->name, extra_size);
-#endif /* WE_IOCTL_DEBUG */
-
- /* Always allocate for max space. Easier, and won't last
- * long... */
- extra = kmalloc(extra_size, GFP_KERNEL);
- if (extra == NULL) {
- return -ENOMEM;
- }
-
- /* If it is a SET, get all the extra data in here */
- if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
- err = copy_from_user(extra, iwr->u.data.pointer,
- extra_size);
- if (err) {
- kfree(extra);
- return -EFAULT;
- }
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Got %d elem\n",
- dev->name, iwr->u.data.length);
-#endif /* WE_IOCTL_DEBUG */
- }
-
- /* Call the handler */
- ret = handler(dev, &info, &(iwr->u), extra);
-
- /* If we have something to return to the user */
- if (!ret && IW_IS_GET(cmd)) {
-
- /* Adjust for the actual length if it's variable,
- * avoid leaking kernel bits outside. */
- if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) {
- extra_size = adjust_priv_size(descr->get_args,
- &(iwr->u));
- }
-
- err = copy_to_user(iwr->u.data.pointer, extra,
- extra_size);
- if (err)
- ret = -EFAULT;
-#ifdef WE_IOCTL_DEBUG
- printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n",
- dev->name, iwr->u.data.length);
-#endif /* WE_IOCTL_DEBUG */
- }
-
- /* Cleanup - I told you it wasn't that long ;-) */
- kfree(extra);
- }
-
-
- /* Call commit handler if needed and defined */
- if(ret == -EIWCOMMIT)
- ret = call_commit_handler(dev);
-
- return ret;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Main IOCTl dispatcher. Called from the main networking code
- * (dev_ioctl() in net/core/dev.c).
- * Check the type of IOCTL and call the appropriate wrapper...
- */
-int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd)
-{
- struct net_device *dev;
- iw_handler handler;
-
- /* Permissions are already checked in dev_ioctl() before calling us.
- * The copy_to/from_user() of ifr is also dealt with in there */
-
- /* Make sure the device exist */
- if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
- return -ENODEV;
-
- /* A bunch of special cases, then the generic case...
- * Note that 'cmd' is already filtered in dev_ioctl() with
- * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */
- switch(cmd)
- {
- case SIOCGIWSTATS:
- /* Get Wireless Stats */
- return dev_iwstats(dev, ifr);
-
- case SIOCGIWPRIV:
- /* Check if we have some wireless handlers defined */
- if(dev->wireless_handlers != NULL) {
- /* We export to user space the definition of
- * the private handler ourselves */
- return ioctl_export_private(dev, ifr);
- }
- // ## Fall-through for old API ##
- default:
- /* Generic IOCTL */
- /* Basic check */
- if (!netif_device_present(dev))
- return -ENODEV;
- /* New driver API : try to find the handler */
- handler = get_handler(dev, cmd);
- if(handler != NULL) {
- /* Standard and private are not the same */
- if(cmd < SIOCIWFIRSTPRIV)
- return ioctl_standard_call(dev,
- ifr,
- cmd,
- handler);
- else
- return ioctl_private_call(dev,
- ifr,
- cmd,
- handler);
- }
- /* Old driver API : call driver ioctl handler */
- if (dev->do_ioctl) {
- return dev->do_ioctl(dev, ifr, cmd);
- }
- return -EOPNOTSUPP;
- }
- /* Not reached */
- return -EINVAL;
-}
-
-/************************* EVENT PROCESSING *************************/
-/*
- * Process events generated by the wireless layer or the driver.
- * Most often, the event will be propagated through rtnetlink
- */
-
-#ifdef WE_EVENT_NETLINK
-/* "rtnl" is defined in net/core/rtnetlink.c, but we need it here.
- * It is declared in <linux/rtnetlink.h> */
-
-/* ---------------------------------------------------------------- */
-/*
- * Fill a rtnetlink message with our event data.
- * Note that we propage only the specified event and don't dump the
- * current wireless config. Dumping the wireless config is far too
- * expensive (for each parameter, the driver need to query the hardware).
- */
-static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
- struct net_device * dev,
- int type,
- char * event,
- int event_len)
-{
- struct ifinfomsg *r;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
- r = NLMSG_DATA(nlh);
- r->ifi_family = AF_UNSPEC;
- r->__ifi_pad = 0;
- r->ifi_type = dev->type;
- r->ifi_index = dev->ifindex;
- r->ifi_flags = dev->flags;
- r->ifi_change = 0; /* Wireless changes don't affect those flags */
-
- /* Add the wireless events in the netlink packet */
- RTA_PUT(skb, IFLA_WIRELESS,
- event_len, event);
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Create and broadcast and send it on the standard rtnetlink socket
- * This is a pure clone rtmsg_ifinfo() in net/core/rtnetlink.c
- * Andrzej Krzysztofowicz mandated that I used a IFLA_XXX field
- * within a RTM_NEWLINK event.
- */
-static inline void rtmsg_iwinfo(struct net_device * dev,
- char * event,
- int event_len)
-{
- struct sk_buff *skb;
- int size = NLMSG_GOODSIZE;
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- return;
-
- if (rtnetlink_fill_iwinfo(skb, dev, RTM_NEWLINK,
- event, event_len) < 0) {
- kfree_skb(skb);
- return;
- }
- NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
- netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
-}
-#endif /* WE_EVENT_NETLINK */
-
-/* ---------------------------------------------------------------- */
-/*
- * Main event dispatcher. Called from other parts and drivers.
- * Send the event on the appropriate channels.
- * May be called from interrupt context.
- */
-void wireless_send_event(struct net_device * dev,
- unsigned int cmd,
- union iwreq_data * wrqu,
- char * extra)
-{
- const struct iw_ioctl_description * descr = NULL;
- int extra_len = 0;
- struct iw_event *event; /* Mallocated whole event */
- int event_len; /* Its size */
- int hdr_len; /* Size of the event header */
- int wrqu_off = 0; /* Offset in wrqu */
- /* Don't "optimise" the following variable, it will crash */
- unsigned cmd_index; /* *MUST* be unsigned */
-
- /* Get the description of the Event */
- if(cmd <= SIOCIWLAST) {
- cmd_index = cmd - SIOCIWFIRST;
- if(cmd_index < standard_ioctl_num)
- descr = &(standard_ioctl[cmd_index]);
- } else {
- cmd_index = cmd - IWEVFIRST;
- if(cmd_index < standard_event_num)
- descr = &(standard_event[cmd_index]);
- }
- /* Don't accept unknown events */
- if(descr == NULL) {
- /* Note : we don't return an error to the driver, because
- * the driver would not know what to do about it. It can't
- * return an error to the user, because the event is not
- * initiated by a user request.
- * The best the driver could do is to log an error message.
- * We will do it ourselves instead...
- */
- printk(KERN_ERR "%s (WE) : Invalid/Unknown Wireless Event (0x%04X)\n",
- dev->name, cmd);
- return;
- }
-#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n",
- dev->name, cmd);
- printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
-#endif /* WE_EVENT_DEBUG */
-
- /* Check extra parameters and set extra_len */
- if(descr->header_type == IW_HEADER_TYPE_POINT) {
- /* Check if number of token fits within bounds */
- if(wrqu->data.length > descr->max_tokens) {
- printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length);
- return;
- }
- if(wrqu->data.length < descr->min_tokens) {
- printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length);
- return;
- }
- /* Calculate extra_len - extra is NULL for restricted events */
- if(extra != NULL)
- extra_len = wrqu->data.length * descr->token_size;
- /* Always at an offset in wrqu */
- wrqu_off = IW_EV_POINT_OFF;
-#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len);
-#endif /* WE_EVENT_DEBUG */
- }
-
- /* Total length of the event */
- hdr_len = event_type_size[descr->header_type];
- event_len = hdr_len + extra_len;
-
-#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, wrqu_off %d, event_len %d\n", dev->name, cmd, hdr_len, wrqu_off, event_len);
-#endif /* WE_EVENT_DEBUG */
-
- /* Create temporary buffer to hold the event */
- event = kmalloc(event_len, GFP_ATOMIC);
- if(event == NULL)
- return;
-
- /* Fill event */
- event->len = event_len;
- event->cmd = cmd;
- memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
- if(extra != NULL)
- memcpy(((char *) event) + hdr_len, extra, extra_len);
-
-#ifdef WE_EVENT_NETLINK
- /* rtnetlink event channel */
- rtmsg_iwinfo(dev, (char *) event, event_len);
-#endif /* WE_EVENT_NETLINK */
-
- /* Cleanup */
- kfree(event);
-
- return; /* Always success, I guess ;-) */
-}
-
-/********************** ENHANCED IWSPY SUPPORT **********************/
-/*
- * In the old days, the driver was handling spy support all by itself.
- * Now, the driver can delegate this task to Wireless Extensions.
- * It needs to use those standard spy iw_handler in struct iw_handler_def,
- * push data to us via wireless_spy_update() and include struct iw_spy_data
- * in its private part (and export it in net_device->wireless_data->spy_data).
- * One of the main advantage of centralising spy support here is that
- * it becomes much easier to improve and extend it without having to touch
- * the drivers. One example is the addition of the Spy-Threshold events.
- */
-
-/* ---------------------------------------------------------------- */
-/*
- * Return the pointer to the spy data in the driver.
- * Because this is called on the Rx path via wireless_spy_update(),
- * we want it to be efficient...
- */
-static inline struct iw_spy_data * get_spydata(struct net_device *dev)
-{
- /* This is the new way */
- if(dev->wireless_data)
- return(dev->wireless_data->spy_data);
- return NULL;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : set Spy List
- */
-int iw_handler_set_spy(struct net_device * dev,
- struct iw_request_info * info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct sockaddr * address = (struct sockaddr *) extra;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- /* Disable spy collection while we copy the addresses.
- * While we copy addresses, any call to wireless_spy_update()
- * will NOP. This is OK, as anyway the addresses are changing. */
- spydata->spy_number = 0;
-
- /* We want to operate without locking, because wireless_spy_update()
- * most likely will happen in the interrupt handler, and therefore
- * have its own locking constraints and needs performance.
- * The rtnl_lock() make sure we don't race with the other iw_handlers.
- * This make sure wireless_spy_update() "see" that the spy list
- * is temporarily disabled. */
- wmb();
-
- /* Are there are addresses to copy? */
- if(wrqu->data.length > 0) {
- int i;
-
- /* Copy addresses */
- for(i = 0; i < wrqu->data.length; i++)
- memcpy(spydata->spy_address[i], address[i].sa_data,
- ETH_ALEN);
- /* Reset stats */
- memset(spydata->spy_stat, 0,
- sizeof(struct iw_quality) * IW_MAX_SPY);
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_handler_set_spy() : wireless_data %p, spydata %p, num %d\n", dev->wireless_data, spydata, wrqu->data.length);
- for (i = 0; i < wrqu->data.length; i++)
- printk(KERN_DEBUG
- "%02X:%02X:%02X:%02X:%02X:%02X \n",
- spydata->spy_address[i][0],
- spydata->spy_address[i][1],
- spydata->spy_address[i][2],
- spydata->spy_address[i][3],
- spydata->spy_address[i][4],
- spydata->spy_address[i][5]);
-#endif /* WE_SPY_DEBUG */
- }
-
- /* Make sure above is updated before re-enabling */
- wmb();
-
- /* Enable addresses */
- spydata->spy_number = wrqu->data.length;
-
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : get Spy List
- */
-int iw_handler_get_spy(struct net_device * dev,
- struct iw_request_info * info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct sockaddr * address = (struct sockaddr *) extra;
- int i;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- wrqu->data.length = spydata->spy_number;
-
- /* Copy addresses. */
- for(i = 0; i < spydata->spy_number; i++) {
- memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN);
- address[i].sa_family = AF_UNIX;
- }
- /* Copy stats to the user buffer (just after). */
- if(spydata->spy_number > 0)
- memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number),
- spydata->spy_stat,
- sizeof(struct iw_quality) * spydata->spy_number);
- /* Reset updated flags. */
- for(i = 0; i < spydata->spy_number; i++)
- spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : set spy threshold
- */
-int iw_handler_set_thrspy(struct net_device * dev,
- struct iw_request_info *info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- /* Just do it */
- memcpy(&(spydata->spy_thr_low), &(threshold->low),
- 2 * sizeof(struct iw_quality));
-
- /* Clear flag */
- memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under));
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_handler_set_thrspy() : low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level);
-#endif /* WE_SPY_DEBUG */
-
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Standard Wireless Handler : get spy threshold
- */
-int iw_handler_get_thrspy(struct net_device * dev,
- struct iw_request_info *info,
- union iwreq_data * wrqu,
- char * extra)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return -EOPNOTSUPP;
-
- /* Just do it */
- memcpy(&(threshold->low), &(spydata->spy_thr_low),
- 2 * sizeof(struct iw_quality));
-
- return 0;
-}
-
-/*------------------------------------------------------------------*/
-/*
- * Prepare and send a Spy Threshold event
- */
-static void iw_send_thrspy_event(struct net_device * dev,
- struct iw_spy_data * spydata,
- unsigned char * address,
- struct iw_quality * wstats)
-{
- union iwreq_data wrqu;
- struct iw_thrspy threshold;
-
- /* Init */
- wrqu.data.length = 1;
- wrqu.data.flags = 0;
- /* Copy address */
- memcpy(threshold.addr.sa_data, address, ETH_ALEN);
- threshold.addr.sa_family = ARPHRD_ETHER;
- /* Copy stats */
- memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality));
- /* Copy also thresholds */
- memcpy(&(threshold.low), &(spydata->spy_thr_low),
- 2 * sizeof(struct iw_quality));
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n",
- threshold.addr.sa_data[0],
- threshold.addr.sa_data[1],
- threshold.addr.sa_data[2],
- threshold.addr.sa_data[3],
- threshold.addr.sa_data[4],
- threshold.addr.sa_data[5], threshold.qual.level);
-#endif /* WE_SPY_DEBUG */
-
- /* Send event to user space */
- wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold);
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Call for the driver to update the spy data.
- * For now, the spy data is a simple array. As the size of the array is
- * small, this is good enough. If we wanted to support larger number of
- * spy addresses, we should use something more efficient...
- */
-void wireless_spy_update(struct net_device * dev,
- unsigned char * address,
- struct iw_quality * wstats)
-{
- struct iw_spy_data * spydata = get_spydata(dev);
- int i;
- int match = -1;
-
- /* Make sure driver is not buggy or using the old API */
- if(!spydata)
- return;
-
-#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "wireless_spy_update() : wireless_data %p, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_data, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
-#endif /* WE_SPY_DEBUG */
-
- /* Update all records that match */
- for(i = 0; i < spydata->spy_number; i++)
- if(!compare_ether_addr(address, spydata->spy_address[i])) {
- memcpy(&(spydata->spy_stat[i]), wstats,
- sizeof(struct iw_quality));
- match = i;
- }
-
- /* Generate an event if we cross the spy threshold.
- * To avoid event storms, we have a simple hysteresis : we generate
- * event only when we go under the low threshold or above the
- * high threshold. */
- if(match >= 0) {
- if(spydata->spy_thr_under[match]) {
- if(wstats->level > spydata->spy_thr_high.level) {
- spydata->spy_thr_under[match] = 0;
- iw_send_thrspy_event(dev, spydata,
- address, wstats);
- }
- } else {
- if(wstats->level < spydata->spy_thr_low.level) {
- spydata->spy_thr_under[match] = 1;
- iw_send_thrspy_event(dev, spydata,
- address, wstats);
- }
- }
- }
-}
-
-EXPORT_SYMBOL(iw_handler_get_spy);
-EXPORT_SYMBOL(iw_handler_get_thrspy);
-EXPORT_SYMBOL(iw_handler_set_spy);
-EXPORT_SYMBOL(iw_handler_set_thrspy);
-EXPORT_SYMBOL(wireless_send_event);
-EXPORT_SYMBOL(wireless_spy_update);