aboutsummaryrefslogtreecommitdiff
path: root/net/packet
diff options
context:
space:
mode:
Diffstat (limited to 'net/packet')
-rw-r--r--net/packet/Kconfig8
-rw-r--r--net/packet/Makefile2
-rw-r--r--net/packet/af_packet.c1104
-rw-r--r--net/packet/diag.c264
-rw-r--r--net/packet/internal.h126
5 files changed, 1044 insertions, 460 deletions
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
index 0060e3b396b..cc55b35f80e 100644
--- a/net/packet/Kconfig
+++ b/net/packet/Kconfig
@@ -14,3 +14,11 @@ config PACKET
be called af_packet.
If unsure, say Y.
+
+config PACKET_DIAG
+ tristate "Packet: sockets monitoring interface"
+ depends on PACKET
+ default n
+ ---help---
+ Support for PF_PACKET sockets monitoring interface used by the ss tool.
+ If unsure, say Y.
diff --git a/net/packet/Makefile b/net/packet/Makefile
index 81183eabfde..9df61347a3c 100644
--- a/net/packet/Makefile
+++ b/net/packet/Makefile
@@ -3,3 +3,5 @@
#
obj-$(CONFIG_PACKET) += af_packet.o
+obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o
+af_packet_diag-y += diag.o
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0f661745df0..b85c67ccb79 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,11 +88,13 @@
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
-
+#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
+#include "internal.h"
+
/*
Assumptions:
- if device has no dev->hard_header routine, it adds and removes ll header
@@ -146,14 +148,6 @@ dev->hard_header == NULL (ll header is added by device, we cannot control it)
/* Private packet socket structures. */
-struct packet_mclist {
- struct packet_mclist *next;
- int ifindex;
- int count;
- unsigned short type;
- unsigned short alen;
- unsigned char addr[MAX_ADDR_LEN];
-};
/* identical to struct packet_mreq except it has
* a longer address field.
*/
@@ -164,10 +158,16 @@ struct packet_mreq_max {
unsigned char mr_address[MAX_ADDR_LEN];
};
+union tpacket_uhdr {
+ struct tpacket_hdr *h1;
+ struct tpacket2_hdr *h2;
+ struct tpacket3_hdr *h3;
+ void *raw;
+};
+
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
int closing, int tx_ring);
-
#define V3_ALIGNMENT (8)
#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
@@ -175,63 +175,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
#define BLK_PLUS_PRIV(sz_of_priv) \
(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
-/* kbdq - kernel block descriptor queue */
-struct tpacket_kbdq_core {
- struct pgv *pkbdq;
- unsigned int feature_req_word;
- unsigned int hdrlen;
- unsigned char reset_pending_on_curr_blk;
- unsigned char delete_blk_timer;
- unsigned short kactive_blk_num;
- unsigned short blk_sizeof_priv;
-
- /* last_kactive_blk_num:
- * trick to see if user-space has caught up
- * in order to avoid refreshing timer when every single pkt arrives.
- */
- unsigned short last_kactive_blk_num;
-
- char *pkblk_start;
- char *pkblk_end;
- int kblk_size;
- unsigned int knum_blocks;
- uint64_t knxt_seq_num;
- char *prev;
- char *nxt_offset;
- struct sk_buff *skb;
-
- atomic_t blk_fill_in_prog;
-
- /* Default is set to 8ms */
-#define DEFAULT_PRB_RETIRE_TOV (8)
-
- unsigned short retire_blk_tov;
- unsigned short version;
- unsigned long tov_in_jiffies;
-
- /* timer to retire an outstanding block */
- struct timer_list retire_blk_timer;
-};
-
#define PGV_FROM_VMALLOC 1
-struct pgv {
- char *buffer;
-};
-
-struct packet_ring_buffer {
- struct pgv *pg_vec;
- unsigned int head;
- unsigned int frames_per_block;
- unsigned int frame_size;
- unsigned int frame_max;
-
- unsigned int pg_vec_order;
- unsigned int pg_vec_pages;
- unsigned int pg_vec_len;
-
- struct tpacket_kbdq_core prb_bdqc;
- atomic_t pending;
-};
#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
@@ -243,6 +187,8 @@ struct packet_ring_buffer {
struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
static void *packet_previous_frame(struct packet_sock *po,
struct packet_ring_buffer *rb,
@@ -269,52 +215,6 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
-struct packet_fanout;
-struct packet_sock {
- /* struct sock has to be the first member of packet_sock */
- struct sock sk;
- struct packet_fanout *fanout;
- struct tpacket_stats stats;
- union tpacket_stats_u stats_u;
- struct packet_ring_buffer rx_ring;
- struct packet_ring_buffer tx_ring;
- int copy_thresh;
- spinlock_t bind_lock;
- struct mutex pg_vec_lock;
- unsigned int running:1, /* prot_hook is attached*/
- auxdata:1,
- origdev:1,
- has_vnet_hdr:1;
- int ifindex; /* bound device */
- __be16 num;
- struct packet_mclist *mclist;
- atomic_t mapped;
- enum tpacket_versions tp_version;
- unsigned int tp_hdrlen;
- unsigned int tp_reserve;
- unsigned int tp_loss:1;
- unsigned int tp_tstamp;
- struct packet_type prot_hook ____cacheline_aligned_in_smp;
-};
-
-#define PACKET_FANOUT_MAX 256
-
-struct packet_fanout {
-#ifdef CONFIG_NET_NS
- struct net *net;
-#endif
- unsigned int num_members;
- u16 id;
- u8 type;
- u8 defrag;
- atomic_t rr_cur;
- struct list_head list;
- struct sock *arr[PACKET_FANOUT_MAX];
- spinlock_t lock;
- atomic_t sk_ref;
- struct packet_type prot_hook ____cacheline_aligned_in_smp;
-};
-
struct packet_skb_cb {
unsigned int origlen;
union {
@@ -334,13 +234,101 @@ struct packet_skb_cb {
(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
((x)->kactive_blk_num+1) : 0)
-static struct packet_sock *pkt_sk(struct sock *sk)
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
+static void __fanout_link(struct sock *sk, struct packet_sock *po);
+
+static int packet_direct_xmit(struct sk_buff *skb)
{
- return (struct packet_sock *)sk;
+ struct net_device *dev = skb->dev;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ netdev_features_t features;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+ u16 queue_map;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ features = netif_skb_features(skb);
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto drop;
+
+ queue_map = skb_get_queue_mapping(skb);
+ txq = netdev_get_tx_queue(dev, queue_map);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq)) {
+ ret = ops->ndo_start_xmit(skb, dev);
+ if (ret == NETDEV_TX_OK)
+ txq_trans_update(txq);
+ }
+ HARD_TX_UNLOCK(dev, txq);
+
+ local_bh_enable();
+
+ if (!dev_xmit_complete(ret))
+ kfree_skb(skb);
+
+ return ret;
+drop:
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
}
-static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
-static void __fanout_link(struct sock *sk, struct packet_sock *po);
+static struct net_device *packet_cached_dev_get(struct packet_sock *po)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = rcu_dereference(po->cached_dev);
+ if (likely(dev))
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ return dev;
+}
+
+static void packet_cached_dev_assign(struct packet_sock *po,
+ struct net_device *dev)
+{
+ rcu_assign_pointer(po->cached_dev, dev);
+}
+
+static void packet_cached_dev_reset(struct packet_sock *po)
+{
+ RCU_INIT_POINTER(po->cached_dev, NULL);
+}
+
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+ return po->xmit == packet_direct_xmit;
+}
+
+static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+
+static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ u16 queue_index;
+
+ if (ops->ndo_select_queue) {
+ queue_index = ops->ndo_select_queue(dev, skb, NULL,
+ __packet_pick_tx_queue);
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ } else {
+ queue_index = __packet_pick_tx_queue(dev, skb);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+}
/* register_prot_hook must be invoked with the po->bind_lock held,
* or from a context in which asynchronous accesses to the packet
@@ -349,11 +337,13 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
static void register_prot_hook(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
+
if (!po->running) {
if (po->fanout)
__fanout_link(sk, po);
else
dev_add_pack(&po->prot_hook);
+
sock_hold(sk);
po->running = 1;
}
@@ -371,10 +361,12 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
struct packet_sock *po = pkt_sk(sk);
po->running = 0;
+
if (po->fanout)
__fanout_unlink(sk, po);
else
__dev_remove_pack(&po->prot_hook);
+
__sock_put(sk);
if (sync) {
@@ -401,11 +393,7 @@ static inline __pure struct page *pgv_to_page(void *addr)
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
- union {
- struct tpacket_hdr *h1;
- struct tpacket2_hdr *h2;
- void *raw;
- } h;
+ union tpacket_uhdr h;
h.raw = frame;
switch (po->tp_version) {
@@ -428,11 +416,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
static int __packet_get_status(struct packet_sock *po, void *frame)
{
- union {
- struct tpacket_hdr *h1;
- struct tpacket2_hdr *h2;
- void *raw;
- } h;
+ union tpacket_uhdr h;
smp_rmb();
@@ -452,17 +436,66 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
}
}
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
+ unsigned int flags)
+{
+ struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+
+ if (shhwtstamps) {
+ if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
+ ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
+ return TP_STATUS_TS_SYS_HARDWARE;
+ if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
+ return TP_STATUS_TS_RAW_HARDWARE;
+ }
+
+ if (ktime_to_timespec_cond(skb->tstamp, ts))
+ return TP_STATUS_TS_SOFTWARE;
+
+ return 0;
+}
+
+static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
+ struct sk_buff *skb)
+{
+ union tpacket_uhdr h;
+ struct timespec ts;
+ __u32 ts_status;
+
+ if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ return 0;
+
+ h.raw = frame;
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ h.h1->tp_sec = ts.tv_sec;
+ h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+ break;
+ case TPACKET_V2:
+ h.h2->tp_sec = ts.tv_sec;
+ h.h2->tp_nsec = ts.tv_nsec;
+ break;
+ case TPACKET_V3:
+ default:
+ WARN(1, "TPACKET version not supported.\n");
+ BUG();
+ }
+
+ /* one flush is safe, as both fields always lie on the same cacheline */
+ flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
+ smp_wmb();
+
+ return ts_status;
+}
+
static void *packet_lookup_frame(struct packet_sock *po,
struct packet_ring_buffer *rb,
unsigned int position,
int status)
{
unsigned int pg_vec_pos, frame_offset;
- union {
- struct tpacket_hdr *h1;
- struct tpacket2_hdr *h2;
- void *raw;
- } h;
+ union tpacket_uhdr h;
pg_vec_pos = position / rb->frames_per_block;
frame_offset = position % rb->frames_per_block;
@@ -494,11 +527,12 @@ static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
{
struct tpacket_kbdq_core *pkc;
- pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+ pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+ GET_PBDQC_FROM_RB(&po->rx_ring);
- spin_lock(&rb_queue->lock);
+ spin_lock_bh(&rb_queue->lock);
pkc->delete_blk_timer = 1;
- spin_unlock(&rb_queue->lock);
+ spin_unlock_bh(&rb_queue->lock);
prb_del_retire_blk_timer(pkc);
}
@@ -520,7 +554,8 @@ static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
if (tx_ring)
BUG();
- pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+ pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+ GET_PBDQC_FROM_RB(&po->rx_ring);
prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
@@ -531,6 +566,7 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
struct ethtool_cmd ecmd;
int err;
+ u32 speed;
rtnl_lock();
dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
@@ -539,25 +575,18 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
return DEFAULT_PRB_RETIRE_TOV;
}
err = __ethtool_get_settings(dev, &ecmd);
+ speed = ethtool_cmd_speed(&ecmd);
rtnl_unlock();
if (!err) {
- switch (ecmd.speed) {
- case SPEED_10000:
- msec = 1;
- div = 10000/1000;
- break;
- case SPEED_1000:
- msec = 1;
- div = 1000/1000;
- break;
/*
* If the link speed is so slow you don't really
* need to worry about perf anyways
*/
- case SPEED_100:
- case SPEED_10:
- default:
+ if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
return DEFAULT_PRB_RETIRE_TOV;
+ } else {
+ msec = 1;
+ div = speed / 1000;
}
}
@@ -584,7 +613,7 @@ static void init_prb_bdqc(struct packet_sock *po,
struct pgv *pg_vec,
union tpacket_req_u *req_u, int tx_ring)
{
- struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
+ struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd;
memset(p1, 0x0, sizeof(*p1));
@@ -592,13 +621,13 @@ static void init_prb_bdqc(struct packet_sock *po,
p1->knxt_seq_num = 1;
p1->pkbdq = pg_vec;
pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
- p1->pkblk_start = (char *)pg_vec[0].buffer;
+ p1->pkblk_start = pg_vec[0].buffer;
p1->kblk_size = req_u->req3.tp_block_size;
p1->knum_blocks = req_u->req3.tp_block_nr;
p1->hdrlen = po->tp_hdrlen;
p1->version = po->tp_version;
p1->last_kactive_blk_num = 0;
- po->stats_u.stats3.tp_freeze_q_cnt = 0;
+ po->stats.stats3.tp_freeze_q_cnt = 0;
if (req_u->req3.tp_retire_blk_tov)
p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
else
@@ -648,7 +677,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
struct packet_sock *po = (struct packet_sock *)data;
- struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+ struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
unsigned int frozen;
struct tpacket_block_desc *pbd;
@@ -766,7 +795,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket3_hdr *last_pkt;
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
- if (po->stats.tp_drops)
+ if (po->stats.stats3.tp_drops)
status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev;
@@ -812,37 +841,33 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
smp_rmb();
- if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
+ /* We could have just memset this but we will lose the
+ * flexibility of making the priv area sticky
+ */
- /* We could have just memset this but we will lose the
- * flexibility of making the priv area sticky
- */
- BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
- BLOCK_NUM_PKTS(pbd1) = 0;
- BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
- getnstimeofday(&ts);
- h1->ts_first_pkt.ts_sec = ts.tv_sec;
- h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
- pkc1->pkblk_start = (char *)pbd1;
- pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
- BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
- BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
- BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
- pbd1->version = pkc1->version;
- pkc1->prev = pkc1->nxt_offset;
- pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
- prb_thaw_queue(pkc1);
- _prb_refresh_rx_retire_blk_timer(pkc1);
-
- smp_wmb();
+ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
+ BLOCK_NUM_PKTS(pbd1) = 0;
+ BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
- return;
- }
+ getnstimeofday(&ts);
+
+ h1->ts_first_pkt.ts_sec = ts.tv_sec;
+ h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
- WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
- pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
- dump_stack();
- BUG();
+ pkc1->pkblk_start = (char *)pbd1;
+ pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+
+ BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+ BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
+
+ pbd1->version = pkc1->version;
+ pkc1->prev = pkc1->nxt_offset;
+ pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
+
+ prb_thaw_queue(pkc1);
+ _prb_refresh_rx_retire_blk_timer(pkc1);
+
+ smp_wmb();
}
/*
@@ -872,7 +897,7 @@ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
struct packet_sock *po)
{
pkc->reset_pending_on_curr_blk = 1;
- po->stats_u.stats3.tp_freeze_q_cnt++;
+ po->stats.stats3.tp_freeze_q_cnt++;
}
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
@@ -933,10 +958,6 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
prb_close_block(pkc, pbd, po, status);
return;
}
-
- WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
- dump_stack();
- BUG();
}
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
@@ -959,7 +980,7 @@ static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
struct tpacket3_hdr *ppd)
{
- ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
+ ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}
static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
@@ -973,15 +994,19 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
{
if (vlan_tx_tag_present(pkc->skb)) {
ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
- ppd->tp_status = TP_STATUS_VLAN_VALID;
+ ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
+ ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
} else {
- ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
+ ppd->hv1.tp_vlan_tci = 0;
+ ppd->hv1.tp_vlan_tpid = 0;
+ ppd->tp_status = TP_STATUS_AVAILABLE;
}
}
static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
struct tpacket3_hdr *ppd)
{
+ ppd->hv1.tp_padding = 0;
prb_fill_vlan_info(pkc, ppd);
if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
@@ -1018,7 +1043,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct tpacket_block_desc *pbd;
char *curr, *end;
- pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
+ pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
/* Queue is frozen when user space is lagging behind */
@@ -1044,7 +1069,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po,
smp_mb();
curr = pkc->nxt_offset;
pkc->skb = skb;
- end = (char *) ((char *)pbd + pkc->kblk_size);
+ end = (char *)pbd + pkc->kblk_size;
/* first try the current block */
if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
@@ -1086,17 +1111,17 @@ static void *packet_current_rx_frame(struct packet_sock *po,
default:
WARN(1, "TPACKET version not supported\n");
BUG();
- return 0;
+ return NULL;
}
}
static void *prb_lookup_block(struct packet_sock *po,
struct packet_ring_buffer *rb,
- unsigned int previous,
+ unsigned int idx,
int status)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
- struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
+ struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
if (status != BLOCK_STATUS(pbd))
return NULL;
@@ -1160,6 +1185,70 @@ static void packet_increment_head(struct packet_ring_buffer *buff)
buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
+static void packet_inc_pending(struct packet_ring_buffer *rb)
+{
+ this_cpu_inc(*rb->pending_refcnt);
+}
+
+static void packet_dec_pending(struct packet_ring_buffer *rb)
+{
+ this_cpu_dec(*rb->pending_refcnt);
+}
+
+static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
+{
+ unsigned int refcnt = 0;
+ int cpu;
+
+ /* We don't use pending refcount in rx_ring. */
+ if (rb->pending_refcnt == NULL)
+ return 0;
+
+ for_each_possible_cpu(cpu)
+ refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
+
+ return refcnt;
+}
+
+static int packet_alloc_pending(struct packet_sock *po)
+{
+ po->rx_ring.pending_refcnt = NULL;
+
+ po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
+ if (unlikely(po->tx_ring.pending_refcnt == NULL))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+static void packet_free_pending(struct packet_sock *po)
+{
+ free_percpu(po->tx_ring.pending_refcnt);
+}
+
+static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+ struct sock *sk = &po->sk;
+ bool has_room;
+
+ if (po->prot_hook.func != tpacket_rcv)
+ return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
+ <= sk->sk_rcvbuf;
+
+ spin_lock(&sk->sk_receive_queue.lock);
+ if (po->tp_version == TPACKET_V3)
+ has_room = prb_lookup_block(po, &po->rx_ring,
+ po->rx_ring.prb_bdqc.kactive_blk_num,
+ TP_STATUS_KERNEL);
+ else
+ has_room = packet_lookup_frame(po, &po->rx_ring,
+ po->rx_ring.head,
+ TP_STATUS_KERNEL);
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ return has_room;
+}
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1185,16 +1274,16 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
return x;
}
-static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_hash(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
{
- u32 idx, hash = skb->rxhash;
-
- idx = ((u64)hash * num) >> 32;
-
- return f->arr[idx];
+ return reciprocal_scale(skb_get_hash(skb), num);
}
-static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_lb(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
{
int cur, old;
@@ -1202,14 +1291,54 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb
while ((old = atomic_cmpxchg(&f->rr_cur, cur,
fanout_rr_next(f, num))) != cur)
cur = old;
- return f->arr[cur];
+ return cur;
+}
+
+static unsigned int fanout_demux_cpu(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return smp_processor_id() % num;
}
-static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_rnd(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
{
- unsigned int cpu = smp_processor_id();
+ return prandom_u32_max(num);
+}
+
+static unsigned int fanout_demux_rollover(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int idx, unsigned int skip,
+ unsigned int num)
+{
+ unsigned int i, j;
+
+ i = j = min_t(int, f->next[idx], num - 1);
+ do {
+ if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+ if (i != j)
+ f->next[idx] = i;
+ return i;
+ }
+ if (++i == num)
+ i = 0;
+ } while (i != j);
+
+ return idx;
+}
+
+static unsigned int fanout_demux_qm(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return skb_get_queue_mapping(skb) % num;
+}
- return f->arr[cpu % num];
+static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
+{
+ return f->flags & (flag >> 8);
}
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
@@ -1218,7 +1347,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
struct packet_fanout *f = pt->af_packet_priv;
unsigned int num = f->num_members;
struct packet_sock *po;
- struct sock *sk;
+ unsigned int idx;
if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
!num) {
@@ -1229,28 +1358,42 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
switch (f->type) {
case PACKET_FANOUT_HASH:
default:
- if (f->defrag) {
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
if (!skb)
return 0;
}
- skb_get_rxhash(skb);
- sk = fanout_demux_hash(f, skb, num);
+ idx = fanout_demux_hash(f, skb, num);
break;
case PACKET_FANOUT_LB:
- sk = fanout_demux_lb(f, skb, num);
+ idx = fanout_demux_lb(f, skb, num);
break;
case PACKET_FANOUT_CPU:
- sk = fanout_demux_cpu(f, skb, num);
+ idx = fanout_demux_cpu(f, skb, num);
+ break;
+ case PACKET_FANOUT_RND:
+ idx = fanout_demux_rnd(f, skb, num);
+ break;
+ case PACKET_FANOUT_QM:
+ idx = fanout_demux_qm(f, skb, num);
+ break;
+ case PACKET_FANOUT_ROLLOVER:
+ idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
break;
}
- po = pkt_sk(sk);
+ po = pkt_sk(f->arr[idx]);
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
+ unlikely(!packet_rcv_has_room(po, skb))) {
+ idx = fanout_demux_rollover(f, skb, idx, idx, num);
+ po = pkt_sk(f->arr[idx]);
+ }
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
-static DEFINE_MUTEX(fanout_mutex);
+DEFINE_MUTEX(fanout_mutex);
+EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static void __fanout_link(struct sock *sk, struct packet_sock *po)
@@ -1280,18 +1423,31 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
spin_unlock(&f->lock);
}
+static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
+{
+ if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
+ return true;
+
+ return false;
+}
+
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f, *match;
u8 type = type_flags & 0xff;
- u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
+ u8 flags = type_flags >> 8;
int err;
switch (type) {
+ case PACKET_FANOUT_ROLLOVER:
+ if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
+ return -EINVAL;
case PACKET_FANOUT_HASH:
case PACKET_FANOUT_LB:
case PACKET_FANOUT_CPU:
+ case PACKET_FANOUT_RND:
+ case PACKET_FANOUT_QM:
break;
default:
return -EINVAL;
@@ -1313,7 +1469,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
}
err = -EINVAL;
- if (match && match->defrag != defrag)
+ if (match && match->flags != flags)
goto out;
if (!match) {
err = -ENOMEM;
@@ -1323,7 +1479,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
write_pnet(&match->net, sock_net(sk));
match->id = id;
match->type = type;
- match->defrag = defrag;
+ match->flags = flags;
atomic_set(&match->rr_cur, 0);
INIT_LIST_HEAD(&match->list);
spin_lock_init(&match->lock);
@@ -1332,6 +1488,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->prot_hook.dev = po->prot_hook.dev;
match->prot_hook.func = packet_rcv_fanout;
match->prot_hook.af_packet_priv = match;
+ match->prot_hook.id_match = match_fanout_group;
dev_add_pack(&match->prot_hook);
list_add(&match->list, &fanout_list);
}
@@ -1362,9 +1519,9 @@ static void fanout_release(struct sock *sk)
if (!f)
return;
+ mutex_lock(&fanout_mutex);
po->fanout = NULL;
- mutex_lock(&fanout_mutex);
if (atomic_dec_and_test(&f->sk_ref)) {
list_del(&f->list);
dev_remove_pack(&f->prot_hook);
@@ -1453,7 +1610,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
- struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
struct sk_buff *skb = NULL;
struct net_device *dev;
__be16 proto = 0;
@@ -1476,7 +1633,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
* Find the device first to size check it
*/
- saddr->spkt_device[13] = 0;
+ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
rcu_read_lock();
dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
@@ -1552,13 +1709,14 @@ retry:
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
- if (err < 0)
- goto out_unlock;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
+ skb_probe_transport_header(skb, 0);
+
dev_queue_xmit(skb);
rcu_read_unlock();
return len;
@@ -1686,16 +1844,16 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
nf_reset(skb);
spin_lock(&sk->sk_receive_queue.lock);
- po->stats.tp_packets++;
+ po->stats.stats1.tp_packets++;
skb->dropcount = atomic_read(&sk->sk_drops);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
- sk->sk_data_ready(sk, skb->len);
+ sk->sk_data_ready(sk);
return 0;
drop_n_acct:
spin_lock(&sk->sk_receive_queue.lock);
- po->stats.tp_drops++;
+ po->stats.stats1.tp_drops++;
atomic_inc(&sk->sk_drops);
spin_unlock(&sk->sk_receive_queue.lock);
@@ -1715,21 +1873,22 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct sock *sk;
struct packet_sock *po;
struct sockaddr_ll *sll;
- union {
- struct tpacket_hdr *h1;
- struct tpacket2_hdr *h2;
- struct tpacket3_hdr *h3;
- void *raw;
- } h;
+ union tpacket_uhdr h;
u8 *skb_head = skb->data;
int skb_len = skb->len;
unsigned int snaplen, res;
unsigned long status = TP_STATUS_USER;
unsigned short macoff, netoff, hdrlen;
struct sk_buff *copy_skb = NULL;
- struct timeval tv;
struct timespec ts;
- struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+ __u32 ts_status;
+
+ /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
+ * We may add members to them until current aligned size without forcing
+ * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
+ */
+ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
+ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
if (skb->pkt_type == PACKET_LOOPBACK)
goto drop;
@@ -1801,10 +1960,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
* Anyways, moving it for V1/V2 only as V3 doesn't need this
* at packet level.
*/
- if (po->stats.tp_drops)
+ if (po->stats.stats1.tp_drops)
status |= TP_STATUS_LOSING;
}
- po->stats.tp_packets++;
+ po->stats.stats1.tp_packets++;
if (copy_skb) {
status |= TP_STATUS_COPY;
__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
@@ -1813,24 +1972,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
+ if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ getnstimeofday(&ts);
+
+ status |= ts_status;
+
switch (po->tp_version) {
case TPACKET_V1:
h.h1->tp_len = skb->len;
h.h1->tp_snaplen = snaplen;
h.h1->tp_mac = macoff;
h.h1->tp_net = netoff;
- if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
- && shhwtstamps->syststamp.tv64)
- tv = ktime_to_timeval(shhwtstamps->syststamp);
- else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
- && shhwtstamps->hwtstamp.tv64)
- tv = ktime_to_timeval(shhwtstamps->hwtstamp);
- else if (skb->tstamp.tv64)
- tv = ktime_to_timeval(skb->tstamp);
- else
- do_gettimeofday(&tv);
- h.h1->tp_sec = tv.tv_sec;
- h.h1->tp_usec = tv.tv_usec;
+ h.h1->tp_sec = ts.tv_sec;
+ h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
hdrlen = sizeof(*h.h1);
break;
case TPACKET_V2:
@@ -1838,25 +1992,17 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
h.h2->tp_snaplen = snaplen;
h.h2->tp_mac = macoff;
h.h2->tp_net = netoff;
- if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
- && shhwtstamps->syststamp.tv64)
- ts = ktime_to_timespec(shhwtstamps->syststamp);
- else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
- && shhwtstamps->hwtstamp.tv64)
- ts = ktime_to_timespec(shhwtstamps->hwtstamp);
- else if (skb->tstamp.tv64)
- ts = ktime_to_timespec(skb->tstamp);
- else
- getnstimeofday(&ts);
h.h2->tp_sec = ts.tv_sec;
h.h2->tp_nsec = ts.tv_nsec;
if (vlan_tx_tag_present(skb)) {
h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
- status |= TP_STATUS_VLAN_VALID;
+ h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
+ status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
} else {
h.h2->tp_vlan_tci = 0;
+ h.h2->tp_vlan_tpid = 0;
}
- h.h2->tp_padding = 0;
+ memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
hdrlen = sizeof(*h.h2);
break;
case TPACKET_V3:
@@ -1868,18 +2014,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
h.h3->tp_snaplen = snaplen;
h.h3->tp_mac = macoff;
h.h3->tp_net = netoff;
- if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
- && shhwtstamps->syststamp.tv64)
- ts = ktime_to_timespec(shhwtstamps->syststamp);
- else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
- && shhwtstamps->hwtstamp.tv64)
- ts = ktime_to_timespec(shhwtstamps->hwtstamp);
- else if (skb->tstamp.tv64)
- ts = ktime_to_timespec(skb->tstamp);
- else
- getnstimeofday(&ts);
h.h3->tp_sec = ts.tv_sec;
h.h3->tp_nsec = ts.tv_nsec;
+ memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
hdrlen = sizeof(*h.h3);
break;
default:
@@ -1898,25 +2035,26 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
sll->sll_ifindex = dev->ifindex;
smp_mb();
+
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
- {
+ if (po->tp_version <= TPACKET_V2) {
u8 *start, *end;
- if (po->tp_version <= TPACKET_V2) {
- end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
- + macoff + snaplen);
- for (start = h.raw; start < end; start += PAGE_SIZE)
- flush_dcache_page(pgv_to_page(start));
- }
- smp_wmb();
+ end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
+ macoff + snaplen);
+
+ for (start = h.raw; start < end; start += PAGE_SIZE)
+ flush_dcache_page(pgv_to_page(start));
}
+ smp_wmb();
#endif
+
if (po->tp_version <= TPACKET_V2)
__packet_set_status(po, h.raw, status);
else
prb_clear_blk_fill_status(&po->rx_ring);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
@@ -1928,10 +2066,10 @@ drop:
return 0;
ring_is_full:
- po->stats.tp_drops++;
+ po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
kfree_skb(copy_skb);
goto drop_n_restore;
}
@@ -1939,14 +2077,16 @@ ring_is_full:
static void tpacket_destruct_skb(struct sk_buff *skb)
{
struct packet_sock *po = pkt_sk(skb->sk);
- void *ph;
if (likely(po->tx_ring.pg_vec)) {
+ void *ph;
+ __u32 ts;
+
ph = skb_shinfo(skb)->destructor_arg;
- BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
- BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
- atomic_dec(&po->tx_ring.pending);
- __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
+ packet_dec_pending(&po->tx_ring);
+
+ ts = __packet_set_timestamp(po, ph, skb);
+ __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
}
sock_wfree(skb);
@@ -1956,11 +2096,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
void *frame, struct net_device *dev, int size_max,
__be16 proto, unsigned char *addr, int hlen)
{
- union {
- struct tpacket_hdr *h1;
- struct tpacket2_hdr *h2;
- void *raw;
- } ph;
+ union tpacket_uhdr ph;
int to_write, offset, len, tp_len, nr_frags, len_max;
struct socket *sock = po->sk.sk_socket;
struct page *page;
@@ -1973,6 +2109,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb->dev = dev;
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
+ sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
skb_shinfo(skb)->destructor_arg = ph.raw;
switch (po->tp_version) {
@@ -1991,7 +2128,37 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
- data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ if (!packet_use_direct_xmit(po))
+ skb_probe_transport_header(skb, 0);
+ if (unlikely(po->tp_tx_has_off)) {
+ int off_min, off_max, off;
+ off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ off_max = po->tx_ring.frame_size - tp_len;
+ if (sock->type == SOCK_DGRAM) {
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ off = ph.h2->tp_net;
+ break;
+ default:
+ off = ph.h1->tp_net;
+ break;
+ }
+ } else {
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ off = ph.h2->tp_mac;
+ break;
+ default:
+ off = ph.h1->tp_mac;
+ break;
+ }
+ }
+ if (unlikely((off < off_min) || (off_max < off)))
+ return -EINVAL;
+ data = ph.raw + off;
+ } else {
+ data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ }
to_write = tp_len;
if (sock->type == SOCK_DGRAM) {
@@ -2017,7 +2184,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
to_write -= dev->hard_header_len;
}
- err = -EFAULT;
offset = offset_in_page(data);
len_max = PAGE_SIZE - offset;
len = ((to_write > len_max) ? len_max : to_write);
@@ -2055,21 +2221,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
struct sk_buff *skb;
struct net_device *dev;
__be16 proto;
- bool need_rls_dev = false;
int err, reserve = 0;
void *ph;
- struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
+ bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
int tp_len, size_max;
unsigned char *addr;
int len_sum = 0;
- int status = 0;
+ int status = TP_STATUS_AVAILABLE;
int hlen, tlen;
mutex_lock(&po->pg_vec_lock);
- err = -EBUSY;
- if (saddr == NULL) {
- dev = po->prot_hook.dev;
+ if (likely(saddr == NULL)) {
+ dev = packet_cached_dev_get(po);
proto = po->num;
addr = NULL;
} else {
@@ -2083,19 +2248,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
proto = saddr->sll_protocol;
addr = saddr->sll_addr;
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
- need_rls_dev = true;
}
err = -ENXIO;
if (unlikely(dev == NULL))
goto out;
-
- reserve = dev->hard_header_len;
-
err = -ENETDOWN;
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
+ reserve = dev->hard_header_len + VLAN_HLEN;
size_max = po->tx_ring.frame_size
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
@@ -2104,10 +2266,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
do {
ph = packet_current_frame(po, &po->tx_ring,
- TP_STATUS_SEND_REQUEST);
-
+ TP_STATUS_SEND_REQUEST);
if (unlikely(ph == NULL)) {
- schedule();
+ if (need_wait && need_resched())
+ schedule();
continue;
}
@@ -2122,8 +2284,19 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
goto out_status;
tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
- addr, hlen);
+ addr, hlen);
+ if (tp_len > dev->mtu + dev->hard_header_len) {
+ struct ethhdr *ehdr;
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q))
+ tp_len = -EMSGSIZE;
+ }
if (unlikely(tp_len < 0)) {
if (po->tp_loss) {
__packet_set_status(po, ph,
@@ -2138,12 +2311,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
}
}
+ packet_pick_tx_queue(dev, skb);
+
skb->destructor = tpacket_destruct_skb;
__packet_set_status(po, ph, TP_STATUS_SENDING);
- atomic_inc(&po->tx_ring.pending);
+ packet_inc_pending(&po->tx_ring);
status = TP_STATUS_SEND_REQUEST;
- err = dev_queue_xmit(skb);
+ err = po->xmit(skb);
if (unlikely(err > 0)) {
err = net_xmit_errno(err);
if (err && __packet_get_status(po, ph) ==
@@ -2161,9 +2336,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
packet_increment_head(&po->tx_ring);
len_sum += tp_len;
} while (likely((ph != NULL) ||
- ((!(msg->msg_flags & MSG_DONTWAIT)) &&
- (atomic_read(&po->tx_ring.pending))))
- );
+ /* Note: packet_read_pending() might be slow if we have
+ * to call it as it's per_cpu variable, but in fast-path
+ * we already short-circuit the loop with the first
+ * condition, and luckily don't have to go that path
+ * anyway.
+ */
+ (need_wait && packet_read_pending(&po->tx_ring))));
err = len_sum;
goto out_put;
@@ -2172,8 +2351,7 @@ out_status:
__packet_set_status(po, ph, status);
kfree_skb(skb);
out_put:
- if (need_rls_dev)
- dev_put(dev);
+ dev_put(dev);
out:
mutex_unlock(&po->pg_vec_lock);
return err;
@@ -2191,7 +2369,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
linear = len;
skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
- err);
+ err, 0);
if (!skb)
return NULL;
@@ -2203,15 +2381,13 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
return skb;
}
-static int packet_snd(struct socket *sock,
- struct msghdr *msg, size_t len)
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
- struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
struct sk_buff *skb;
struct net_device *dev;
__be16 proto;
- bool need_rls_dev = false;
unsigned char *addr;
int err, reserve = 0;
struct virtio_net_hdr vnet_hdr = { 0 };
@@ -2226,8 +2402,8 @@ static int packet_snd(struct socket *sock,
* Get and verify the address.
*/
- if (saddr == NULL) {
- dev = po->prot_hook.dev;
+ if (likely(saddr == NULL)) {
+ dev = packet_cached_dev_get(po);
proto = po->num;
addr = NULL;
} else {
@@ -2239,19 +2415,17 @@ static int packet_snd(struct socket *sock,
proto = saddr->sll_protocol;
addr = saddr->sll_addr;
dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
- need_rls_dev = true;
}
err = -ENXIO;
- if (dev == NULL)
+ if (unlikely(dev == NULL))
goto out_unlock;
- if (sock->type == SOCK_RAW)
- reserve = dev->hard_header_len;
-
err = -ENETDOWN;
- if (!(dev->flags & IFF_UP))
+ if (unlikely(!(dev->flags & IFF_UP)))
goto out_unlock;
+ if (sock->type == SOCK_RAW)
+ reserve = dev->hard_header_len;
if (po->has_vnet_hdr) {
vnet_hdr_len = sizeof(vnet_hdr);
@@ -2331,9 +2505,8 @@ static int packet_snd(struct socket *sock,
err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
if (err)
goto out_free;
- err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
- if (err < 0)
- goto out_free;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
/* Earlier code assumed this would be a VLAN pkt,
@@ -2354,6 +2527,8 @@ static int packet_snd(struct socket *sock,
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ packet_pick_tx_queue(dev, skb);
+
if (po->has_vnet_hdr) {
if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
@@ -2373,26 +2548,23 @@ static int packet_snd(struct socket *sock,
len += vnet_hdr_len;
}
+ if (!packet_use_direct_xmit(po))
+ skb_probe_transport_header(skb, reserve);
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
- /*
- * Now send it
- */
-
- err = dev_queue_xmit(skb);
+ err = po->xmit(skb);
if (err > 0 && (err = net_xmit_errno(err)) != 0)
goto out_unlock;
- if (need_rls_dev)
- dev_put(dev);
+ dev_put(dev);
return len;
out_free:
kfree_skb(skb);
out_unlock:
- if (dev && need_rls_dev)
+ if (dev)
dev_put(dev);
out:
return err;
@@ -2403,6 +2575,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
+
if (po->tx_ring.pg_vec)
return tpacket_snd(po, msg);
else
@@ -2427,13 +2600,18 @@ static int packet_release(struct socket *sock)
net = sock_net(sk);
po = pkt_sk(sk);
- spin_lock_bh(&net->packet.sklist_lock);
+ mutex_lock(&net->packet.sklist_lock);
sk_del_node_init_rcu(sk);
+ mutex_unlock(&net->packet.sklist_lock);
+
+ preempt_disable();
sock_prot_inuse_add(net, sk->sk_prot, -1);
- spin_unlock_bh(&net->packet.sklist_lock);
+ preempt_enable();
spin_lock(&po->bind_lock);
unregister_prot_hook(sk, false);
+ packet_cached_dev_reset(po);
+
if (po->prot_hook.dev) {
dev_put(po->prot_hook.dev);
po->prot_hook.dev = NULL;
@@ -2442,13 +2620,15 @@ static int packet_release(struct socket *sock)
packet_flush_mclist(sk);
- memset(&req_u, 0, sizeof(req_u));
-
- if (po->rx_ring.pg_vec)
+ if (po->rx_ring.pg_vec) {
+ memset(&req_u, 0, sizeof(req_u));
packet_set_ring(sk, &req_u, 1, 0);
+ }
- if (po->tx_ring.pg_vec)
+ if (po->tx_ring.pg_vec) {
+ memset(&req_u, 0, sizeof(req_u));
packet_set_ring(sk, &req_u, 1, 1);
+ }
fanout_release(sk);
@@ -2462,6 +2642,7 @@ static int packet_release(struct socket *sock)
/* Purge queues */
skb_queue_purge(&sk->sk_receive_queue);
+ packet_free_pending(po);
sk_refcnt_debug_release(sk);
sock_put(sk);
@@ -2472,9 +2653,12 @@ static int packet_release(struct socket *sock)
* Attach a packet hook.
*/
-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
+static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
{
struct packet_sock *po = pkt_sk(sk);
+ const struct net_device *dev_curr;
+ __be16 proto_curr;
+ bool need_rehook;
if (po->fanout) {
if (dev)
@@ -2484,18 +2668,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
}
lock_sock(sk);
-
spin_lock(&po->bind_lock);
- unregister_prot_hook(sk, true);
- po->num = protocol;
- po->prot_hook.type = protocol;
- if (po->prot_hook.dev)
- dev_put(po->prot_hook.dev);
- po->prot_hook.dev = dev;
- po->ifindex = dev ? dev->ifindex : 0;
+ proto_curr = po->prot_hook.type;
+ dev_curr = po->prot_hook.dev;
+
+ need_rehook = proto_curr != proto || dev_curr != dev;
- if (protocol == 0)
+ if (need_rehook) {
+ unregister_prot_hook(sk, true);
+
+ po->num = proto;
+ po->prot_hook.type = proto;
+
+ if (po->prot_hook.dev)
+ dev_put(po->prot_hook.dev);
+
+ po->prot_hook.dev = dev;
+
+ po->ifindex = dev ? dev->ifindex : 0;
+ packet_cached_dev_assign(po, dev);
+ }
+
+ if (proto == 0 || !need_rehook)
goto out_unlock;
if (!dev || (dev->flags & IFF_UP)) {
@@ -2585,7 +2780,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
- if (!capable(CAP_NET_RAW))
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
return -EPERM;
if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
sock->type != SOCK_PACKET)
@@ -2607,6 +2802,13 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
po = pkt_sk(sk);
sk->sk_family = PF_PACKET;
po->num = proto;
+ po->xmit = dev_queue_xmit;
+
+ err = packet_alloc_pending(po);
+ if (err)
+ goto out2;
+
+ packet_cached_dev_reset(po);
sk->sk_destruct = packet_sock_destruct;
sk_refcnt_debug_inc(sk);
@@ -2629,57 +2831,17 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
register_prot_hook(sk);
}
- spin_lock_bh(&net->packet.sklist_lock);
+ mutex_lock(&net->packet.sklist_lock);
sk_add_node_rcu(sk, &net->packet.sklist);
+ mutex_unlock(&net->packet.sklist_lock);
+
+ preempt_disable();
sock_prot_inuse_add(net, &packet_proto, 1);
- spin_unlock_bh(&net->packet.sklist_lock);
+ preempt_enable();
return 0;
-out:
- return err;
-}
-
-static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
-{
- struct sock_exterr_skb *serr;
- struct sk_buff *skb, *skb2;
- int copied, err;
-
- err = -EAGAIN;
- skb = skb_dequeue(&sk->sk_error_queue);
- if (skb == NULL)
- goto out;
-
- copied = skb->len;
- if (copied > len) {
- msg->msg_flags |= MSG_TRUNC;
- copied = len;
- }
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
- if (err)
- goto out_free_skb;
-
- sock_recv_timestamp(msg, sk, skb);
-
- serr = SKB_EXT_ERR(skb);
- put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
- sizeof(serr->ee), &serr->ee);
-
- msg->msg_flags |= MSG_ERRQUEUE;
- err = copied;
-
- /* Reset and regenerate socket error */
- spin_lock_bh(&sk->sk_error_queue.lock);
- sk->sk_err = 0;
- if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
- sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_bh(&sk->sk_error_queue.lock);
- sk->sk_error_report(sk);
- } else
- spin_unlock_bh(&sk->sk_error_queue.lock);
-
-out_free_skb:
- kfree_skb(skb);
+out2:
+ sk_free(sk);
out:
return err;
}
@@ -2695,7 +2857,6 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
- struct sockaddr_ll *sll;
int vnet_hdr_len = 0;
err = -EINVAL;
@@ -2709,7 +2870,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
#endif
if (flags & MSG_ERRQUEUE) {
- err = packet_recv_error(sk, msg, len);
+ err = sock_recv_errqueue(sk, msg, len,
+ SOL_PACKET, PACKET_TX_TIMESTAMP);
goto out;
}
@@ -2778,22 +2940,10 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
goto out_free;
}
- /*
- * If the address length field is there to be filled in, we fill
- * it in now.
+ /* You lose any data beyond the buffer you gave. If it worries
+ * a user program they can ask the device for its MTU
+ * anyway.
*/
-
- sll = &PACKET_SKB_CB(skb)->sa.ll;
- if (sock->type == SOCK_PACKET)
- msg->msg_namelen = sizeof(struct sockaddr_pkt);
- else
- msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
-
- /*
- * You lose any data beyond the buffer you gave. If it worries a
- * user program they can ask the device for its MTU anyway.
- */
-
copied = skb->len;
if (copied > len) {
copied = len;
@@ -2806,9 +2956,21 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
sock_recv_ts_and_drops(msg, sk, skb);
- if (msg->msg_name)
+ if (msg->msg_name) {
+ /* If the address length field is there to be filled
+ * in, we fill it in now.
+ */
+ if (sock->type == SOCK_PACKET) {
+ __sockaddr_check_size(sizeof(struct sockaddr_pkt));
+ msg->msg_namelen = sizeof(struct sockaddr_pkt);
+ } else {
+ struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+ msg->msg_namelen = sll->sll_halen +
+ offsetof(struct sockaddr_ll, sll_addr);
+ }
memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
msg->msg_namelen);
+ }
if (pkt_sk(sk)->auxdata) {
struct tpacket_auxdata aux;
@@ -2822,11 +2984,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
aux.tp_net = skb_network_offset(skb);
if (vlan_tx_tag_present(skb)) {
aux.tp_vlan_tci = vlan_tx_tag_get(skb);
- aux.tp_status |= TP_STATUS_VLAN_VALID;
+ aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
+ aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
} else {
aux.tp_vlan_tci = 0;
+ aux.tp_vlan_tpid = 0;
}
- aux.tp_padding = 0;
put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
}
@@ -2852,12 +3015,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
return -EOPNOTSUPP;
uaddr->sa_family = AF_PACKET;
+ memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
if (dev)
- strncpy(uaddr->sa_data, dev->name, 14);
- else
- memset(uaddr->sa_data, 0, 14);
+ strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
rcu_read_unlock();
*uaddr_len = sizeof(*uaddr);
@@ -3215,6 +3377,31 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return fanout_add(sk, val & 0xffff, val >> 16);
}
+ case PACKET_TX_HAS_OFF:
+ {
+ unsigned int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+ return -EBUSY;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ po->tp_tx_has_off = !!val;
+ return 0;
+ }
+ case PACKET_QDISC_BYPASS:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -3228,8 +3415,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
void *data = &val;
- struct tpacket_stats st;
- union tpacket_stats_u st_u;
+ union tpacket_stats_u st;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3243,22 +3429,20 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
switch (optname) {
case PACKET_STATISTICS:
spin_lock_bh(&sk->sk_receive_queue.lock);
+ memcpy(&st, &po->stats, sizeof(st));
+ memset(&po->stats, 0, sizeof(po->stats));
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
if (po->tp_version == TPACKET_V3) {
lv = sizeof(struct tpacket_stats_v3);
- memcpy(&st_u.stats3, &po->stats,
- sizeof(struct tpacket_stats));
- st_u.stats3.tp_freeze_q_cnt =
- po->stats_u.stats3.tp_freeze_q_cnt;
- st_u.stats3.tp_packets += po->stats.tp_drops;
- data = &st_u.stats3;
+ st.stats3.tp_packets += st.stats3.tp_drops;
+ data = &st.stats3;
} else {
lv = sizeof(struct tpacket_stats);
- st = po->stats;
- st.tp_packets += st.tp_drops;
- data = &st;
+ st.stats1.tp_packets += st.stats1.tp_drops;
+ data = &st.stats1;
}
- memset(&po->stats, 0, sizeof(st));
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+
break;
case PACKET_AUXDATA:
val = po->auxdata;
@@ -3303,9 +3487,16 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
case PACKET_FANOUT:
val = (po->fanout ?
((u32)po->fanout->id |
- ((u32)po->fanout->type << 16)) :
+ ((u32)po->fanout->type << 16) |
+ ((u32)po->fanout->flags << 24)) :
0);
break;
+ case PACKET_TX_HAS_OFF:
+ val = po->tp_tx_has_off;
+ break;
+ case PACKET_QDISC_BYPASS:
+ val = packet_use_direct_xmit(po);
+ break;
default:
return -ENOPROTOOPT;
}
@@ -3320,15 +3511,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
}
-static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
+static int packet_notifier(struct notifier_block *this,
+ unsigned long msg, void *ptr)
{
struct sock *sk;
- struct hlist_node *node;
- struct net_device *dev = data;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
rcu_read_lock();
- sk_for_each_rcu(sk, node, &net->packet.sklist) {
+ sk_for_each_rcu(sk, &net->packet.sklist) {
struct packet_sock *po = pkt_sk(sk);
switch (msg) {
@@ -3347,6 +3538,7 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
sk->sk_error_report(sk);
}
if (msg == NETDEV_UNREGISTER) {
+ packet_cached_dev_reset(po);
po->ifindex = -1;
if (po->prot_hook.dev)
dev_put(po->prot_hook.dev);
@@ -3496,34 +3688,26 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
static char *alloc_one_pg_vec_page(unsigned long order)
{
- char *buffer = NULL;
+ char *buffer;
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
__GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
buffer = (char *) __get_free_pages(gfp_flags, order);
-
if (buffer)
return buffer;
- /*
- * __get_free_pages failed, fall back to vmalloc
- */
+ /* __get_free_pages failed, fall back to vmalloc */
buffer = vzalloc((1 << order) * PAGE_SIZE);
-
if (buffer)
return buffer;
- /*
- * vmalloc failed, lets dig into swap here
- */
+ /* vmalloc failed, lets dig into swap here */
gfp_flags &= ~__GFP_NORETRY;
- buffer = (char *)__get_free_pages(gfp_flags, order);
+ buffer = (char *) __get_free_pages(gfp_flags, order);
if (buffer)
return buffer;
- /*
- * complete and utter failure
- */
+ /* complete and utter failure */
return NULL;
}
@@ -3578,7 +3762,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (!closing) {
if (atomic_read(&po->mapped))
goto out;
- if (atomic_read(&rb->pending))
+ if (packet_read_pending(rb))
goto out;
}
@@ -3630,7 +3814,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
*/
if (!tx_ring)
init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
- break;
+ break;
default:
break;
}
@@ -3853,7 +4037,7 @@ static int packet_seq_show(struct seq_file *seq, void *v)
po->ifindex,
po->running,
atomic_read(&s->sk_rmem_alloc),
- sock_i_uid(s),
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
sock_i_ino(s));
}
@@ -3885,10 +4069,10 @@ static const struct file_operations packet_seq_fops = {
static int __net_init packet_net_init(struct net *net)
{
- spin_lock_init(&net->packet.sklist_lock);
+ mutex_init(&net->packet.sklist_lock);
INIT_HLIST_HEAD(&net->packet.sklist);
- if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
+ if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
return -ENOMEM;
return 0;
@@ -3896,7 +4080,7 @@ static int __net_init packet_net_init(struct net *net)
static void __net_exit packet_net_exit(struct net *net)
{
- proc_net_remove(net, "packet");
+ remove_proc_entry("packet", net->proc_net);
}
static struct pernet_operations packet_net_ops = {
diff --git a/net/packet/diag.c b/net/packet/diag.c
new file mode 100644
index 00000000000..92f2c7107ee
--- /dev/null
+++ b/net/packet/diag.c
@@ -0,0 +1,264 @@
+#include <linux/module.h>
+#include <linux/sock_diag.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/packet_diag.h>
+#include <linux/percpu.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "internal.h"
+
+static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
+{
+ struct packet_diag_info pinfo;
+
+ pinfo.pdi_index = po->ifindex;
+ pinfo.pdi_version = po->tp_version;
+ pinfo.pdi_reserve = po->tp_reserve;
+ pinfo.pdi_copy_thresh = po->copy_thresh;
+ pinfo.pdi_tstamp = po->tp_tstamp;
+
+ pinfo.pdi_flags = 0;
+ if (po->running)
+ pinfo.pdi_flags |= PDI_RUNNING;
+ if (po->auxdata)
+ pinfo.pdi_flags |= PDI_AUXDATA;
+ if (po->origdev)
+ pinfo.pdi_flags |= PDI_ORIGDEV;
+ if (po->has_vnet_hdr)
+ pinfo.pdi_flags |= PDI_VNETHDR;
+ if (po->tp_loss)
+ pinfo.pdi_flags |= PDI_LOSS;
+
+ return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo);
+}
+
+static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb)
+{
+ struct nlattr *mca;
+ struct packet_mclist *ml;
+
+ mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST);
+ if (!mca)
+ return -EMSGSIZE;
+
+ rtnl_lock();
+ for (ml = po->mclist; ml; ml = ml->next) {
+ struct packet_diag_mclist *dml;
+
+ dml = nla_reserve_nohdr(nlskb, sizeof(*dml));
+ if (!dml) {
+ rtnl_unlock();
+ nla_nest_cancel(nlskb, mca);
+ return -EMSGSIZE;
+ }
+
+ dml->pdmc_index = ml->ifindex;
+ dml->pdmc_type = ml->type;
+ dml->pdmc_alen = ml->alen;
+ dml->pdmc_count = ml->count;
+ BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr));
+ memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr));
+ }
+
+ rtnl_unlock();
+ nla_nest_end(nlskb, mca);
+
+ return 0;
+}
+
+static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
+ struct sk_buff *nlskb)
+{
+ struct packet_diag_ring pdr;
+
+ if (!ring->pg_vec || ((ver > TPACKET_V2) &&
+ (nl_type == PACKET_DIAG_TX_RING)))
+ return 0;
+
+ pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
+ pdr.pdr_block_nr = ring->pg_vec_len;
+ pdr.pdr_frame_size = ring->frame_size;
+ pdr.pdr_frame_nr = ring->frame_max + 1;
+
+ if (ver > TPACKET_V2) {
+ pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+ pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
+ pdr.pdr_features = ring->prb_bdqc.feature_req_word;
+ } else {
+ pdr.pdr_retire_tmo = 0;
+ pdr.pdr_sizeof_priv = 0;
+ pdr.pdr_features = 0;
+ }
+
+ return nla_put(nlskb, nl_type, sizeof(pdr), &pdr);
+}
+
+static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb)
+{
+ int ret;
+
+ mutex_lock(&po->pg_vec_lock);
+ ret = pdiag_put_ring(&po->rx_ring, po->tp_version,
+ PACKET_DIAG_RX_RING, skb);
+ if (!ret)
+ ret = pdiag_put_ring(&po->tx_ring, po->tp_version,
+ PACKET_DIAG_TX_RING, skb);
+ mutex_unlock(&po->pg_vec_lock);
+
+ return ret;
+}
+
+static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb)
+{
+ int ret = 0;
+
+ mutex_lock(&fanout_mutex);
+ if (po->fanout) {
+ u32 val;
+
+ val = (u32)po->fanout->id | ((u32)po->fanout->type << 16);
+ ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val);
+ }
+ mutex_unlock(&fanout_mutex);
+
+ return ret;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct packet_diag_req *req,
+ bool may_report_filterinfo,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+ struct nlmsghdr *nlh;
+ struct packet_diag_msg *rp;
+ struct packet_sock *po = pkt_sk(sk);
+
+ nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rp = nlmsg_data(nlh);
+ rp->pdiag_family = AF_PACKET;
+ rp->pdiag_type = sk->sk_type;
+ rp->pdiag_num = ntohs(po->num);
+ rp->pdiag_ino = sk_ino;
+ sock_diag_save_cookie(sk, rp->pdiag_cookie);
+
+ if ((req->pdiag_show & PACKET_SHOW_INFO) &&
+ pdiag_put_info(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_INFO) &&
+ nla_put_u32(skb, PACKET_DIAG_UID,
+ from_kuid_munged(user_ns, sock_i_uid(sk))))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_MCLIST) &&
+ pdiag_put_mclist(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_RING_CFG) &&
+ pdiag_put_rings_cfg(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_FANOUT) &&
+ pdiag_put_fanout(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_MEMINFO) &&
+ sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_FILTER) &&
+ sock_diag_put_filterinfo(may_report_filterinfo, sk, skb,
+ PACKET_DIAG_FILTER))
+ goto out_nlmsg_trim;
+
+ return nlmsg_end(skb, nlh);
+
+out_nlmsg_trim:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int num = 0, s_num = cb->args[0];
+ struct packet_diag_req *req;
+ struct net *net;
+ struct sock *sk;
+ bool may_report_filterinfo;
+
+ net = sock_net(skb->sk);
+ req = nlmsg_data(cb->nlh);
+ may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+
+ mutex_lock(&net->packet.sklist_lock);
+ sk_for_each(sk, &net->packet.sklist) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+
+ if (sk_diag_fill(sk, skb, req,
+ may_report_filterinfo,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ sock_i_ino(sk)) < 0)
+ goto done;
+next:
+ num++;
+ }
+done:
+ mutex_unlock(&net->packet.sklist_lock);
+ cb->args[0] = num;
+
+ return skb->len;
+}
+
+static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct packet_diag_req);
+ struct net *net = sock_net(skb->sk);
+ struct packet_diag_req *req;
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ req = nlmsg_data(h);
+ /* Make it possible to support protocol filtering later */
+ if (req->sdiag_protocol)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = packet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ } else
+ return -EOPNOTSUPP;
+}
+
+static const struct sock_diag_handler packet_diag_handler = {
+ .family = AF_PACKET,
+ .dump = packet_diag_handler_dump,
+};
+
+static int __init packet_diag_init(void)
+{
+ return sock_diag_register(&packet_diag_handler);
+}
+
+static void __exit packet_diag_exit(void)
+{
+ sock_diag_unregister(&packet_diag_handler);
+}
+
+module_init(packet_diag_init);
+module_exit(packet_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */);
diff --git a/net/packet/internal.h b/net/packet/internal.h
new file mode 100644
index 00000000000..eb9580a6b25
--- /dev/null
+++ b/net/packet/internal.h
@@ -0,0 +1,126 @@
+#ifndef __PACKET_INTERNAL_H__
+#define __PACKET_INTERNAL_H__
+
+struct packet_mclist {
+ struct packet_mclist *next;
+ int ifindex;
+ int count;
+ unsigned short type;
+ unsigned short alen;
+ unsigned char addr[MAX_ADDR_LEN];
+};
+
+/* kbdq - kernel block descriptor queue */
+struct tpacket_kbdq_core {
+ struct pgv *pkbdq;
+ unsigned int feature_req_word;
+ unsigned int hdrlen;
+ unsigned char reset_pending_on_curr_blk;
+ unsigned char delete_blk_timer;
+ unsigned short kactive_blk_num;
+ unsigned short blk_sizeof_priv;
+
+ /* last_kactive_blk_num:
+ * trick to see if user-space has caught up
+ * in order to avoid refreshing timer when every single pkt arrives.
+ */
+ unsigned short last_kactive_blk_num;
+
+ char *pkblk_start;
+ char *pkblk_end;
+ int kblk_size;
+ unsigned int knum_blocks;
+ uint64_t knxt_seq_num;
+ char *prev;
+ char *nxt_offset;
+ struct sk_buff *skb;
+
+ atomic_t blk_fill_in_prog;
+
+ /* Default is set to 8ms */
+#define DEFAULT_PRB_RETIRE_TOV (8)
+
+ unsigned short retire_blk_tov;
+ unsigned short version;
+ unsigned long tov_in_jiffies;
+
+ /* timer to retire an outstanding block */
+ struct timer_list retire_blk_timer;
+};
+
+struct pgv {
+ char *buffer;
+};
+
+struct packet_ring_buffer {
+ struct pgv *pg_vec;
+
+ unsigned int head;
+ unsigned int frames_per_block;
+ unsigned int frame_size;
+ unsigned int frame_max;
+
+ unsigned int pg_vec_order;
+ unsigned int pg_vec_pages;
+ unsigned int pg_vec_len;
+
+ unsigned int __percpu *pending_refcnt;
+
+ struct tpacket_kbdq_core prb_bdqc;
+};
+
+extern struct mutex fanout_mutex;
+#define PACKET_FANOUT_MAX 256
+
+struct packet_fanout {
+#ifdef CONFIG_NET_NS
+ struct net *net;
+#endif
+ unsigned int num_members;
+ u16 id;
+ u8 type;
+ u8 flags;
+ atomic_t rr_cur;
+ struct list_head list;
+ struct sock *arr[PACKET_FANOUT_MAX];
+ int next[PACKET_FANOUT_MAX];
+ spinlock_t lock;
+ atomic_t sk_ref;
+ struct packet_type prot_hook ____cacheline_aligned_in_smp;
+};
+
+struct packet_sock {
+ /* struct sock has to be the first member of packet_sock */
+ struct sock sk;
+ struct packet_fanout *fanout;
+ union tpacket_stats_u stats;
+ struct packet_ring_buffer rx_ring;
+ struct packet_ring_buffer tx_ring;
+ int copy_thresh;
+ spinlock_t bind_lock;
+ struct mutex pg_vec_lock;
+ unsigned int running:1, /* prot_hook is attached*/
+ auxdata:1,
+ origdev:1,
+ has_vnet_hdr:1;
+ int ifindex; /* bound device */
+ __be16 num;
+ struct packet_mclist *mclist;
+ atomic_t mapped;
+ enum tpacket_versions tp_version;
+ unsigned int tp_hdrlen;
+ unsigned int tp_reserve;
+ unsigned int tp_loss:1;
+ unsigned int tp_tx_has_off:1;
+ unsigned int tp_tstamp;
+ struct net_device __rcu *cached_dev;
+ int (*xmit)(struct sk_buff *skb);
+ struct packet_type prot_hook ____cacheline_aligned_in_smp;
+};
+
+static struct packet_sock *pkt_sk(struct sock *sk)
+{
+ return (struct packet_sock *)sk;
+}
+
+#endif