Diffstat (limited to 'net/packet')
-rw-r--r--  net/packet/Kconfig     |    8
-rw-r--r--  net/packet/Makefile    |    2
-rw-r--r--  net/packet/af_packet.c | 2191
-rw-r--r--  net/packet/diag.c      |  264
-rw-r--r--  net/packet/internal.h  |  126
5 files changed, 2167 insertions, 424 deletions
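
The headline change in this diff is Chetan Loke's TPACKET_V3 block abstraction layer for the mmap'ed RX ring, alongside PACKET_FANOUT demultiplexing, direct xmit, and the new packet_diag monitoring module. For orientation, here is a minimal user-space sketch of bringing a TPACKET_V3 ring up against the interfaces added below; the block/frame geometry and the 60 ms retire timeout are illustrative values only, and error handling is omitted:

/* Sketch only: attach a TPACKET_V3 RX ring.  Geometry and the retire
 * timeout are example values; all error handling is omitted. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>

static void *setup_v3_ring(int *out_fd, struct tpacket_req3 *req)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int version = TPACKET_V3;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

	memset(req, 0, sizeof(*req));
	req->tp_block_size	 = 1 << 20;	/* 1 MiB per block */
	req->tp_block_nr	 = 8;
	req->tp_frame_size	 = 1 << 11;	/* still required, though V3 packs variable-sized frames */
	req->tp_frame_nr	 = (8 * (1 << 20)) / (1 << 11);
	req->tp_retire_blk_tov	 = 60;		/* ms; 0 lets prb_calc_retire_blk_tmo() pick one */
	req->tp_feature_req_word = TP_FT_REQ_FILL_RXHASH;
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));

	*out_fd = fd;
	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}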
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
index 0060e3b396b..cc55b35f80e 100644
--- a/net/packet/Kconfig
+++ b/net/packet/Kconfig
@@ -14,3 +14,11 @@ config PACKET
 	  be called af_packet.
 
 	  If unsure, say Y.
+
+config PACKET_DIAG
+	tristate "Packet: sockets monitoring interface"
+	depends on PACKET
+	default n
+	---help---
+	  Support for PF_PACKET sockets monitoring interface used by the ss tool.
+	  If unsure, say Y.
diff --git a/net/packet/Makefile b/net/packet/Makefile
index 81183eabfde..9df61347a3c 100644
--- a/net/packet/Makefile
+++ b/net/packet/Makefile
@@ -3,3 +3,5 @@
 #
 
 obj-$(CONFIG_PACKET) += af_packet.o
+obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o
+af_packet_diag-y += diag.o
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 246a04a1323..b85c67ccb79 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,6 +40,10 @@
  *					byte arrays at the end of sockaddr_ll
  *					and packet_mreq.
  *		Johann Baudy	:	Added TX RING.
+ *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
+ *					layer.
+ *					Copyright (C) 2011, <lokec@ccs.neu.edu>
+ *
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -69,7 +73,6 @@
 #include <net/sock.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <asm/page.h>
@@ -85,11 +88,13 @@
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
-
+#include <linux/percpu.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
 
+#include "internal.h"
+
 /*
    Assumptions:
    - if device has no dev->hard_header routine, it adds and removes ll header
@@ -143,14 +148,6 @@ dev->hard_header == NULL (ll header is added by device, we cannot control it)
 
 /* Private packet socket structures. */
 
-struct packet_mclist {
-	struct packet_mclist	*next;
-	int			ifindex;
-	int			count;
-	unsigned short		type;
-	unsigned short		alen;
-	unsigned char		addr[MAX_ADDR_LEN];
-};
 
 /* identical to struct packet_mreq except it has
  * a longer address field.
*/ @@ -161,58 +158,63 @@ struct packet_mreq_max {  	unsigned char	mr_address[MAX_ADDR_LEN];  }; -static int packet_set_ring(struct sock *sk, struct tpacket_req *req, +union tpacket_uhdr { +	struct tpacket_hdr  *h1; +	struct tpacket2_hdr *h2; +	struct tpacket3_hdr *h3; +	void *raw; +}; + +static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,  		int closing, int tx_ring); -#define PGV_FROM_VMALLOC 1 -struct pgv { -	char *buffer; -}; +#define V3_ALIGNMENT	(8) -struct packet_ring_buffer { -	struct pgv		*pg_vec; -	unsigned int		head; -	unsigned int		frames_per_block; -	unsigned int		frame_size; -	unsigned int		frame_max; +#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) -	unsigned int		pg_vec_order; -	unsigned int		pg_vec_pages; -	unsigned int		pg_vec_len; +#define BLK_PLUS_PRIV(sz_of_priv) \ +	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) -	atomic_t		pending; -}; +#define PGV_FROM_VMALLOC 1 + +#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status) +#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts) +#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt) +#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len) +#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num) +#define BLOCK_O2PRIV(x)	((x)->offset_to_priv) +#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))  struct packet_sock;  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, +		       struct packet_type *pt, struct net_device *orig_dev); +static void *packet_previous_frame(struct packet_sock *po, +		struct packet_ring_buffer *rb, +		int status); +static void packet_increment_head(struct packet_ring_buffer *buff); +static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, +			struct tpacket_block_desc *); +static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, +			struct packet_sock *); +static void prb_retire_current_block(struct tpacket_kbdq_core *, +		struct packet_sock *, unsigned int status); +static int prb_queue_frozen(struct tpacket_kbdq_core *); +static void prb_open_block(struct tpacket_kbdq_core *, +		struct tpacket_block_desc *); +static void prb_retire_rx_blk_timer_expired(unsigned long); +static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); +static void prb_init_blk_timer(struct packet_sock *, +		struct tpacket_kbdq_core *, +		void (*func) (unsigned long)); +static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); +static void prb_clear_rxhash(struct tpacket_kbdq_core *, +		struct tpacket3_hdr *); +static void prb_fill_vlan_info(struct tpacket_kbdq_core *, +		struct tpacket3_hdr *);  static void packet_flush_mclist(struct sock *sk); -struct packet_sock { -	/* struct sock has to be the first member of packet_sock */ -	struct sock		sk; -	struct tpacket_stats	stats; -	struct packet_ring_buffer	rx_ring; -	struct packet_ring_buffer	tx_ring; -	int			copy_thresh; -	spinlock_t		bind_lock; -	struct mutex		pg_vec_lock; -	unsigned int		running:1,	/* prot_hook is attached*/ -				auxdata:1, -				origdev:1, -				has_vnet_hdr:1; -	int			ifindex;	/* bound device		*/ -	__be16			num; -	struct packet_mclist	*mclist; -	atomic_t		mapped; -	enum tpacket_versions	tp_version; -	unsigned int		tp_hdrlen; -	unsigned int		tp_reserve; -	unsigned int		tp_loss:1; -	unsigned int		tp_tstamp; -	struct packet_type	prot_hook ____cacheline_aligned_in_smp; -}; -  struct packet_skb_cb {  	unsigned int origlen;  	union { @@ -223,6 +225,165 @@ struct packet_skb_cb {  #define 
PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb)) +#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) +#define GET_PBLOCK_DESC(x, bid)	\ +	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) +#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\ +	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) +#define GET_NEXT_PRB_BLK_NUM(x) \ +	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ +	((x)->kactive_blk_num+1) : 0) + +static void __fanout_unlink(struct sock *sk, struct packet_sock *po); +static void __fanout_link(struct sock *sk, struct packet_sock *po); + +static int packet_direct_xmit(struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; +	const struct net_device_ops *ops = dev->netdev_ops; +	netdev_features_t features; +	struct netdev_queue *txq; +	int ret = NETDEV_TX_BUSY; +	u16 queue_map; + +	if (unlikely(!netif_running(dev) || +		     !netif_carrier_ok(dev))) +		goto drop; + +	features = netif_skb_features(skb); +	if (skb_needs_linearize(skb, features) && +	    __skb_linearize(skb)) +		goto drop; + +	queue_map = skb_get_queue_mapping(skb); +	txq = netdev_get_tx_queue(dev, queue_map); + +	local_bh_disable(); + +	HARD_TX_LOCK(dev, txq, smp_processor_id()); +	if (!netif_xmit_frozen_or_drv_stopped(txq)) { +		ret = ops->ndo_start_xmit(skb, dev); +		if (ret == NETDEV_TX_OK) +			txq_trans_update(txq); +	} +	HARD_TX_UNLOCK(dev, txq); + +	local_bh_enable(); + +	if (!dev_xmit_complete(ret)) +		kfree_skb(skb); + +	return ret; +drop: +	atomic_long_inc(&dev->tx_dropped); +	kfree_skb(skb); +	return NET_XMIT_DROP; +} + +static struct net_device *packet_cached_dev_get(struct packet_sock *po) +{ +	struct net_device *dev; + +	rcu_read_lock(); +	dev = rcu_dereference(po->cached_dev); +	if (likely(dev)) +		dev_hold(dev); +	rcu_read_unlock(); + +	return dev; +} + +static void packet_cached_dev_assign(struct packet_sock *po, +				     struct net_device *dev) +{ +	rcu_assign_pointer(po->cached_dev, dev); +} + +static void packet_cached_dev_reset(struct packet_sock *po) +{ +	RCU_INIT_POINTER(po->cached_dev, NULL); +} + +static bool packet_use_direct_xmit(const struct packet_sock *po) +{ +	return po->xmit == packet_direct_xmit; +} + +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +{ +	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; +} + +static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +{ +	const struct net_device_ops *ops = dev->netdev_ops; +	u16 queue_index; + +	if (ops->ndo_select_queue) { +		queue_index = ops->ndo_select_queue(dev, skb, NULL, +						    __packet_pick_tx_queue); +		queue_index = netdev_cap_txqueue(dev, queue_index); +	} else { +		queue_index = __packet_pick_tx_queue(dev, skb); +	} + +	skb_set_queue_mapping(skb, queue_index); +} + +/* register_prot_hook must be invoked with the po->bind_lock held, + * or from a context in which asynchronous accesses to the packet + * socket is not possible (packet_create()). + */ +static void register_prot_hook(struct sock *sk) +{ +	struct packet_sock *po = pkt_sk(sk); + +	if (!po->running) { +		if (po->fanout) +			__fanout_link(sk, po); +		else +			dev_add_pack(&po->prot_hook); + +		sock_hold(sk); +		po->running = 1; +	} +} + +/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock + * held.   If the sync parameter is true, we will temporarily drop + * the po->bind_lock and do a synchronize_net to make sure no + * asynchronous packet processing paths still refer to the elements + * of po->prot_hook.  
If the sync parameter is false, it is the + * callers responsibility to take care of this. + */ +static void __unregister_prot_hook(struct sock *sk, bool sync) +{ +	struct packet_sock *po = pkt_sk(sk); + +	po->running = 0; + +	if (po->fanout) +		__fanout_unlink(sk, po); +	else +		__dev_remove_pack(&po->prot_hook); + +	__sock_put(sk); + +	if (sync) { +		spin_unlock(&po->bind_lock); +		synchronize_net(); +		spin_lock(&po->bind_lock); +	} +} + +static void unregister_prot_hook(struct sock *sk, bool sync) +{ +	struct packet_sock *po = pkt_sk(sk); + +	if (po->running) +		__unregister_prot_hook(sk, sync); +} +  static inline __pure struct page *pgv_to_page(void *addr)  {  	if (is_vmalloc_addr(addr)) @@ -232,11 +393,7 @@ static inline __pure struct page *pgv_to_page(void *addr)  static void __packet_set_status(struct packet_sock *po, void *frame, int status)  { -	union { -		struct tpacket_hdr *h1; -		struct tpacket2_hdr *h2; -		void *raw; -	} h; +	union tpacket_uhdr h;  	h.raw = frame;  	switch (po->tp_version) { @@ -248,8 +405,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)  		h.h2->tp_status = status;  		flush_dcache_page(pgv_to_page(&h.h2->tp_status));  		break; +	case TPACKET_V3:  	default: -		pr_err("TPACKET version not supported\n"); +		WARN(1, "TPACKET version not supported.\n");  		BUG();  	} @@ -258,11 +416,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)  static int __packet_get_status(struct packet_sock *po, void *frame)  { -	union { -		struct tpacket_hdr *h1; -		struct tpacket2_hdr *h2; -		void *raw; -	} h; +	union tpacket_uhdr h;  	smp_rmb(); @@ -274,24 +428,74 @@ static int __packet_get_status(struct packet_sock *po, void *frame)  	case TPACKET_V2:  		flush_dcache_page(pgv_to_page(&h.h2->tp_status));  		return h.h2->tp_status; +	case TPACKET_V3:  	default: -		pr_err("TPACKET version not supported\n"); +		WARN(1, "TPACKET version not supported.\n");  		BUG();  		return 0;  	}  } +static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, +				   unsigned int flags) +{ +	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + +	if (shhwtstamps) { +		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && +		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) +			return TP_STATUS_TS_SYS_HARDWARE; +		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && +		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) +			return TP_STATUS_TS_RAW_HARDWARE; +	} + +	if (ktime_to_timespec_cond(skb->tstamp, ts)) +		return TP_STATUS_TS_SOFTWARE; + +	return 0; +} + +static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, +				    struct sk_buff *skb) +{ +	union tpacket_uhdr h; +	struct timespec ts; +	__u32 ts_status; + +	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) +		return 0; + +	h.raw = frame; +	switch (po->tp_version) { +	case TPACKET_V1: +		h.h1->tp_sec = ts.tv_sec; +		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; +		break; +	case TPACKET_V2: +		h.h2->tp_sec = ts.tv_sec; +		h.h2->tp_nsec = ts.tv_nsec; +		break; +	case TPACKET_V3: +	default: +		WARN(1, "TPACKET version not supported.\n"); +		BUG(); +	} + +	/* one flush is safe, as both fields always lie on the same cacheline */ +	flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); +	smp_wmb(); + +	return ts_status; +} +  static void *packet_lookup_frame(struct packet_sock *po,  		struct packet_ring_buffer *rb,  		unsigned int position,  		int status)  {  	unsigned int pg_vec_pos, frame_offset; -	union { -		struct tpacket_hdr 
*h1; -		struct tpacket2_hdr *h2; -		void *raw; -	} h; +	union tpacket_uhdr h;  	pg_vec_pos = position / rb->frames_per_block;  	frame_offset = position % rb->frames_per_block; @@ -305,14 +509,670 @@ static void *packet_lookup_frame(struct packet_sock *po,  	return h.raw;  } -static inline void *packet_current_frame(struct packet_sock *po, +static void *packet_current_frame(struct packet_sock *po,  		struct packet_ring_buffer *rb,  		int status)  {  	return packet_lookup_frame(po, rb, rb->head, status);  } -static inline void *packet_previous_frame(struct packet_sock *po, +static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) +{ +	del_timer_sync(&pkc->retire_blk_timer); +} + +static void prb_shutdown_retire_blk_timer(struct packet_sock *po, +		int tx_ring, +		struct sk_buff_head *rb_queue) +{ +	struct tpacket_kbdq_core *pkc; + +	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : +			GET_PBDQC_FROM_RB(&po->rx_ring); + +	spin_lock_bh(&rb_queue->lock); +	pkc->delete_blk_timer = 1; +	spin_unlock_bh(&rb_queue->lock); + +	prb_del_retire_blk_timer(pkc); +} + +static void prb_init_blk_timer(struct packet_sock *po, +		struct tpacket_kbdq_core *pkc, +		void (*func) (unsigned long)) +{ +	init_timer(&pkc->retire_blk_timer); +	pkc->retire_blk_timer.data = (long)po; +	pkc->retire_blk_timer.function = func; +	pkc->retire_blk_timer.expires = jiffies; +} + +static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) +{ +	struct tpacket_kbdq_core *pkc; + +	if (tx_ring) +		BUG(); + +	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : +			GET_PBDQC_FROM_RB(&po->rx_ring); +	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); +} + +static int prb_calc_retire_blk_tmo(struct packet_sock *po, +				int blk_size_in_bytes) +{ +	struct net_device *dev; +	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; +	struct ethtool_cmd ecmd; +	int err; +	u32 speed; + +	rtnl_lock(); +	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); +	if (unlikely(!dev)) { +		rtnl_unlock(); +		return DEFAULT_PRB_RETIRE_TOV; +	} +	err = __ethtool_get_settings(dev, &ecmd); +	speed = ethtool_cmd_speed(&ecmd); +	rtnl_unlock(); +	if (!err) { +		/* +		 * If the link speed is so slow you don't really +		 * need to worry about perf anyways +		 */ +		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { +			return DEFAULT_PRB_RETIRE_TOV; +		} else { +			msec = 1; +			div = speed / 1000; +		} +	} + +	mbits = (blk_size_in_bytes * 8) / (1024 * 1024); + +	if (div) +		mbits /= div; + +	tmo = mbits * msec; + +	if (div) +		return tmo+1; +	return tmo; +} + +static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, +			union tpacket_req_u *req_u) +{ +	p1->feature_req_word = req_u->req3.tp_feature_req_word; +} + +static void init_prb_bdqc(struct packet_sock *po, +			struct packet_ring_buffer *rb, +			struct pgv *pg_vec, +			union tpacket_req_u *req_u, int tx_ring) +{ +	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); +	struct tpacket_block_desc *pbd; + +	memset(p1, 0x0, sizeof(*p1)); + +	p1->knxt_seq_num = 1; +	p1->pkbdq = pg_vec; +	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; +	p1->pkblk_start	= pg_vec[0].buffer; +	p1->kblk_size = req_u->req3.tp_block_size; +	p1->knum_blocks	= req_u->req3.tp_block_nr; +	p1->hdrlen = po->tp_hdrlen; +	p1->version = po->tp_version; +	p1->last_kactive_blk_num = 0; +	po->stats.stats3.tp_freeze_q_cnt = 0; +	if (req_u->req3.tp_retire_blk_tov) +		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; +	else +		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, +						
req_u->req3.tp_block_size); +	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); +	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + +	prb_init_ft_ops(p1, req_u); +	prb_setup_retire_blk_timer(po, tx_ring); +	prb_open_block(p1, pbd); +} + +/*  Do NOT update the last_blk_num first. + *  Assumes sk_buff_head lock is held. + */ +static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) +{ +	mod_timer(&pkc->retire_blk_timer, +			jiffies + pkc->tov_in_jiffies); +	pkc->last_kactive_blk_num = pkc->kactive_blk_num; +} + +/* + * Timer logic: + * 1) We refresh the timer only when we open a block. + *    By doing this we don't waste cycles refreshing the timer + *	  on packet-by-packet basis. + * + * With a 1MB block-size, on a 1Gbps line, it will take + * i) ~8 ms to fill a block + ii) memcpy etc. + * In this cut we are not accounting for the memcpy time. + * + * So, if the user sets the 'tmo' to 10ms then the timer + * will never fire while the block is still getting filled + * (which is what we want). However, the user could choose + * to close a block early and that's fine. + * + * But when the timer does fire, we check whether or not to refresh it. + * Since the tmo granularity is in msecs, it is not too expensive + * to refresh the timer, lets say every '8' msecs. + * Either the user can set the 'tmo' or we can derive it based on + * a) line-speed and b) block-size. + * prb_calc_retire_blk_tmo() calculates the tmo. + * + */ +static void prb_retire_rx_blk_timer_expired(unsigned long data) +{ +	struct packet_sock *po = (struct packet_sock *)data; +	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); +	unsigned int frozen; +	struct tpacket_block_desc *pbd; + +	spin_lock(&po->sk.sk_receive_queue.lock); + +	frozen = prb_queue_frozen(pkc); +	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); + +	if (unlikely(pkc->delete_blk_timer)) +		goto out; + +	/* We only need to plug the race when the block is partially filled. +	 * tpacket_rcv: +	 *		lock(); increment BLOCK_NUM_PKTS; unlock() +	 *		copy_bits() is in progress ... +	 *		timer fires on other cpu: +	 *		we can't retire the current block because copy_bits +	 *		is in progress. +	 * +	 */ +	if (BLOCK_NUM_PKTS(pbd)) { +		while (atomic_read(&pkc->blk_fill_in_prog)) { +			/* Waiting for skb_copy_bits to finish... */ +			cpu_relax(); +		} +	} + +	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { +		if (!frozen) { +			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); +			if (!prb_dispatch_next_block(pkc, po)) +				goto refresh_timer; +			else +				goto out; +		} else { +			/* Case 1. Queue was frozen because user-space was +			 *	   lagging behind. +			 */ +			if (prb_curr_blk_in_use(pkc, pbd)) { +				/* +				 * Ok, user-space is still behind. +				 * So just refresh the timer. +				 */ +				goto refresh_timer; +			} else { +			       /* Case 2. queue was frozen,user-space caught up, +				* now the link went idle && the timer fired. +				* We don't have a block to close.So we open this +				* block and restart the timer. +				* opening a block thaws the queue,restarts timer +				* Thawing/timer-refresh is a side effect. 
+				*/ +				prb_open_block(pkc, pbd); +				goto out; +			} +		} +	} + +refresh_timer: +	_prb_refresh_rx_retire_blk_timer(pkc); + +out: +	spin_unlock(&po->sk.sk_receive_queue.lock); +} + +static void prb_flush_block(struct tpacket_kbdq_core *pkc1, +		struct tpacket_block_desc *pbd1, __u32 status) +{ +	/* Flush everything minus the block header */ + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 +	u8 *start, *end; + +	start = (u8 *)pbd1; + +	/* Skip the block header(we know header WILL fit in 4K) */ +	start += PAGE_SIZE; + +	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); +	for (; start < end; start += PAGE_SIZE) +		flush_dcache_page(pgv_to_page(start)); + +	smp_wmb(); +#endif + +	/* Now update the block status. */ + +	BLOCK_STATUS(pbd1) = status; + +	/* Flush the block header */ + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 +	start = (u8 *)pbd1; +	flush_dcache_page(pgv_to_page(start)); + +	smp_wmb(); +#endif +} + +/* + * Side effect: + * + * 1) flush the block + * 2) Increment active_blk_num + * + * Note:We DONT refresh the timer on purpose. + *	Because almost always the next block will be opened. + */ +static void prb_close_block(struct tpacket_kbdq_core *pkc1, +		struct tpacket_block_desc *pbd1, +		struct packet_sock *po, unsigned int stat) +{ +	__u32 status = TP_STATUS_USER | stat; + +	struct tpacket3_hdr *last_pkt; +	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; + +	if (po->stats.stats3.tp_drops) +		status |= TP_STATUS_LOSING; + +	last_pkt = (struct tpacket3_hdr *)pkc1->prev; +	last_pkt->tp_next_offset = 0; + +	/* Get the ts of the last pkt */ +	if (BLOCK_NUM_PKTS(pbd1)) { +		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; +		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec; +	} else { +		/* Ok, we tmo'd - so get the current time */ +		struct timespec ts; +		getnstimeofday(&ts); +		h1->ts_last_pkt.ts_sec = ts.tv_sec; +		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec; +	} + +	smp_wmb(); + +	/* Flush the block */ +	prb_flush_block(pkc1, pbd1, status); + +	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); +} + +static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) +{ +	pkc->reset_pending_on_curr_blk = 0; +} + +/* + * Side effect of opening a block: + * + * 1) prb_queue is thawed. + * 2) retire_blk_timer is refreshed. + * + */ +static void prb_open_block(struct tpacket_kbdq_core *pkc1, +	struct tpacket_block_desc *pbd1) +{ +	struct timespec ts; +	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; + +	smp_rmb(); + +	/* We could have just memset this but we will lose the +	 * flexibility of making the priv area sticky +	 */ + +	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; +	BLOCK_NUM_PKTS(pbd1) = 0; +	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); + +	getnstimeofday(&ts); + +	h1->ts_first_pkt.ts_sec = ts.tv_sec; +	h1->ts_first_pkt.ts_nsec = ts.tv_nsec; + +	pkc1->pkblk_start = (char *)pbd1; +	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); + +	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); +	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; + +	pbd1->version = pkc1->version; +	pkc1->prev = pkc1->nxt_offset; +	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; + +	prb_thaw_queue(pkc1); +	_prb_refresh_rx_retire_blk_timer(pkc1); + +	smp_wmb(); +} + +/* + * Queue freeze logic: + * 1) Assume tp_block_nr = 8 blocks. + * 2) At time 't0', user opens Rx ring. + * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 + * 4) user-space is either sleeping or processing block '0'. 
+ * 5) tpacket_rcv is currently filling block '7', since there is no space left, + *    it will close block-7,loop around and try to fill block '0'. + *    call-flow: + *    __packet_lookup_frame_in_block + *      prb_retire_current_block() + *      prb_dispatch_next_block() + *        |->(BLOCK_STATUS == USER) evaluates to true + *    5.1) Since block-0 is currently in-use, we just freeze the queue. + * 6) Now there are two cases: + *    6.1) Link goes idle right after the queue is frozen. + *         But remember, the last open_block() refreshed the timer. + *         When this timer expires,it will refresh itself so that we can + *         re-open block-0 in near future. + *    6.2) Link is busy and keeps on receiving packets. This is a simple + *         case and __packet_lookup_frame_in_block will check if block-0 + *         is free and can now be re-used. + */ +static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, +				  struct packet_sock *po) +{ +	pkc->reset_pending_on_curr_blk = 1; +	po->stats.stats3.tp_freeze_q_cnt++; +} + +#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) + +/* + * If the next block is free then we will dispatch it + * and return a good offset. + * Else, we will freeze the queue. + * So, caller must check the return value. + */ +static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, +		struct packet_sock *po) +{ +	struct tpacket_block_desc *pbd; + +	smp_rmb(); + +	/* 1. Get current block num */ +	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); + +	/* 2. If this block is currently in_use then freeze the queue */ +	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { +		prb_freeze_queue(pkc, po); +		return NULL; +	} + +	/* +	 * 3. +	 * open this block and return the offset where the first packet +	 * needs to get stored. +	 */ +	prb_open_block(pkc, pbd); +	return (void *)pkc->nxt_offset; +} + +static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, +		struct packet_sock *po, unsigned int status) +{ +	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); + +	/* retire/close the current block */ +	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { +		/* +		 * Plug the case where copy_bits() is in progress on +		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't +		 * have space to copy the pkt in the current block and +		 * called prb_retire_current_block() +		 * +		 * We don't need to worry about the TMO case because +		 * the timer-handler already handled this case. +		 */ +		if (!(status & TP_STATUS_BLK_TMO)) { +			while (atomic_read(&pkc->blk_fill_in_prog)) { +				/* Waiting for skb_copy_bits to finish... 
*/ +				cpu_relax(); +			} +		} +		prb_close_block(pkc, pbd, po, status); +		return; +	} +} + +static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, +				      struct tpacket_block_desc *pbd) +{ +	return TP_STATUS_USER & BLOCK_STATUS(pbd); +} + +static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) +{ +	return pkc->reset_pending_on_curr_blk; +} + +static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) +{ +	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb); +	atomic_dec(&pkc->blk_fill_in_prog); +} + +static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, +			struct tpacket3_hdr *ppd) +{ +	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); +} + +static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, +			struct tpacket3_hdr *ppd) +{ +	ppd->hv1.tp_rxhash = 0; +} + +static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, +			struct tpacket3_hdr *ppd) +{ +	if (vlan_tx_tag_present(pkc->skb)) { +		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); +		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); +		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; +	} else { +		ppd->hv1.tp_vlan_tci = 0; +		ppd->hv1.tp_vlan_tpid = 0; +		ppd->tp_status = TP_STATUS_AVAILABLE; +	} +} + +static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, +			struct tpacket3_hdr *ppd) +{ +	ppd->hv1.tp_padding = 0; +	prb_fill_vlan_info(pkc, ppd); + +	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) +		prb_fill_rxhash(pkc, ppd); +	else +		prb_clear_rxhash(pkc, ppd); +} + +static void prb_fill_curr_block(char *curr, +				struct tpacket_kbdq_core *pkc, +				struct tpacket_block_desc *pbd, +				unsigned int len) +{ +	struct tpacket3_hdr *ppd; + +	ppd  = (struct tpacket3_hdr *)curr; +	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); +	pkc->prev = curr; +	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); +	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); +	BLOCK_NUM_PKTS(pbd) += 1; +	atomic_inc(&pkc->blk_fill_in_prog); +	prb_run_all_ft_ops(pkc, ppd); +} + +/* Assumes caller has the sk->rx_queue.lock */ +static void *__packet_lookup_frame_in_block(struct packet_sock *po, +					    struct sk_buff *skb, +						int status, +					    unsigned int len +					    ) +{ +	struct tpacket_kbdq_core *pkc; +	struct tpacket_block_desc *pbd; +	char *curr, *end; + +	pkc = GET_PBDQC_FROM_RB(&po->rx_ring); +	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); + +	/* Queue is frozen when user space is lagging behind */ +	if (prb_queue_frozen(pkc)) { +		/* +		 * Check if that last block which caused the queue to freeze, +		 * is still in_use by user-space. +		 */ +		if (prb_curr_blk_in_use(pkc, pbd)) { +			/* Can't record this packet */ +			return NULL; +		} else { +			/* +			 * Ok, the block was released by user-space. +			 * Now let's open that block. +			 * opening a block also thaws the queue. +			 * Thawing is a side effect. 
+			 */ +			prb_open_block(pkc, pbd); +		} +	} + +	smp_mb(); +	curr = pkc->nxt_offset; +	pkc->skb = skb; +	end = (char *)pbd + pkc->kblk_size; + +	/* first try the current block */ +	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { +		prb_fill_curr_block(curr, pkc, pbd, len); +		return (void *)curr; +	} + +	/* Ok, close the current block */ +	prb_retire_current_block(pkc, po, 0); + +	/* Now, try to dispatch the next block */ +	curr = (char *)prb_dispatch_next_block(pkc, po); +	if (curr) { +		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); +		prb_fill_curr_block(curr, pkc, pbd, len); +		return (void *)curr; +	} + +	/* +	 * No free blocks are available.user_space hasn't caught up yet. +	 * Queue was just frozen and now this packet will get dropped. +	 */ +	return NULL; +} + +static void *packet_current_rx_frame(struct packet_sock *po, +					    struct sk_buff *skb, +					    int status, unsigned int len) +{ +	char *curr = NULL; +	switch (po->tp_version) { +	case TPACKET_V1: +	case TPACKET_V2: +		curr = packet_lookup_frame(po, &po->rx_ring, +					po->rx_ring.head, status); +		return curr; +	case TPACKET_V3: +		return __packet_lookup_frame_in_block(po, skb, status, len); +	default: +		WARN(1, "TPACKET version not supported\n"); +		BUG(); +		return NULL; +	} +} + +static void *prb_lookup_block(struct packet_sock *po, +				     struct packet_ring_buffer *rb, +				     unsigned int idx, +				     int status) +{ +	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb); +	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); + +	if (status != BLOCK_STATUS(pbd)) +		return NULL; +	return pbd; +} + +static int prb_previous_blk_num(struct packet_ring_buffer *rb) +{ +	unsigned int prev; +	if (rb->prb_bdqc.kactive_blk_num) +		prev = rb->prb_bdqc.kactive_blk_num-1; +	else +		prev = rb->prb_bdqc.knum_blocks-1; +	return prev; +} + +/* Assumes caller has held the rx_queue.lock */ +static void *__prb_previous_block(struct packet_sock *po, +					 struct packet_ring_buffer *rb, +					 int status) +{ +	unsigned int previous = prb_previous_blk_num(rb); +	return prb_lookup_block(po, rb, previous, status); +} + +static void *packet_previous_rx_frame(struct packet_sock *po, +					     struct packet_ring_buffer *rb, +					     int status) +{ +	if (po->tp_version <= TPACKET_V2) +		return packet_previous_frame(po, rb, status); + +	return __prb_previous_block(po, rb, status); +} + +static void packet_increment_rx_head(struct packet_sock *po, +					    struct packet_ring_buffer *rb) +{ +	switch (po->tp_version) { +	case TPACKET_V1: +	case TPACKET_V2: +		return packet_increment_head(rb); +	case TPACKET_V3: +	default: +		WARN(1, "TPACKET version not supported.\n"); +		BUG(); +		return; +	} +} + +static void *packet_previous_frame(struct packet_sock *po,  		struct packet_ring_buffer *rb,  		int status)  { @@ -320,14 +1180,73 @@ static inline void *packet_previous_frame(struct packet_sock *po,  	return packet_lookup_frame(po, rb, previous, status);  } -static inline void packet_increment_head(struct packet_ring_buffer *buff) +static void packet_increment_head(struct packet_ring_buffer *buff)  {  	buff->head = buff->head != buff->frame_max ? 
buff->head+1 : 0;  } -static inline struct packet_sock *pkt_sk(struct sock *sk) +static void packet_inc_pending(struct packet_ring_buffer *rb) +{ +	this_cpu_inc(*rb->pending_refcnt); +} + +static void packet_dec_pending(struct packet_ring_buffer *rb) +{ +	this_cpu_dec(*rb->pending_refcnt); +} + +static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)  { -	return (struct packet_sock *)sk; +	unsigned int refcnt = 0; +	int cpu; + +	/* We don't use pending refcount in rx_ring. */ +	if (rb->pending_refcnt == NULL) +		return 0; + +	for_each_possible_cpu(cpu) +		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); + +	return refcnt; +} + +static int packet_alloc_pending(struct packet_sock *po) +{ +	po->rx_ring.pending_refcnt = NULL; + +	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); +	if (unlikely(po->tx_ring.pending_refcnt == NULL)) +		return -ENOBUFS; + +	return 0; +} + +static void packet_free_pending(struct packet_sock *po) +{ +	free_percpu(po->tx_ring.pending_refcnt); +} + +static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ +	struct sock *sk = &po->sk; +	bool has_room; + +	if (po->prot_hook.func != tpacket_rcv) +		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) +			<= sk->sk_rcvbuf; + +	spin_lock(&sk->sk_receive_queue.lock); +	if (po->tp_version == TPACKET_V3) +		has_room = prb_lookup_block(po, &po->rx_ring, +					    po->rx_ring.prb_bdqc.kactive_blk_num, +					    TP_STATUS_KERNEL); +	else +		has_room = packet_lookup_frame(po, &po->rx_ring, +					       po->rx_ring.head, +					       TP_STATUS_KERNEL); +	spin_unlock(&sk->sk_receive_queue.lock); + +	return has_room;  }  static void packet_sock_destruct(struct sock *sk) @@ -345,6 +1264,271 @@ static void packet_sock_destruct(struct sock *sk)  	sk_refcnt_debug_dec(sk);  } +static int fanout_rr_next(struct packet_fanout *f, unsigned int num) +{ +	int x = atomic_read(&f->rr_cur) + 1; + +	if (x >= num) +		x = 0; + +	return x; +} + +static unsigned int fanout_demux_hash(struct packet_fanout *f, +				      struct sk_buff *skb, +				      unsigned int num) +{ +	return reciprocal_scale(skb_get_hash(skb), num); +} + +static unsigned int fanout_demux_lb(struct packet_fanout *f, +				    struct sk_buff *skb, +				    unsigned int num) +{ +	int cur, old; + +	cur = atomic_read(&f->rr_cur); +	while ((old = atomic_cmpxchg(&f->rr_cur, cur, +				     fanout_rr_next(f, num))) != cur) +		cur = old; +	return cur; +} + +static unsigned int fanout_demux_cpu(struct packet_fanout *f, +				     struct sk_buff *skb, +				     unsigned int num) +{ +	return smp_processor_id() % num; +} + +static unsigned int fanout_demux_rnd(struct packet_fanout *f, +				     struct sk_buff *skb, +				     unsigned int num) +{ +	return prandom_u32_max(num); +} + +static unsigned int fanout_demux_rollover(struct packet_fanout *f, +					  struct sk_buff *skb, +					  unsigned int idx, unsigned int skip, +					  unsigned int num) +{ +	unsigned int i, j; + +	i = j = min_t(int, f->next[idx], num - 1); +	do { +		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { +			if (i != j) +				f->next[idx] = i; +			return i; +		} +		if (++i == num) +			i = 0; +	} while (i != j); + +	return idx; +} + +static unsigned int fanout_demux_qm(struct packet_fanout *f, +				    struct sk_buff *skb, +				    unsigned int num) +{ +	return skb_get_queue_mapping(skb) % num; +} + +static bool fanout_has_flag(struct packet_fanout *f, u16 flag) +{ +	return f->flags & (flag >> 8); +} + +static int packet_rcv_fanout(struct sk_buff *skb, struct 
net_device *dev, +			     struct packet_type *pt, struct net_device *orig_dev) +{ +	struct packet_fanout *f = pt->af_packet_priv; +	unsigned int num = f->num_members; +	struct packet_sock *po; +	unsigned int idx; + +	if (!net_eq(dev_net(dev), read_pnet(&f->net)) || +	    !num) { +		kfree_skb(skb); +		return 0; +	} + +	switch (f->type) { +	case PACKET_FANOUT_HASH: +	default: +		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { +			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); +			if (!skb) +				return 0; +		} +		idx = fanout_demux_hash(f, skb, num); +		break; +	case PACKET_FANOUT_LB: +		idx = fanout_demux_lb(f, skb, num); +		break; +	case PACKET_FANOUT_CPU: +		idx = fanout_demux_cpu(f, skb, num); +		break; +	case PACKET_FANOUT_RND: +		idx = fanout_demux_rnd(f, skb, num); +		break; +	case PACKET_FANOUT_QM: +		idx = fanout_demux_qm(f, skb, num); +		break; +	case PACKET_FANOUT_ROLLOVER: +		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); +		break; +	} + +	po = pkt_sk(f->arr[idx]); +	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && +	    unlikely(!packet_rcv_has_room(po, skb))) { +		idx = fanout_demux_rollover(f, skb, idx, idx, num); +		po = pkt_sk(f->arr[idx]); +	} + +	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); +} + +DEFINE_MUTEX(fanout_mutex); +EXPORT_SYMBOL_GPL(fanout_mutex); +static LIST_HEAD(fanout_list); + +static void __fanout_link(struct sock *sk, struct packet_sock *po) +{ +	struct packet_fanout *f = po->fanout; + +	spin_lock(&f->lock); +	f->arr[f->num_members] = sk; +	smp_wmb(); +	f->num_members++; +	spin_unlock(&f->lock); +} + +static void __fanout_unlink(struct sock *sk, struct packet_sock *po) +{ +	struct packet_fanout *f = po->fanout; +	int i; + +	spin_lock(&f->lock); +	for (i = 0; i < f->num_members; i++) { +		if (f->arr[i] == sk) +			break; +	} +	BUG_ON(i >= f->num_members); +	f->arr[i] = f->arr[f->num_members - 1]; +	f->num_members--; +	spin_unlock(&f->lock); +} + +static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) +{ +	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) +		return true; + +	return false; +} + +static int fanout_add(struct sock *sk, u16 id, u16 type_flags) +{ +	struct packet_sock *po = pkt_sk(sk); +	struct packet_fanout *f, *match; +	u8 type = type_flags & 0xff; +	u8 flags = type_flags >> 8; +	int err; + +	switch (type) { +	case PACKET_FANOUT_ROLLOVER: +		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) +			return -EINVAL; +	case PACKET_FANOUT_HASH: +	case PACKET_FANOUT_LB: +	case PACKET_FANOUT_CPU: +	case PACKET_FANOUT_RND: +	case PACKET_FANOUT_QM: +		break; +	default: +		return -EINVAL; +	} + +	if (!po->running) +		return -EINVAL; + +	if (po->fanout) +		return -EALREADY; + +	mutex_lock(&fanout_mutex); +	match = NULL; +	list_for_each_entry(f, &fanout_list, list) { +		if (f->id == id && +		    read_pnet(&f->net) == sock_net(sk)) { +			match = f; +			break; +		} +	} +	err = -EINVAL; +	if (match && match->flags != flags) +		goto out; +	if (!match) { +		err = -ENOMEM; +		match = kzalloc(sizeof(*match), GFP_KERNEL); +		if (!match) +			goto out; +		write_pnet(&match->net, sock_net(sk)); +		match->id = id; +		match->type = type; +		match->flags = flags; +		atomic_set(&match->rr_cur, 0); +		INIT_LIST_HEAD(&match->list); +		spin_lock_init(&match->lock); +		atomic_set(&match->sk_ref, 0); +		match->prot_hook.type = po->prot_hook.type; +		match->prot_hook.dev = po->prot_hook.dev; +		match->prot_hook.func = packet_rcv_fanout; +		match->prot_hook.af_packet_priv = match; +		
match->prot_hook.id_match = match_fanout_group; +		dev_add_pack(&match->prot_hook); +		list_add(&match->list, &fanout_list); +	} +	err = -EINVAL; +	if (match->type == type && +	    match->prot_hook.type == po->prot_hook.type && +	    match->prot_hook.dev == po->prot_hook.dev) { +		err = -ENOSPC; +		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) { +			__dev_remove_pack(&po->prot_hook); +			po->fanout = match; +			atomic_inc(&match->sk_ref); +			__fanout_link(sk, po); +			err = 0; +		} +	} +out: +	mutex_unlock(&fanout_mutex); +	return err; +} + +static void fanout_release(struct sock *sk) +{ +	struct packet_sock *po = pkt_sk(sk); +	struct packet_fanout *f; + +	f = po->fanout; +	if (!f) +		return; + +	mutex_lock(&fanout_mutex); +	po->fanout = NULL; + +	if (atomic_dec_and_test(&f->sk_ref)) { +		list_del(&f->list); +		dev_remove_pack(&f->prot_hook); +		kfree(f); +	} +	mutex_unlock(&fanout_mutex); +}  static const struct proto_ops packet_ops; @@ -426,11 +1610,12 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,  			       struct msghdr *msg, size_t len)  {  	struct sock *sk = sock->sk; -	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);  	struct sk_buff *skb = NULL;  	struct net_device *dev;  	__be16 proto = 0;  	int err; +	int extra_len = 0;  	/*  	 *	Get and verify the address. @@ -448,7 +1633,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,  	 *	Find the device first to size check it  	 */ -	saddr->spkt_device[13] = 0; +	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;  retry:  	rcu_read_lock();  	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); @@ -465,16 +1650,25 @@ retry:  	 * raw protocol and you must do your own fragmentation at this level.  	 */ +	if (unlikely(sock_flag(sk, SOCK_NOFCS))) { +		if (!netif_supports_nofcs(dev)) { +			err = -EPROTONOSUPPORT; +			goto out_unlock; +		} +		extra_len = 4; /* We're doing our own CRC */ +	} +  	err = -EMSGSIZE; -	if (len > dev->mtu + dev->hard_header_len) +	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)  		goto out_unlock;  	if (!skb) {  		size_t reserved = LL_RESERVED_SPACE(dev); +		int tlen = dev->needed_tailroom;  		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;  		rcu_read_unlock(); -		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL); +		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);  		if (skb == NULL)  			return -ENOBUFS;  		/* FIXME: Save some space for broken drivers that write a hard @@ -497,14 +1691,31 @@ retry:  		goto retry;  	} +	if (len > (dev->mtu + dev->hard_header_len + extra_len)) { +		/* Earlier code assumed this would be a VLAN pkt, +		 * double-check this now that we have the actual +		 * packet in hand. 
+		 */ +		struct ethhdr *ehdr; +		skb_reset_mac_header(skb); +		ehdr = eth_hdr(skb); +		if (ehdr->h_proto != htons(ETH_P_8021Q)) { +			err = -EMSGSIZE; +			goto out_unlock; +		} +	}  	skb->protocol = proto;  	skb->dev = dev;  	skb->priority = sk->sk_priority;  	skb->mark = sk->sk_mark; -	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); -	if (err < 0) -		goto out_unlock; + +	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + +	if (unlikely(extra_len == 4)) +		skb->no_fcs = 1; + +	skb_probe_transport_header(skb, 0);  	dev_queue_xmit(skb);  	rcu_read_unlock(); @@ -517,17 +1728,17 @@ out_free:  	return err;  } -static inline unsigned int run_filter(const struct sk_buff *skb, +static unsigned int run_filter(const struct sk_buff *skb,  				      const struct sock *sk,  				      unsigned int res)  {  	struct sk_filter *filter; -	rcu_read_lock_bh(); -	filter = rcu_dereference_bh(sk->sk_filter); +	rcu_read_lock(); +	filter = rcu_dereference(sk->sk_filter);  	if (filter != NULL) -		res = sk_run_filter(skb, filter->insns); -	rcu_read_unlock_bh(); +		res = SK_RUN_FILTER(filter, skb); +	rcu_read_unlock();  	return res;  } @@ -589,8 +1800,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  	if (snaplen > res)  		snaplen = res; -	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -	    (unsigned)sk->sk_rcvbuf) +	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)  		goto drop_n_acct;  	if (skb_shared(skb)) { @@ -602,7 +1812,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  			skb->data = skb_head;  			skb->len = skb_len;  		} -		kfree_skb(skb); +		consume_skb(skb);  		skb = nskb;  	} @@ -634,15 +1844,18 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  	nf_reset(skb);  	spin_lock(&sk->sk_receive_queue.lock); -	po->stats.tp_packets++; +	po->stats.stats1.tp_packets++;  	skb->dropcount = atomic_read(&sk->sk_drops);  	__skb_queue_tail(&sk->sk_receive_queue, skb);  	spin_unlock(&sk->sk_receive_queue.lock); -	sk->sk_data_ready(sk, skb->len); +	sk->sk_data_ready(sk);  	return 0;  drop_n_acct: -	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops); +	spin_lock(&sk->sk_receive_queue.lock); +	po->stats.stats1.tp_drops++; +	atomic_inc(&sk->sk_drops); +	spin_unlock(&sk->sk_receive_queue.lock);  drop_n_restore:  	if (skb_head != skb->data && skb_shared(skb)) { @@ -660,20 +1873,22 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  	struct sock *sk;  	struct packet_sock *po;  	struct sockaddr_ll *sll; -	union { -		struct tpacket_hdr *h1; -		struct tpacket2_hdr *h2; -		void *raw; -	} h; +	union tpacket_uhdr h;  	u8 *skb_head = skb->data;  	int skb_len = skb->len;  	unsigned int snaplen, res; -	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; +	unsigned long status = TP_STATUS_USER;  	unsigned short macoff, netoff, hdrlen;  	struct sk_buff *copy_skb = NULL; -	struct timeval tv;  	struct timespec ts; -	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); +	__u32 ts_status; + +	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. +	 * We may add members to them until current aligned size without forcing +	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...). 
+	 */ +	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); +	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);  	if (skb->pkt_type == PACKET_LOOPBACK)  		goto drop; @@ -708,65 +1923,68 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +  				  po->tp_reserve;  	} else { -		unsigned maclen = skb_network_offset(skb); +		unsigned int maclen = skb_network_offset(skb);  		netoff = TPACKET_ALIGN(po->tp_hdrlen +  				       (maclen < 16 ? 16 : maclen)) +  			po->tp_reserve;  		macoff = netoff - maclen;  	} - -	if (macoff + snaplen > po->rx_ring.frame_size) { -		if (po->copy_thresh && -		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize < -		    (unsigned)sk->sk_rcvbuf) { -			if (skb_shared(skb)) { -				copy_skb = skb_clone(skb, GFP_ATOMIC); -			} else { -				copy_skb = skb_get(skb); -				skb_head = skb->data; +	if (po->tp_version <= TPACKET_V2) { +		if (macoff + snaplen > po->rx_ring.frame_size) { +			if (po->copy_thresh && +			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { +				if (skb_shared(skb)) { +					copy_skb = skb_clone(skb, GFP_ATOMIC); +				} else { +					copy_skb = skb_get(skb); +					skb_head = skb->data; +				} +				if (copy_skb) +					skb_set_owner_r(copy_skb, sk);  			} -			if (copy_skb) -				skb_set_owner_r(copy_skb, sk); +			snaplen = po->rx_ring.frame_size - macoff; +			if ((int)snaplen < 0) +				snaplen = 0;  		} -		snaplen = po->rx_ring.frame_size - macoff; -		if ((int)snaplen < 0) -			snaplen = 0;  	} -  	spin_lock(&sk->sk_receive_queue.lock); -	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL); +	h.raw = packet_current_rx_frame(po, skb, +					TP_STATUS_KERNEL, (macoff+snaplen));  	if (!h.raw)  		goto ring_is_full; -	packet_increment_head(&po->rx_ring); -	po->stats.tp_packets++; +	if (po->tp_version <= TPACKET_V2) { +		packet_increment_rx_head(po, &po->rx_ring); +	/* +	 * LOSING will be reported till you read the stats, +	 * because it's COR - Clear On Read. +	 * Anyways, moving it for V1/V2 only as V3 doesn't need this +	 * at packet level. 
+	 */ +		if (po->stats.stats1.tp_drops) +			status |= TP_STATUS_LOSING; +	} +	po->stats.stats1.tp_packets++;  	if (copy_skb) {  		status |= TP_STATUS_COPY;  		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);  	} -	if (!po->stats.tp_drops) -		status &= ~TP_STATUS_LOSING;  	spin_unlock(&sk->sk_receive_queue.lock);  	skb_copy_bits(skb, 0, h.raw + macoff, snaplen); +	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) +		getnstimeofday(&ts); + +	status |= ts_status; +  	switch (po->tp_version) {  	case TPACKET_V1:  		h.h1->tp_len = skb->len;  		h.h1->tp_snaplen = snaplen;  		h.h1->tp_mac = macoff;  		h.h1->tp_net = netoff; -		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) -				&& shhwtstamps->syststamp.tv64) -			tv = ktime_to_timeval(shhwtstamps->syststamp); -		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) -				&& shhwtstamps->hwtstamp.tv64) -			tv = ktime_to_timeval(shhwtstamps->hwtstamp); -		else if (skb->tstamp.tv64) -			tv = ktime_to_timeval(skb->tstamp); -		else -			do_gettimeofday(&tv); -		h.h1->tp_sec = tv.tv_sec; -		h.h1->tp_usec = tv.tv_usec; +		h.h1->tp_sec = ts.tv_sec; +		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;  		hdrlen = sizeof(*h.h1);  		break;  	case TPACKET_V2: @@ -774,21 +1992,33 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  		h.h2->tp_snaplen = snaplen;  		h.h2->tp_mac = macoff;  		h.h2->tp_net = netoff; -		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) -				&& shhwtstamps->syststamp.tv64) -			ts = ktime_to_timespec(shhwtstamps->syststamp); -		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) -				&& shhwtstamps->hwtstamp.tv64) -			ts = ktime_to_timespec(shhwtstamps->hwtstamp); -		else if (skb->tstamp.tv64) -			ts = ktime_to_timespec(skb->tstamp); -		else -			getnstimeofday(&ts);  		h.h2->tp_sec = ts.tv_sec;  		h.h2->tp_nsec = ts.tv_nsec; -		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); +		if (vlan_tx_tag_present(skb)) { +			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); +			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); +			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; +		} else { +			h.h2->tp_vlan_tci = 0; +			h.h2->tp_vlan_tpid = 0; +		} +		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));  		hdrlen = sizeof(*h.h2);  		break; +	case TPACKET_V3: +		/* tp_nxt_offset,vlan are already populated above. 
+		 * So DONT clear those fields here +		 */ +		h.h3->tp_status |= status; +		h.h3->tp_len = skb->len; +		h.h3->tp_snaplen = snaplen; +		h.h3->tp_mac = macoff; +		h.h3->tp_net = netoff; +		h.h3->tp_sec  = ts.tv_sec; +		h.h3->tp_nsec = ts.tv_nsec; +		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); +		hdrlen = sizeof(*h.h3); +		break;  	default:  		BUG();  	} @@ -804,19 +2034,27 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  	else  		sll->sll_ifindex = dev->ifindex; -	__packet_set_status(po, h.raw, status);  	smp_mb(); +  #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 -	{ +	if (po->tp_version <= TPACKET_V2) {  		u8 *start, *end; -		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen); +		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + +					macoff + snaplen); +  		for (start = h.raw; start < end; start += PAGE_SIZE)  			flush_dcache_page(pgv_to_page(start));  	} +	smp_wmb();  #endif -	sk->sk_data_ready(sk, 0); +	if (po->tp_version <= TPACKET_V2) +		__packet_set_status(po, h.raw, status); +	else +		prb_clear_blk_fill_status(&po->rx_ring); + +	sk->sk_data_ready(sk);  drop_n_restore:  	if (skb_head != skb->data && skb_shared(skb)) { @@ -828,10 +2066,10 @@ drop:  	return 0;  ring_is_full: -	po->stats.tp_drops++; +	po->stats.stats1.tp_drops++;  	spin_unlock(&sk->sk_receive_queue.lock); -	sk->sk_data_ready(sk, 0); +	sk->sk_data_ready(sk);  	kfree_skb(copy_skb);  	goto drop_n_restore;  } @@ -839,16 +2077,16 @@ ring_is_full:  static void tpacket_destruct_skb(struct sk_buff *skb)  {  	struct packet_sock *po = pkt_sk(skb->sk); -	void *ph; - -	BUG_ON(skb == NULL);  	if (likely(po->tx_ring.pg_vec)) { +		void *ph; +		__u32 ts; +  		ph = skb_shinfo(skb)->destructor_arg; -		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); -		BUG_ON(atomic_read(&po->tx_ring.pending) == 0); -		atomic_dec(&po->tx_ring.pending); -		__packet_set_status(po, ph, TP_STATUS_AVAILABLE); +		packet_dec_pending(&po->tx_ring); + +		ts = __packet_set_timestamp(po, ph, skb); +		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);  	}  	sock_wfree(skb); @@ -856,13 +2094,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)  static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,  		void *frame, struct net_device *dev, int size_max, -		__be16 proto, unsigned char *addr) +		__be16 proto, unsigned char *addr, int hlen)  { -	union { -		struct tpacket_hdr *h1; -		struct tpacket2_hdr *h2; -		void *raw; -	} ph; +	union tpacket_uhdr ph;  	int to_write, offset, len, tp_len, nr_frags, len_max;  	struct socket *sock = po->sk.sk_socket;  	struct page *page; @@ -875,6 +2109,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,  	skb->dev = dev;  	skb->priority = po->sk.sk_priority;  	skb->mark = po->sk.sk_mark; +	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);  	skb_shinfo(skb)->destructor_arg = ph.raw;  	switch (po->tp_version) { @@ -890,10 +2125,40 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,  		return -EMSGSIZE;  	} -	skb_reserve(skb, LL_RESERVED_SPACE(dev)); +	skb_reserve(skb, hlen);  	skb_reset_network_header(skb); -	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); +	if (!packet_use_direct_xmit(po)) +		skb_probe_transport_header(skb, 0); +	if (unlikely(po->tp_tx_has_off)) { +		int off_min, off_max, off; +		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); +		off_max = po->tx_ring.frame_size - tp_len; +		if (sock->type == SOCK_DGRAM) { +			switch (po->tp_version) { +			case TPACKET_V2: +				off = 
ph.h2->tp_net; +				break; +			default: +				off = ph.h1->tp_net; +				break; +			} +		} else { +			switch (po->tp_version) { +			case TPACKET_V2: +				off = ph.h2->tp_mac; +				break; +			default: +				off = ph.h1->tp_mac; +				break; +			} +		} +		if (unlikely((off < off_min) || (off_max < off))) +			return -EINVAL; +		data = ph.raw + off; +	} else { +		data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); +	}  	to_write = tp_len;  	if (sock->type == SOCK_DGRAM) { @@ -919,7 +2184,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,  		to_write -= dev->hard_header_len;  	} -	err = -EFAULT;  	offset = offset_in_page(data);  	len_max = PAGE_SIZE - offset;  	len = ((to_write > len_max) ? len_max : to_write); @@ -954,25 +2218,23 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)  { -	struct socket *sock;  	struct sk_buff *skb;  	struct net_device *dev;  	__be16 proto; -	int ifindex, err, reserve = 0; +	int err, reserve = 0;  	void *ph; -	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); +	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);  	int tp_len, size_max;  	unsigned char *addr;  	int len_sum = 0; -	int status = 0; - -	sock = po->sk.sk_socket; +	int status = TP_STATUS_AVAILABLE; +	int hlen, tlen;  	mutex_lock(&po->pg_vec_lock); -	err = -EBUSY; -	if (saddr == NULL) { -		ifindex	= po->ifindex; +	if (likely(saddr == NULL)) { +		dev	= packet_cached_dev_get(po);  		proto	= po->num;  		addr	= NULL;  	} else { @@ -983,22 +2245,19 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)  					+ offsetof(struct sockaddr_ll,  						sll_addr)))  			goto out; -		ifindex	= saddr->sll_ifindex;  		proto	= saddr->sll_protocol;  		addr	= saddr->sll_addr; +		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);  	} -	dev = dev_get_by_index(sock_net(&po->sk), ifindex);  	err = -ENXIO;  	if (unlikely(dev == NULL))  		goto out; - -	reserve = dev->hard_header_len; -  	err = -ENETDOWN;  	if (unlikely(!(dev->flags & IFF_UP)))  		goto out_put; +	reserve = dev->hard_header_len + VLAN_HLEN;  	size_max = po->tx_ring.frame_size  		- (po->tp_hdrlen - sizeof(struct sockaddr_ll)); @@ -1007,25 +2266,37 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)  	do {  		ph = packet_current_frame(po, &po->tx_ring, -				TP_STATUS_SEND_REQUEST); - +					  TP_STATUS_SEND_REQUEST);  		if (unlikely(ph == NULL)) { -			schedule(); +			if (need_wait && need_resched()) +				schedule();  			continue;  		}  		status = TP_STATUS_SEND_REQUEST; +		hlen = LL_RESERVED_SPACE(dev); +		tlen = dev->needed_tailroom;  		skb = sock_alloc_send_skb(&po->sk, -				LL_ALLOCATED_SPACE(dev) -				+ sizeof(struct sockaddr_ll), +				hlen + tlen + sizeof(struct sockaddr_ll),  				0, &err);  		if (unlikely(skb == NULL))  			goto out_status;  		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, -				addr); +					  addr, hlen); +		if (tp_len > dev->mtu + dev->hard_header_len) { +			struct ethhdr *ehdr; +			/* Earlier code assumed this would be a VLAN pkt, +			 * double-check this now that we have the actual +			 * packet in hand. 
+			 */ +			skb_reset_mac_header(skb); +			ehdr = eth_hdr(skb); +			if (ehdr->h_proto != htons(ETH_P_8021Q)) +				tp_len = -EMSGSIZE; +		}  		if (unlikely(tp_len < 0)) {  			if (po->tp_loss) {  				__packet_set_status(po, ph, @@ -1040,12 +2311,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)  			}  		} +		packet_pick_tx_queue(dev, skb); +  		skb->destructor = tpacket_destruct_skb;  		__packet_set_status(po, ph, TP_STATUS_SENDING); -		atomic_inc(&po->tx_ring.pending); +		packet_inc_pending(&po->tx_ring);  		status = TP_STATUS_SEND_REQUEST; -		err = dev_queue_xmit(skb); +		err = po->xmit(skb);  		if (unlikely(err > 0)) {  			err = net_xmit_errno(err);  			if (err && __packet_get_status(po, ph) == @@ -1063,9 +2336,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)  		packet_increment_head(&po->tx_ring);  		len_sum += tp_len;  	} while (likely((ph != NULL) || -			((!(msg->msg_flags & MSG_DONTWAIT)) && -			 (atomic_read(&po->tx_ring.pending)))) -		); +		/* Note: packet_read_pending() might be slow if we have +		 * to call it as it's per_cpu variable, but in fast-path +		 * we already short-circuit the loop with the first +		 * condition, and luckily don't have to go that path +		 * anyway. +		 */ +		 (need_wait && packet_read_pending(&po->tx_ring))));  	err = len_sum;  	goto out_put; @@ -1080,10 +2357,10 @@ out:  	return err;  } -static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, -					       size_t reserve, size_t len, -					       size_t linear, int noblock, -					       int *err) +static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, +				        size_t reserve, size_t len, +				        size_t linear, int noblock, +				        int *err)  {  	struct sk_buff *skb; @@ -1092,7 +2369,7 @@ static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,  		linear = len;  	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, -				   err); +				   err, 0);  	if (!skb)  		return NULL; @@ -1104,28 +2381,29 @@ static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,  	return skb;  } -static int packet_snd(struct socket *sock, -			  struct msghdr *msg, size_t len) +static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)  {  	struct sock *sk = sock->sk; -	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);  	struct sk_buff *skb;  	struct net_device *dev;  	__be16 proto;  	unsigned char *addr; -	int ifindex, err, reserve = 0; +	int err, reserve = 0;  	struct virtio_net_hdr vnet_hdr = { 0 };  	int offset = 0;  	int vnet_hdr_len;  	struct packet_sock *po = pkt_sk(sk);  	unsigned short gso_type = 0; +	int hlen, tlen; +	int extra_len = 0;  	/*  	 *	Get and verify the address.  	 
*/ -	if (saddr == NULL) { -		ifindex	= po->ifindex; +	if (likely(saddr == NULL)) { +		dev	= packet_cached_dev_get(po);  		proto	= po->num;  		addr	= NULL;  	} else { @@ -1134,23 +2412,20 @@ static int packet_snd(struct socket *sock,  			goto out;  		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))  			goto out; -		ifindex	= saddr->sll_ifindex;  		proto	= saddr->sll_protocol;  		addr	= saddr->sll_addr; +		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);  	} - -	dev = dev_get_by_index(sock_net(sk), ifindex);  	err = -ENXIO; -	if (dev == NULL) +	if (unlikely(dev == NULL))  		goto out_unlock; -	if (sock->type == SOCK_RAW) -		reserve = dev->hard_header_len; -  	err = -ENETDOWN; -	if (!(dev->flags & IFF_UP)) +	if (unlikely(!(dev->flags & IFF_UP)))  		goto out_unlock; +	if (sock->type == SOCK_RAW) +		reserve = dev->hard_header_len;  	if (po->has_vnet_hdr) {  		vnet_hdr_len = sizeof(vnet_hdr); @@ -1199,13 +2474,22 @@ static int packet_snd(struct socket *sock,  		}  	} +	if (unlikely(sock_flag(sk, SOCK_NOFCS))) { +		if (!netif_supports_nofcs(dev)) { +			err = -EPROTONOSUPPORT; +			goto out_unlock; +		} +		extra_len = 4; /* We're doing our own CRC */ +	} +  	err = -EMSGSIZE; -	if (!gso_type && (len > dev->mtu+reserve)) +	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))  		goto out_unlock;  	err = -ENOBUFS; -	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), -			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len, +	hlen = LL_RESERVED_SPACE(dev); +	tlen = dev->needed_tailroom; +	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,  			       msg->msg_flags & MSG_DONTWAIT, &err);  	if (skb == NULL)  		goto out_unlock; @@ -1221,15 +2505,30 @@ static int packet_snd(struct socket *sock,  	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);  	if (err)  		goto out_free; -	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); -	if (err < 0) -		goto out_free; + +	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + +	if (!gso_type && (len > dev->mtu + reserve + extra_len)) { +		/* Earlier code assumed this would be a VLAN pkt, +		 * double-check this now that we have the actual +		 * packet in hand. 
+		 */ +		struct ethhdr *ehdr; +		skb_reset_mac_header(skb); +		ehdr = eth_hdr(skb); +		if (ehdr->h_proto != htons(ETH_P_8021Q)) { +			err = -EMSGSIZE; +			goto out_free; +		} +	}  	skb->protocol = proto;  	skb->dev = dev;  	skb->priority = sk->sk_priority;  	skb->mark = sk->sk_mark; +	packet_pick_tx_queue(dev, skb); +  	if (po->has_vnet_hdr) {  		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {  			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start, @@ -1249,11 +2548,12 @@ static int packet_snd(struct socket *sock,  		len += vnet_hdr_len;  	} -	/* -	 *	Now send it -	 */ +	if (!packet_use_direct_xmit(po)) +		skb_probe_transport_header(skb, reserve); +	if (unlikely(extra_len == 4)) +		skb->no_fcs = 1; -	err = dev_queue_xmit(skb); +	err = po->xmit(skb);  	if (err > 0 && (err = net_xmit_errno(err)) != 0)  		goto out_unlock; @@ -1275,6 +2575,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,  {  	struct sock *sk = sock->sk;  	struct packet_sock *po = pkt_sk(sk); +  	if (po->tx_ring.pg_vec)  		return tpacket_snd(po, msg);  	else @@ -1291,7 +2592,7 @@ static int packet_release(struct socket *sock)  	struct sock *sk = sock->sk;  	struct packet_sock *po;  	struct net *net; -	struct tpacket_req req; +	union tpacket_req_u req_u;  	if (!sk)  		return 0; @@ -1299,32 +2600,37 @@ static int packet_release(struct socket *sock)  	net = sock_net(sk);  	po = pkt_sk(sk); -	spin_lock_bh(&net->packet.sklist_lock); +	mutex_lock(&net->packet.sklist_lock);  	sk_del_node_init_rcu(sk); +	mutex_unlock(&net->packet.sklist_lock); + +	preempt_disable();  	sock_prot_inuse_add(net, sk->sk_prot, -1); -	spin_unlock_bh(&net->packet.sklist_lock); +	preempt_enable();  	spin_lock(&po->bind_lock); -	if (po->running) { -		/* -		 * Remove from protocol table -		 */ -		po->running = 0; -		po->num = 0; -		__dev_remove_pack(&po->prot_hook); -		__sock_put(sk); +	unregister_prot_hook(sk, false); +	packet_cached_dev_reset(po); + +	if (po->prot_hook.dev) { +		dev_put(po->prot_hook.dev); +		po->prot_hook.dev = NULL;  	}  	spin_unlock(&po->bind_lock);  	packet_flush_mclist(sk); -	memset(&req, 0, sizeof(req)); +	if (po->rx_ring.pg_vec) { +		memset(&req_u, 0, sizeof(req_u)); +		packet_set_ring(sk, &req_u, 1, 0); +	} -	if (po->rx_ring.pg_vec) -		packet_set_ring(sk, &req, 1, 0); +	if (po->tx_ring.pg_vec) { +		memset(&req_u, 0, sizeof(req_u)); +		packet_set_ring(sk, &req_u, 1, 1); +	} -	if (po->tx_ring.pg_vec) -		packet_set_ring(sk, &req, 1, 1); +	fanout_release(sk);  	synchronize_net();  	/* @@ -1336,6 +2642,7 @@ static int packet_release(struct socket *sock)  	/* Purge queues */  	skb_queue_purge(&sk->sk_receive_queue); +	packet_free_pending(po);  	sk_refcnt_debug_release(sk);  	sock_put(sk); @@ -1346,38 +2653,48 @@ static int packet_release(struct socket *sock)   *	Attach a packet hook.   */ -static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) +static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)  {  	struct packet_sock *po = pkt_sk(sk); -	/* -	 *	Detach an existing hook if present. 
-	 */ +	const struct net_device *dev_curr; +	__be16 proto_curr; +	bool need_rehook; -	lock_sock(sk); +	if (po->fanout) { +		if (dev) +			dev_put(dev); -	spin_lock(&po->bind_lock); -	if (po->running) { -		__sock_put(sk); -		po->running = 0; -		po->num = 0; -		spin_unlock(&po->bind_lock); -		dev_remove_pack(&po->prot_hook); -		spin_lock(&po->bind_lock); +		return -EINVAL;  	} -	po->num = protocol; -	po->prot_hook.type = protocol; -	po->prot_hook.dev = dev; +	lock_sock(sk); +	spin_lock(&po->bind_lock); + +	proto_curr = po->prot_hook.type; +	dev_curr = po->prot_hook.dev; + +	need_rehook = proto_curr != proto || dev_curr != dev; + +	if (need_rehook) { +		unregister_prot_hook(sk, true); + +		po->num = proto; +		po->prot_hook.type = proto; + +		if (po->prot_hook.dev) +			dev_put(po->prot_hook.dev); + +		po->prot_hook.dev = dev; -	po->ifindex = dev ? dev->ifindex : 0; +		po->ifindex = dev ? dev->ifindex : 0; +		packet_cached_dev_assign(po, dev); +	} -	if (protocol == 0) +	if (proto == 0 || !need_rehook)  		goto out_unlock;  	if (!dev || (dev->flags & IFF_UP)) { -		dev_add_pack(&po->prot_hook); -		sock_hold(sk); -		po->running = 1; +		register_prot_hook(sk);  	} else {  		sk->sk_err = ENETDOWN;  		if (!sock_flag(sk, SOCK_DEAD)) @@ -1411,10 +2728,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,  	strlcpy(name, uaddr->sa_data, sizeof(name));  	dev = dev_get_by_name(sock_net(sk), name); -	if (dev) { +	if (dev)  		err = packet_do_bind(sk, dev, pkt_sk(sk)->num); -		dev_put(dev); -	}  	return err;  } @@ -1442,8 +2757,6 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len  			goto out;  	}  	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); -	if (dev) -		dev_put(dev);  out:  	return err; @@ -1467,7 +2780,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,  	__be16 proto = (__force __be16)protocol; /* weird, but documented */  	int err; -	if (!capable(CAP_NET_RAW)) +	if (!ns_capable(net->user_ns, CAP_NET_RAW))  		return -EPERM;  	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&  	    sock->type != SOCK_PACKET) @@ -1489,6 +2802,13 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,  	po = pkt_sk(sk);  	sk->sk_family = PF_PACKET;  	po->num = proto; +	po->xmit = dev_queue_xmit; + +	err = packet_alloc_pending(po); +	if (err) +		goto out2; + +	packet_cached_dev_reset(po);  	sk->sk_destruct = packet_sock_destruct;  	sk_refcnt_debug_inc(sk); @@ -1508,62 +2828,20 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,  	if (proto) {  		po->prot_hook.type = proto; -		dev_add_pack(&po->prot_hook); -		sock_hold(sk); -		po->running = 1; +		register_prot_hook(sk);  	} -	spin_lock_bh(&net->packet.sklist_lock); +	mutex_lock(&net->packet.sklist_lock);  	sk_add_node_rcu(sk, &net->packet.sklist); +	mutex_unlock(&net->packet.sklist_lock); + +	preempt_disable();  	sock_prot_inuse_add(net, &packet_proto, 1); -	spin_unlock_bh(&net->packet.sklist_lock); +	preempt_enable();  	return 0; -out: -	return err; -} - -static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len) -{ -	struct sock_exterr_skb *serr; -	struct sk_buff *skb, *skb2; -	int copied, err; - -	err = -EAGAIN; -	skb = skb_dequeue(&sk->sk_error_queue); -	if (skb == NULL) -		goto out; - -	copied = skb->len; -	if (copied > len) { -		msg->msg_flags |= MSG_TRUNC; -		copied = len; -	} -	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); -	if (err) -		goto out_free_skb; - -	
sock_recv_timestamp(msg, sk, skb); - -	serr = SKB_EXT_ERR(skb); -	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP, -		 sizeof(serr->ee), &serr->ee); - -	msg->msg_flags |= MSG_ERRQUEUE; -	err = copied; - -	/* Reset and regenerate socket error */ -	spin_lock_bh(&sk->sk_error_queue.lock); -	sk->sk_err = 0; -	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { -		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; -		spin_unlock_bh(&sk->sk_error_queue.lock); -		sk->sk_error_report(sk); -	} else -		spin_unlock_bh(&sk->sk_error_queue.lock); - -out_free_skb: -	kfree_skb(skb); +out2: +	sk_free(sk);  out:  	return err;  } @@ -1579,7 +2857,6 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,  	struct sock *sk = sock->sk;  	struct sk_buff *skb;  	int copied, err; -	struct sockaddr_ll *sll;  	int vnet_hdr_len = 0;  	err = -EINVAL; @@ -1593,7 +2870,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,  #endif  	if (flags & MSG_ERRQUEUE) { -		err = packet_recv_error(sk, msg, len); +		err = sock_recv_errqueue(sk, msg, len, +					 SOL_PACKET, PACKET_TX_TIMESTAMP);  		goto out;  	} @@ -1650,9 +2928,10 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,  		if (skb->ip_summed == CHECKSUM_PARTIAL) {  			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; -			vnet_hdr.csum_start = skb->csum_start - -							skb_headroom(skb); +			vnet_hdr.csum_start = skb_checksum_start_offset(skb);  			vnet_hdr.csum_offset = skb->csum_offset; +		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { +			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;  		} /* else everything is zero */  		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, @@ -1661,22 +2940,10 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,  			goto out_free;  	} -	/* -	 *	If the address length field is there to be filled in, we fill -	 *	it in now. -	 */ - -	sll = &PACKET_SKB_CB(skb)->sa.ll; -	if (sock->type == SOCK_PACKET) -		msg->msg_namelen = sizeof(struct sockaddr_pkt); -	else -		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); - -	/* -	 *	You lose any data beyond the buffer you gave. If it worries a -	 *	user program they can ask the device for its MTU anyway. +	/* You lose any data beyond the buffer you gave. If it worries +	 * a user program they can ask the device for its MTU +	 * anyway.  	 */ -  	copied = skb->len;  	if (copied > len) {  		copied = len; @@ -1689,9 +2956,21 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,  	sock_recv_ts_and_drops(msg, sk, skb); -	if (msg->msg_name) +	if (msg->msg_name) { +		/* If the address length field is there to be filled +		 * in, we fill it in now. 
+		 */ +		if (sock->type == SOCK_PACKET) { +			__sockaddr_check_size(sizeof(struct sockaddr_pkt)); +			msg->msg_namelen = sizeof(struct sockaddr_pkt); +		} else { +			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; +			msg->msg_namelen = sll->sll_halen + +				offsetof(struct sockaddr_ll, sll_addr); +		}  		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,  		       msg->msg_namelen); +	}  	if (pkt_sk(sk)->auxdata) {  		struct tpacket_auxdata aux; @@ -1703,8 +2982,14 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,  		aux.tp_snaplen = skb->len;  		aux.tp_mac = 0;  		aux.tp_net = skb_network_offset(skb); -		aux.tp_vlan_tci = vlan_tx_tag_get(skb); - +		if (vlan_tx_tag_present(skb)) { +			aux.tp_vlan_tci = vlan_tx_tag_get(skb); +			aux.tp_vlan_tpid = ntohs(skb->vlan_proto); +			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; +		} else { +			aux.tp_vlan_tci = 0; +			aux.tp_vlan_tpid = 0; +		}  		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);  	} @@ -1730,12 +3015,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,  		return -EOPNOTSUPP;  	uaddr->sa_family = AF_PACKET; +	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));  	rcu_read_lock();  	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);  	if (dev) -		strncpy(uaddr->sa_data, dev->name, 14); -	else -		memset(uaddr->sa_data, 0, 14); +		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));  	rcu_read_unlock();  	*uaddr_len = sizeof(*uaddr); @@ -1950,15 +3234,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv  	case PACKET_RX_RING:  	case PACKET_TX_RING:  	{ -		struct tpacket_req req; +		union tpacket_req_u req_u; +		int len; -		if (optlen < sizeof(req)) +		switch (po->tp_version) { +		case TPACKET_V1: +		case TPACKET_V2: +			len = sizeof(req_u.req); +			break; +		case TPACKET_V3: +		default: +			len = sizeof(req_u.req3); +			break; +		} +		if (optlen < len)  			return -EINVAL;  		if (pkt_sk(sk)->has_vnet_hdr)  			return -EINVAL; -		if (copy_from_user(&req, optval, sizeof(req))) +		if (copy_from_user(&req_u.req, optval, len))  			return -EFAULT; -		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING); +		return packet_set_ring(sk, &req_u, 0, +			optname == PACKET_TX_RING);  	}  	case PACKET_COPY_THRESH:  	{ @@ -1985,6 +3281,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv  		switch (val) {  		case TPACKET_V1:  		case TPACKET_V2: +		case TPACKET_V3:  			po->tp_version = val;  			return 0;  		default: @@ -2069,6 +3366,42 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv  		po->tp_tstamp = val;  		return 0;  	} +	case PACKET_FANOUT: +	{ +		int val; + +		if (optlen != sizeof(val)) +			return -EINVAL; +		if (copy_from_user(&val, optval, sizeof(val))) +			return -EFAULT; + +		return fanout_add(sk, val & 0xffff, val >> 16); +	} +	case PACKET_TX_HAS_OFF: +	{ +		unsigned int val; + +		if (optlen != sizeof(val)) +			return -EINVAL; +		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) +			return -EBUSY; +		if (copy_from_user(&val, optval, sizeof(val))) +			return -EFAULT; +		po->tp_tx_has_off = !!val; +		return 0; +	} +	case PACKET_QDISC_BYPASS: +	{ +		int val; + +		if (optlen != sizeof(val)) +			return -EINVAL; +		if (copy_from_user(&val, optval, sizeof(val))) +			return -EFAULT; + +		po->xmit = val ? 
packet_direct_xmit : dev_queue_xmit; +		return 0; +	}  	default:  		return -ENOPROTOOPT;  	} @@ -2078,11 +3411,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,  			     char __user *optval, int __user *optlen)  {  	int len; -	int val; +	int val, lv = sizeof(val);  	struct sock *sk = sock->sk;  	struct packet_sock *po = pkt_sk(sk); -	void *data; -	struct tpacket_stats st; +	void *data = &val; +	union tpacket_stats_u st;  	if (level != SOL_PACKET)  		return -ENOPROTOOPT; @@ -2095,42 +3428,33 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,  	switch (optname) {  	case PACKET_STATISTICS: -		if (len > sizeof(struct tpacket_stats)) -			len = sizeof(struct tpacket_stats);  		spin_lock_bh(&sk->sk_receive_queue.lock); -		st = po->stats; -		memset(&po->stats, 0, sizeof(st)); +		memcpy(&st, &po->stats, sizeof(st)); +		memset(&po->stats, 0, sizeof(po->stats));  		spin_unlock_bh(&sk->sk_receive_queue.lock); -		st.tp_packets += st.tp_drops; -		data = &st; +		if (po->tp_version == TPACKET_V3) { +			lv = sizeof(struct tpacket_stats_v3); +			st.stats3.tp_packets += st.stats3.tp_drops; +			data = &st.stats3; +		} else { +			lv = sizeof(struct tpacket_stats); +			st.stats1.tp_packets += st.stats1.tp_drops; +			data = &st.stats1; +		} +  		break;  	case PACKET_AUXDATA: -		if (len > sizeof(int)) -			len = sizeof(int);  		val = po->auxdata; - -		data = &val;  		break;  	case PACKET_ORIGDEV: -		if (len > sizeof(int)) -			len = sizeof(int);  		val = po->origdev; - -		data = &val;  		break;  	case PACKET_VNET_HDR: -		if (len > sizeof(int)) -			len = sizeof(int);  		val = po->has_vnet_hdr; - -		data = &val;  		break;  	case PACKET_VERSION: -		if (len > sizeof(int)) -			len = sizeof(int);  		val = po->tp_version; -		data = &val;  		break;  	case PACKET_HDRLEN:  		if (len > sizeof(int)) @@ -2144,33 +3468,41 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,  		case TPACKET_V2:  			val = sizeof(struct tpacket2_hdr);  			break; +		case TPACKET_V3: +			val = sizeof(struct tpacket3_hdr); +			break;  		default:  			return -EINVAL;  		} -		data = &val;  		break;  	case PACKET_RESERVE: -		if (len > sizeof(unsigned int)) -			len = sizeof(unsigned int);  		val = po->tp_reserve; -		data = &val;  		break;  	case PACKET_LOSS: -		if (len > sizeof(unsigned int)) -			len = sizeof(unsigned int);  		val = po->tp_loss; -		data = &val;  		break;  	case PACKET_TIMESTAMP: -		if (len > sizeof(int)) -			len = sizeof(int);  		val = po->tp_tstamp; -		data = &val; +		break; +	case PACKET_FANOUT: +		val = (po->fanout ? 
+		       ((u32)po->fanout->id | +			((u32)po->fanout->type << 16) | +			((u32)po->fanout->flags << 24)) : +		       0); +		break; +	case PACKET_TX_HAS_OFF: +		val = po->tp_tx_has_off; +		break; +	case PACKET_QDISC_BYPASS: +		val = packet_use_direct_xmit(po);  		break;  	default:  		return -ENOPROTOOPT;  	} +	if (len > lv) +		len = lv;  	if (put_user(len, optlen))  		return -EFAULT;  	if (copy_to_user(optval, data, len)) @@ -2179,15 +3511,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,  } -static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) +static int packet_notifier(struct notifier_block *this, +			   unsigned long msg, void *ptr)  {  	struct sock *sk; -	struct hlist_node *node; -	struct net_device *dev = data; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct net *net = dev_net(dev);  	rcu_read_lock(); -	sk_for_each_rcu(sk, node, &net->packet.sklist) { +	sk_for_each_rcu(sk, &net->packet.sklist) {  		struct packet_sock *po = pkt_sk(sk);  		switch (msg) { @@ -2200,15 +3532,16 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void  			if (dev->ifindex == po->ifindex) {  				spin_lock(&po->bind_lock);  				if (po->running) { -					__dev_remove_pack(&po->prot_hook); -					__sock_put(sk); -					po->running = 0; +					__unregister_prot_hook(sk, false);  					sk->sk_err = ENETDOWN;  					if (!sock_flag(sk, SOCK_DEAD))  						sk->sk_error_report(sk);  				}  				if (msg == NETDEV_UNREGISTER) { +					packet_cached_dev_reset(po);  					po->ifindex = -1; +					if (po->prot_hook.dev) +						dev_put(po->prot_hook.dev);  					po->prot_hook.dev = NULL;  				}  				spin_unlock(&po->bind_lock); @@ -2217,11 +3550,8 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void  		case NETDEV_UP:  			if (dev->ifindex == po->ifindex) {  				spin_lock(&po->bind_lock); -				if (po->num && !po->running) { -					dev_add_pack(&po->prot_hook); -					sock_hold(sk); -					po->running = 1; -				} +				if (po->num) +					register_prot_hook(sk);  				spin_unlock(&po->bind_lock);  			}  			break; @@ -2294,7 +3624,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,  	spin_lock_bh(&sk->sk_receive_queue.lock);  	if (po->rx_ring.pg_vec) { -		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) +		if (!packet_previous_rx_frame(po, &po->rx_ring, +			TP_STATUS_KERNEL))  			mask |= POLLIN | POLLRDNORM;  	}  	spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -2355,36 +3686,28 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order,  	kfree(pg_vec);  } -static inline char *alloc_one_pg_vec_page(unsigned long order) +static char *alloc_one_pg_vec_page(unsigned long order)  { -	char *buffer = NULL; +	char *buffer;  	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |  			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;  	buffer = (char *) __get_free_pages(gfp_flags, order); -  	if (buffer)  		return buffer; -	/* -	 * __get_free_pages failed, fall back to vmalloc -	 */ +	/* __get_free_pages failed, fall back to vmalloc */  	buffer = vzalloc((1 << order) * PAGE_SIZE); -  	if (buffer)  		return buffer; -	/* -	 * vmalloc failed, lets dig into swap here -	 */ +	/* vmalloc failed, lets dig into swap here */  	gfp_flags &= ~__GFP_NORETRY; -	buffer = (char *)__get_free_pages(gfp_flags, order); +	buffer = (char *) __get_free_pages(gfp_flags, order);  	if (buffer)  		return buffer; -	/* -	 * complete and utter failure -	 */ +	/* complete and utter failure */  	return NULL;  } 
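The packet_set_ring() hunks below switch the entry point to union tpacket_req_u so that TPACKET_V3 callers can pass the extended tpacket_req3. For orientation, a minimal userspace sketch of the V3 receive flow this enables follows. It is not part of the patch: it relies only on the standard <linux/if_packet.h> UAPI, the block/frame geometry is an arbitrary example, and error handling, interface binding, and memory barriers are omitted.

#include <stdio.h>
#include <poll.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	struct tpacket_req3 req = {
		.tp_block_size	   = 1 << 22,		/* 4 MiB per block */
		.tp_block_nr	   = 64,
		.tp_frame_size	   = 2048,
		.tp_frame_nr	   = ((1 << 22) / 2048) * 64,
		.tp_retire_blk_tov = 60,		/* ms, 0 = kernel default */
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int ver = TPACKET_V3;
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
	unsigned int blk = 0;
	char *ring;

	/* Version must be selected before the ring is created. */
	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket_block_desc *bd =
			(void *)(ring + (size_t)blk * req.tp_block_size);
		struct tpacket3_hdr *p;
		unsigned int i;

		/* The block belongs to us once the kernel retires it. */
		while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
			poll(&pfd, 1, -1);

		/* Walk the packets the kernel packed into this block. */
		p = (void *)((char *)bd + bd->hdr.bh1.offset_to_first_pkt);
		for (i = 0; i < bd->hdr.bh1.num_pkts; i++) {
			printf("blk %llu pkt %u: %u bytes\n",
			       (unsigned long long)bd->hdr.bh1.seq_num, i,
			       p->tp_snaplen);
			p = (void *)((char *)p + p->tp_next_offset);
		}

		/* Flip the block back to the kernel and advance. */
		bd->hdr.bh1.block_status = TP_STATUS_KERNEL;
		blk = (blk + 1) % req.tp_block_nr;
	}
}

Each retired block is handed to userspace wholesale: block_status flips to TP_STATUS_USER either when the block fills or when the retire timer armed by init_prb_bdqc() fires, so one wakeup and one status transition are amortized over many packets instead of the per-frame status polling of TPACKET_V1/V2.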
@@ -2413,7 +3736,7 @@ out_free_pgvec:  	goto out;  } -static int packet_set_ring(struct sock *sk, struct tpacket_req *req, +static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,  		int closing, int tx_ring)  {  	struct pgv *pg_vec = NULL; @@ -2422,7 +3745,15 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  	struct packet_ring_buffer *rb;  	struct sk_buff_head *rb_queue;  	__be16 num; -	int err; +	int err = -EINVAL; +	/* Added to avoid minimal code churn */ +	struct tpacket_req *req = &req_u->req; + +	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */ +	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { +		WARN(1, "Tx-ring is not supported.\n"); +		goto out; +	}  	rb = tx_ring ? &po->tx_ring : &po->rx_ring;  	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; @@ -2431,7 +3762,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  	if (!closing) {  		if (atomic_read(&po->mapped))  			goto out; -		if (atomic_read(&rb->pending)) +		if (packet_read_pending(rb))  			goto out;  	} @@ -2448,6 +3779,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  		case TPACKET_V2:  			po->tp_hdrlen = TPACKET2_HDRLEN;  			break; +		case TPACKET_V3: +			po->tp_hdrlen = TPACKET3_HDRLEN; +			break;  		}  		err = -EINVAL; @@ -2473,6 +3807,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  		pg_vec = alloc_pg_vec(req, order);  		if (unlikely(!pg_vec))  			goto out; +		switch (po->tp_version) { +		case TPACKET_V3: +		/* Transmit path is not supported. We checked +		 * it above but just being paranoid +		 */ +			if (!tx_ring) +				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); +			break; +		default: +			break; +		}  	}  	/* Done */  	else { @@ -2488,10 +3833,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  	was_running = po->running;  	num = po->num;  	if (was_running) { -		__dev_remove_pack(&po->prot_hook);  		po->num = 0; -		po->running = 0; -		__sock_put(sk); +		__unregister_prot_hook(sk, false);  	}  	spin_unlock(&po->bind_lock); @@ -2501,22 +3844,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  	mutex_lock(&po->pg_vec_lock);  	if (closing || atomic_read(&po->mapped) == 0) {  		err = 0; -#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })  		spin_lock_bh(&rb_queue->lock); -		pg_vec = XC(rb->pg_vec, pg_vec); +		swap(rb->pg_vec, pg_vec);  		rb->frame_max = (req->tp_frame_nr - 1);  		rb->head = 0;  		rb->frame_size = req->tp_frame_size;  		spin_unlock_bh(&rb_queue->lock); -		order = XC(rb->pg_vec_order, order); -		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr); +		swap(rb->pg_vec_order, order); +		swap(rb->pg_vec_len, req->tp_block_nr);  		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;  		po->prot_hook.func = (po->rx_ring.pg_vec) ?  						
tpacket_rcv : packet_rcv;  		skb_queue_purge(rb_queue); -#undef XC  		if (atomic_read(&po->mapped))  			pr_err("packet_mmap: vma is busy: %d\n",  			       atomic_read(&po->mapped)); @@ -2524,14 +3865,16 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,  	mutex_unlock(&po->pg_vec_lock);  	spin_lock(&po->bind_lock); -	if (was_running && !po->running) { -		sock_hold(sk); -		po->running = 1; +	if (was_running) {  		po->num = num; -		dev_add_pack(&po->prot_hook); +		register_prot_hook(sk);  	}  	spin_unlock(&po->bind_lock); - +	if (closing && (po->tp_version > TPACKET_V2)) { +		/* Because we don't support block-based V3 on tx-ring */ +		if (!tx_ring) +			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); +	}  	release_sock(sk);  	if (pg_vec) @@ -2686,7 +4029,7 @@ static int packet_seq_show(struct seq_file *seq, void *v)  		const struct packet_sock *po = pkt_sk(s);  		seq_printf(seq, -			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n", +			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",  			   s,  			   atomic_read(&s->sk_refcnt),  			   s->sk_type, @@ -2694,7 +4037,7 @@ static int packet_seq_show(struct seq_file *seq, void *v)  			   po->ifindex,  			   po->running,  			   atomic_read(&s->sk_rmem_alloc), -			   sock_i_uid(s), +			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),  			   sock_i_ino(s));  	} @@ -2726,10 +4069,10 @@ static const struct file_operations packet_seq_fops = {  static int __net_init packet_net_init(struct net *net)  { -	spin_lock_init(&net->packet.sklist_lock); +	mutex_init(&net->packet.sklist_lock);  	INIT_HLIST_HEAD(&net->packet.sklist); -	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) +	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))  		return -ENOMEM;  	return 0; @@ -2737,7 +4080,7 @@ static int __net_init packet_net_init(struct net *net)  static void __net_exit packet_net_exit(struct net *net)  { -	proc_net_remove(net, "packet"); +	remove_proc_entry("packet", net->proc_net);  }  static struct pernet_operations packet_net_ops = { diff --git a/net/packet/diag.c b/net/packet/diag.c new file mode 100644 index 00000000000..92f2c7107ee --- /dev/null +++ b/net/packet/diag.c @@ -0,0 +1,264 @@ +#include <linux/module.h> +#include <linux/sock_diag.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/packet_diag.h> +#include <linux/percpu.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +#include "internal.h" + +static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) +{ +	struct packet_diag_info pinfo; + +	pinfo.pdi_index = po->ifindex; +	pinfo.pdi_version = po->tp_version; +	pinfo.pdi_reserve = po->tp_reserve; +	pinfo.pdi_copy_thresh = po->copy_thresh; +	pinfo.pdi_tstamp = po->tp_tstamp; + +	pinfo.pdi_flags = 0; +	if (po->running) +		pinfo.pdi_flags |= PDI_RUNNING; +	if (po->auxdata) +		pinfo.pdi_flags |= PDI_AUXDATA; +	if (po->origdev) +		pinfo.pdi_flags |= PDI_ORIGDEV; +	if (po->has_vnet_hdr) +		pinfo.pdi_flags |= PDI_VNETHDR; +	if (po->tp_loss) +		pinfo.pdi_flags |= PDI_LOSS; + +	return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo); +} + +static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb) +{ +	struct nlattr *mca; +	struct packet_mclist *ml; + +	mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST); +	if (!mca) +		return -EMSGSIZE; + +	rtnl_lock(); +	for (ml = po->mclist; ml; ml = ml->next) { +		struct packet_diag_mclist *dml; + +		dml = nla_reserve_nohdr(nlskb, sizeof(*dml)); +		if (!dml) { +			rtnl_unlock(); +			
nla_nest_cancel(nlskb, mca); +			return -EMSGSIZE; +		} + +		dml->pdmc_index = ml->ifindex; +		dml->pdmc_type = ml->type; +		dml->pdmc_alen = ml->alen; +		dml->pdmc_count = ml->count; +		BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr)); +		memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr)); +	} + +	rtnl_unlock(); +	nla_nest_end(nlskb, mca); + +	return 0; +} + +static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, +		struct sk_buff *nlskb) +{ +	struct packet_diag_ring pdr; + +	if (!ring->pg_vec || ((ver > TPACKET_V2) && +				(nl_type == PACKET_DIAG_TX_RING))) +		return 0; + +	pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; +	pdr.pdr_block_nr = ring->pg_vec_len; +	pdr.pdr_frame_size = ring->frame_size; +	pdr.pdr_frame_nr = ring->frame_max + 1; + +	if (ver > TPACKET_V2) { +		pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; +		pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; +		pdr.pdr_features = ring->prb_bdqc.feature_req_word; +	} else { +		pdr.pdr_retire_tmo = 0; +		pdr.pdr_sizeof_priv = 0; +		pdr.pdr_features = 0; +	} + +	return nla_put(nlskb, nl_type, sizeof(pdr), &pdr); +} + +static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb) +{ +	int ret; + +	mutex_lock(&po->pg_vec_lock); +	ret = pdiag_put_ring(&po->rx_ring, po->tp_version, +			PACKET_DIAG_RX_RING, skb); +	if (!ret) +		ret = pdiag_put_ring(&po->tx_ring, po->tp_version, +				PACKET_DIAG_TX_RING, skb); +	mutex_unlock(&po->pg_vec_lock); + +	return ret; +} + +static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb) +{ +	int ret = 0; + +	mutex_lock(&fanout_mutex); +	if (po->fanout) { +		u32 val; + +		val = (u32)po->fanout->id | ((u32)po->fanout->type << 16); +		ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val); +	} +	mutex_unlock(&fanout_mutex); + +	return ret; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, +			struct packet_diag_req *req, +			bool may_report_filterinfo, +			struct user_namespace *user_ns, +			u32 portid, u32 seq, u32 flags, int sk_ino) +{ +	struct nlmsghdr *nlh; +	struct packet_diag_msg *rp; +	struct packet_sock *po = pkt_sk(sk); + +	nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags); +	if (!nlh) +		return -EMSGSIZE; + +	rp = nlmsg_data(nlh); +	rp->pdiag_family = AF_PACKET; +	rp->pdiag_type = sk->sk_type; +	rp->pdiag_num = ntohs(po->num); +	rp->pdiag_ino = sk_ino; +	sock_diag_save_cookie(sk, rp->pdiag_cookie); + +	if ((req->pdiag_show & PACKET_SHOW_INFO) && +			pdiag_put_info(po, skb)) +		goto out_nlmsg_trim; + +	if ((req->pdiag_show & PACKET_SHOW_INFO) && +	    nla_put_u32(skb, PACKET_DIAG_UID, +			from_kuid_munged(user_ns, sock_i_uid(sk)))) +		goto out_nlmsg_trim; + +	if ((req->pdiag_show & PACKET_SHOW_MCLIST) && +			pdiag_put_mclist(po, skb)) +		goto out_nlmsg_trim; + +	if ((req->pdiag_show & PACKET_SHOW_RING_CFG) && +			pdiag_put_rings_cfg(po, skb)) +		goto out_nlmsg_trim; + +	if ((req->pdiag_show & PACKET_SHOW_FANOUT) && +			pdiag_put_fanout(po, skb)) +		goto out_nlmsg_trim; + +	if ((req->pdiag_show & PACKET_SHOW_MEMINFO) && +	    sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO)) +		goto out_nlmsg_trim; + +	if ((req->pdiag_show & PACKET_SHOW_FILTER) && +	    sock_diag_put_filterinfo(may_report_filterinfo, sk, skb, +				     PACKET_DIAG_FILTER)) +		goto out_nlmsg_trim; + +	return nlmsg_end(skb, nlh); + +out_nlmsg_trim: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} + +static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	int num = 0, s_num = 
cb->args[0]; +	struct packet_diag_req *req; +	struct net *net; +	struct sock *sk; +	bool may_report_filterinfo; + +	net = sock_net(skb->sk); +	req = nlmsg_data(cb->nlh); +	may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + +	mutex_lock(&net->packet.sklist_lock); +	sk_for_each(sk, &net->packet.sklist) { +		if (!net_eq(sock_net(sk), net)) +			continue; +		if (num < s_num) +			goto next; + +		if (sk_diag_fill(sk, skb, req, +				 may_report_filterinfo, +				 sk_user_ns(NETLINK_CB(cb->skb).sk), +				 NETLINK_CB(cb->skb).portid, +				 cb->nlh->nlmsg_seq, NLM_F_MULTI, +				 sock_i_ino(sk)) < 0) +			goto done; +next: +		num++; +	} +done: +	mutex_unlock(&net->packet.sklist_lock); +	cb->args[0] = num; + +	return skb->len; +} + +static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ +	int hdrlen = sizeof(struct packet_diag_req); +	struct net *net = sock_net(skb->sk); +	struct packet_diag_req *req; + +	if (nlmsg_len(h) < hdrlen) +		return -EINVAL; + +	req = nlmsg_data(h); +	/* Make it possible to support protocol filtering later */ +	if (req->sdiag_protocol) +		return -EINVAL; + +	if (h->nlmsg_flags & NLM_F_DUMP) { +		struct netlink_dump_control c = { +			.dump = packet_diag_dump, +		}; +		return netlink_dump_start(net->diag_nlsk, skb, h, &c); +	} else +		return -EOPNOTSUPP; +} + +static const struct sock_diag_handler packet_diag_handler = { +	.family = AF_PACKET, +	.dump = packet_diag_handler_dump, +}; + +static int __init packet_diag_init(void) +{ +	return sock_diag_register(&packet_diag_handler); +} + +static void __exit packet_diag_exit(void) +{ +	sock_diag_unregister(&packet_diag_handler); +} + +module_init(packet_diag_init); +module_exit(packet_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */); diff --git a/net/packet/internal.h b/net/packet/internal.h new file mode 100644 index 00000000000..eb9580a6b25 --- /dev/null +++ b/net/packet/internal.h @@ -0,0 +1,126 @@ +#ifndef __PACKET_INTERNAL_H__ +#define __PACKET_INTERNAL_H__ + +struct packet_mclist { +	struct packet_mclist	*next; +	int			ifindex; +	int			count; +	unsigned short		type; +	unsigned short		alen; +	unsigned char		addr[MAX_ADDR_LEN]; +}; + +/* kbdq - kernel block descriptor queue */ +struct tpacket_kbdq_core { +	struct pgv	*pkbdq; +	unsigned int	feature_req_word; +	unsigned int	hdrlen; +	unsigned char	reset_pending_on_curr_blk; +	unsigned char   delete_blk_timer; +	unsigned short	kactive_blk_num; +	unsigned short	blk_sizeof_priv; + +	/* last_kactive_blk_num: +	 * trick to see if user-space has caught up +	 * in order to avoid refreshing timer when every single pkt arrives. 
+	 */ +	unsigned short	last_kactive_blk_num; + +	char		*pkblk_start; +	char		*pkblk_end; +	int		kblk_size; +	unsigned int	knum_blocks; +	uint64_t	knxt_seq_num; +	char		*prev; +	char		*nxt_offset; +	struct sk_buff	*skb; + +	atomic_t	blk_fill_in_prog; + +	/* Default is set to 8ms */ +#define DEFAULT_PRB_RETIRE_TOV	(8) + +	unsigned short  retire_blk_tov; +	unsigned short  version; +	unsigned long	tov_in_jiffies; + +	/* timer to retire an outstanding block */ +	struct timer_list retire_blk_timer; +}; + +struct pgv { +	char *buffer; +}; + +struct packet_ring_buffer { +	struct pgv		*pg_vec; + +	unsigned int		head; +	unsigned int		frames_per_block; +	unsigned int		frame_size; +	unsigned int		frame_max; + +	unsigned int		pg_vec_order; +	unsigned int		pg_vec_pages; +	unsigned int		pg_vec_len; + +	unsigned int __percpu	*pending_refcnt; + +	struct tpacket_kbdq_core	prb_bdqc; +}; + +extern struct mutex fanout_mutex; +#define PACKET_FANOUT_MAX	256 + +struct packet_fanout { +#ifdef CONFIG_NET_NS +	struct net		*net; +#endif +	unsigned int		num_members; +	u16			id; +	u8			type; +	u8			flags; +	atomic_t		rr_cur; +	struct list_head	list; +	struct sock		*arr[PACKET_FANOUT_MAX]; +	int			next[PACKET_FANOUT_MAX]; +	spinlock_t		lock; +	atomic_t		sk_ref; +	struct packet_type	prot_hook ____cacheline_aligned_in_smp; +}; + +struct packet_sock { +	/* struct sock has to be the first member of packet_sock */ +	struct sock		sk; +	struct packet_fanout	*fanout; +	union  tpacket_stats_u	stats; +	struct packet_ring_buffer	rx_ring; +	struct packet_ring_buffer	tx_ring; +	int			copy_thresh; +	spinlock_t		bind_lock; +	struct mutex		pg_vec_lock; +	unsigned int		running:1,	/* prot_hook is attached*/ +				auxdata:1, +				origdev:1, +				has_vnet_hdr:1; +	int			ifindex;	/* bound device		*/ +	__be16			num; +	struct packet_mclist	*mclist; +	atomic_t		mapped; +	enum tpacket_versions	tp_version; +	unsigned int		tp_hdrlen; +	unsigned int		tp_reserve; +	unsigned int		tp_loss:1; +	unsigned int		tp_tx_has_off:1; +	unsigned int		tp_tstamp; +	struct net_device __rcu	*cached_dev; +	int			(*xmit)(struct sk_buff *skb); +	struct packet_type	prot_hook ____cacheline_aligned_in_smp; +}; + +static struct packet_sock *pkt_sk(struct sock *sk) +{ +	return (struct packet_sock *)sk; +} + +#endif  | 
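A usage note on the new diag module: it answers SOCK_DIAG_BY_FAMILY dump requests over NETLINK_SOCK_DIAG, which is what the ss tool's packet-socket listing builds on. A minimal standalone query might look like the sketch below; it is not part of the patch, assumes the packet_diag_req/packet_diag_msg layout from <linux/packet_diag.h>, and omits error handling and nested-attribute parsing.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/packet_diag.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct {
		struct nlmsghdr nlh;
		struct packet_diag_req req;
	} msg = {
		.nlh = {
			.nlmsg_len   = sizeof(msg),
			.nlmsg_type  = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family = AF_PACKET,
			.pdiag_show   = PACKET_SHOW_INFO |
					PACKET_SHOW_MEMINFO,
		},
	};
	char buf[8192];
	int done = 0;

	send(fd, &msg, sizeof(msg), 0);

	while (!done) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		if (len <= 0)
			break;
		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct packet_diag_msg *rp = NLMSG_DATA(h);

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR) {
				done = 1;
				break;
			}
			printf("ino %u type %u proto 0x%04x\n",
			       rp->pdiag_ino, rp->pdiag_type, rp->pdiag_num);
		}
	}
	close(fd);
	return 0;
}

Each packet_diag_msg in the dump is followed by the attributes selected via pdiag_show (PACKET_DIAG_INFO, PACKET_DIAG_MEMINFO, and so on), mirroring the sk_diag_fill() logic above.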
