Diffstat (limited to 'net/ipv4/tcp.c')
| -rw-r--r-- | net/ipv4/tcp.c | 1502 | 
1 file changed, 715 insertions, 787 deletions
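Among other things, the diff below adds MSG_FASTOPEN handling to tcp_sendmsg() (via tcp_sendmsg_fastopen()), so a client can put payload on the SYN when the TFO client bit of net.ipv4.tcp_fastopen is set. As context only — not part of this patch — here is a minimal userspace sketch of that client-side path; the address and port are placeholders.

```c
/* Illustrative client-side TCP Fast Open sketch (not part of this patch).
 * sendto() with MSG_FASTOPEN acts as connect()+send() in one call: the
 * payload rides on the SYN when a Fast Open cookie is available, otherwise
 * the kernel falls back to a normal three-way handshake.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),	/* placeholder port */
	};
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);	/* placeholder address */

	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("sendto(MSG_FASTOPEN)");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
```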
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2bb46d55f40..9d2118e5fbc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -245,6 +245,8 @@   *	TCP_CLOSE		socket is finished   */ +#define pr_fmt(fmt) "TCP: " fmt +  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/types.h> @@ -268,6 +270,7 @@  #include <linux/slab.h>  #include <net/icmp.h> +#include <net/inet_common.h>  #include <net/tcp.h>  #include <net/xfrm.h>  #include <net/ip.h> @@ -276,9 +279,14 @@  #include <asm/uaccess.h>  #include <asm/ioctls.h> +#include <net/busy_poll.h>  int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +int sysctl_tcp_min_tso_segs __read_mostly = 2; + +int sysctl_tcp_autocorking __read_mostly = 1; +  struct percpu_counter tcp_orphan_count;  EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -363,6 +371,61 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)  	return period;  } +/* Address-family independent initialization for a tcp_sock. + * + * NOTE: A lot of things set to zero explicitly by call to + *       sk_alloc() so need not be done here. + */ +void tcp_init_sock(struct sock *sk) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk); + +	__skb_queue_head_init(&tp->out_of_order_queue); +	tcp_init_xmit_timers(sk); +	tcp_prequeue_init(tp); +	INIT_LIST_HEAD(&tp->tsq_node); + +	icsk->icsk_rto = TCP_TIMEOUT_INIT; +	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + +	/* So many TCP implementations out there (incorrectly) count the +	 * initial SYN frame in their delayed-ACK and congestion control +	 * algorithms that we must have the following bandaid to talk +	 * efficiently to them.  -DaveM +	 */ +	tp->snd_cwnd = TCP_INIT_CWND; + +	/* See draft-stevens-tcpca-spec-01 for discussion of the +	 * initialization of these values. +	 */ +	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +	tp->snd_cwnd_clamp = ~0; +	tp->mss_cache = TCP_MSS_DEFAULT; + +	tp->reordering = sysctl_tcp_reordering; +	tcp_enable_early_retrans(tp); +	icsk->icsk_ca_ops = &tcp_init_congestion_ops; + +	tp->tsoffset = 0; + +	sk->sk_state = TCP_CLOSE; + +	sk->sk_write_space = sk_stream_write_space; +	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + +	icsk->icsk_sync_mss = tcp_sync_mss; + +	sk->sk_sndbuf = sysctl_tcp_wmem[1]; +	sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + +	local_bh_disable(); +	sock_update_memcg(sk); +	sk_sockets_allocated_inc(sk); +	local_bh_enable(); +} +EXPORT_SYMBOL(tcp_init_sock); +  /*   *	Wait for a TCP event.   * @@ -374,7 +437,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  {  	unsigned int mask;  	struct sock *sk = sock->sk; -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk); + +	sock_rps_record_flow(sk);  	sock_poll_wait(file, sk_sleep(sk), wait);  	if (sk->sk_state == TCP_LISTEN) @@ -419,8 +484,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  	if (sk->sk_shutdown & RCV_SHUTDOWN)  		mask |= POLLIN | POLLRDNORM | POLLRDHUP; -	/* Connected? */ -	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { +	/* Connected or passive Fast Open socket? 
*/ +	if (sk->sk_state != TCP_SYN_SENT && +	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {  		int target = sock_rcvlowat(sk, 0, INT_MAX);  		if (tp->urg_seq == tp->copied_seq && @@ -435,7 +501,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  			mask |= POLLIN | POLLRDNORM;  		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { -			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { +			if (sk_stream_is_writeable(sk)) {  				mask |= POLLOUT | POLLWRNORM;  			} else {  /* send SIGIO later */  				set_bit(SOCK_ASYNC_NOSPACE, @@ -446,7 +512,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  				 * wspace test but before the flags are set,  				 * IO signal will be lost.  				 */ -				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) +				if (sk_stream_is_writeable(sk))  					mask |= POLLOUT | POLLWRNORM;  			}  		} else @@ -468,30 +534,29 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int answ; +	bool slow;  	switch (cmd) {  	case SIOCINQ:  		if (sk->sk_state == TCP_LISTEN)  			return -EINVAL; -		lock_sock(sk); +		slow = lock_sock_fast(sk);  		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))  			answ = 0;  		else if (sock_flag(sk, SOCK_URGINLINE) ||  			 !tp->urg_data ||  			 before(tp->urg_seq, tp->copied_seq) ||  			 !before(tp->urg_seq, tp->rcv_nxt)) { -			struct sk_buff *skb;  			answ = tp->rcv_nxt - tp->copied_seq; -			/* Subtract 1, if FIN is in queue. */ -			skb = skb_peek_tail(&sk->sk_receive_queue); -			if (answ && skb) -				answ -= tcp_hdr(skb)->fin; +			/* Subtract 1, if FIN was received */ +			if (answ && sock_flag(sk, SOCK_DONE)) +				answ--;  		} else  			answ = tp->urg_seq - tp->copied_seq; -		release_sock(sk); +		unlock_sock_fast(sk, slow);  		break;  	case SIOCATMARK:  		answ = tp->urg_data && tp->urg_seq == tp->copied_seq; @@ -505,6 +570,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)  		else  			answ = tp->write_seq - tp->snd_una;  		break; +	case SIOCOUTQNSD: +		if (sk->sk_state == TCP_LISTEN) +			return -EINVAL; + +		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) +			answ = 0; +		else +			answ = tp->write_seq - tp->snd_nxt; +		break;  	default:  		return -ENOIOCTLCMD;  	} @@ -515,11 +589,11 @@ EXPORT_SYMBOL(tcp_ioctl);  static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)  { -	TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; +	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;  	tp->pushed_seq = tp->write_seq;  } -static inline int forced_push(struct tcp_sock *tp) +static inline bool forced_push(const struct tcp_sock *tp)  {  	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));  } @@ -531,7 +605,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)  	skb->csum    = 0;  	tcb->seq     = tcb->end_seq = tp->write_seq; -	tcb->flags   = TCPHDR_ACK; +	tcb->tcp_flags = TCPHDR_ACK;  	tcb->sacked  = 0;  	skb_header_release(skb);  	tcp_add_write_queue_tail(sk, skb); @@ -547,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)  		tp->snd_up = tp->write_seq;  } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, -			    int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have data packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. 
+ * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. + */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, +				int size_goal) +{ +	return skb->len < size_goal && +	       sysctl_tcp_autocorking && +	       skb != tcp_write_queue_head(sk) && +	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize; +} + +static void tcp_push(struct sock *sk, int flags, int mss_now, +		     int nonagle, int size_goal)  { -	if (tcp_send_head(sk)) { -		struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb; -		if (!(flags & MSG_MORE) || forced_push(tp)) -			tcp_mark_push(tp, tcp_write_queue_tail(sk)); +	if (!tcp_send_head(sk)) +		return; -		tcp_mark_urg(tp, flags); -		__tcp_push_pending_frames(sk, mss_now, -					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); +	skb = tcp_write_queue_tail(sk); +	if (!(flags & MSG_MORE) || forced_push(tp)) +		tcp_mark_push(tp, skb); + +	tcp_mark_urg(tp, flags); + +	if (tcp_should_autocork(sk, skb, size_goal)) { + +		/* avoid atomic op if TSQ_THROTTLED bit is already set */ +		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { +			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); +			set_bit(TSQ_THROTTLED, &tp->tsq_flags); +		} +		/* It is possible TX completion already happened +		 * before we set TSQ_THROTTLED. +		 */ +		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) +			return;  	} + +	if (flags & MSG_MORE) +		nonagle = TCP_NAGLE_CORK; + +	__tcp_push_pending_frames(sk, mss_now, nonagle);  }  static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -692,11 +805,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)  	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);  	if (skb) {  		if (sk_wmem_schedule(sk, skb->truesize)) { +			skb_reserve(skb, sk->sk_prot->max_header);  			/*  			 * Make sure that we have exactly size bytes  			 * available to the caller, no more, no less.  			 */ -			skb_reserve(skb, skb_tailroom(skb) - size); +			skb->reserved_tailroom = skb->end - skb->tail - size;  			return skb;  		}  		__kfree_skb(skb); @@ -716,10 +830,24 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,  	xmit_size_goal = mss_now;  	if (large_allowed && sk_can_gso(sk)) { -		xmit_size_goal = ((sk->sk_gso_max_size - 1) - -				  inet_csk(sk)->icsk_af_ops->net_header_len - -				  inet_csk(sk)->icsk_ext_hdr_len - -				  tp->tcp_header_len); +		u32 gso_size, hlen; + +		/* Maybe we should/could use sk->sk_prot->max_header here ? */ +		hlen = inet_csk(sk)->icsk_af_ops->net_header_len + +		       inet_csk(sk)->icsk_ext_hdr_len + +		       tp->tcp_header_len; + +		/* Goal is to send at least one packet per ms, +		 * not one big TSO packet every 100 ms. +		 * This preserves ACK clocking and is consistent +		 * with tcp_tso_should_defer() heuristic. 
+		 */ +		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); +		gso_size = max_t(u32, gso_size, +				 sysctl_tcp_min_tso_segs * mss_now); + +		xmit_size_goal = min_t(u32, gso_size, +				       sk->sk_gso_max_size - 1 - hlen);  		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); @@ -730,7 +858,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,  			   old_size_goal + mss_now > xmit_size_goal)) {  			xmit_size_goal = old_size_goal;  		} else { -			tp->xmit_size_goal_segs = xmit_size_goal / mss_now; +			tp->xmit_size_goal_segs = +				min_t(u16, xmit_size_goal / mss_now, +				      sk->sk_gso_max_segs);  			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;  		}  	} @@ -748,8 +878,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)  	return mss_now;  } -static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, -			 size_t psize, int flags) +static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, +				size_t size, int flags)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int mss_now, size_goal; @@ -757,10 +887,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse  	ssize_t copied;  	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); -	/* Wait for a connection to finish. */ -	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) +	/* Wait for a connection to finish. One exception is TCP Fast Open +	 * (passive side) where data is allowed to be sent before a connection +	 * is fully established. +	 */ +	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && +	    !tcp_passive_fastopen(sk)) {  		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)  			goto out_err; +	}  	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -771,12 +906,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse  	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))  		goto out_err; -	while (psize > 0) { +	while (size > 0) {  		struct sk_buff *skb = tcp_write_queue_tail(sk); -		struct page *page = pages[poffset / PAGE_SIZE]; -		int copy, i, can_coalesce; -		int offset = poffset % PAGE_SIZE; -		int size = min_t(size_t, psize, PAGE_SIZE - offset); +		int copy, i; +		bool can_coalesce;  		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {  new_segment: @@ -804,11 +937,12 @@ new_segment:  			goto wait_for_memory;  		if (can_coalesce) { -			skb_shinfo(skb)->frags[i - 1].size += copy; +			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);  		} else {  			get_page(page);  			skb_fill_page_desc(skb, i, page, offset, copy);  		} +		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;  		skb->len += copy;  		skb->data_len += copy; @@ -821,11 +955,11 @@ new_segment:  		skb_shinfo(skb)->gso_segs = 0;  		if (!copied) -			TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; +			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;  		copied += copy; -		poffset += copy; -		if (!(psize -= copy)) +		offset += copy; +		if (!(size -= copy))  			goto out;  		if (skb->len < size_goal || (flags & MSG_OOB)) @@ -841,8 +975,8 @@ new_segment:  wait_for_sndbuf:  		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  wait_for_memory: -		if (copied) -			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); +		tcp_push(sk, flags & ~MSG_MORE, mss_now, +			 TCP_NAGLE_PUSH, size_goal);  		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)  			goto do_error; @@ -851,8 +985,8 @@ wait_for_memory:  	}  out: -	if (copied) -		tcp_push(sk, flags, mss_now, tp->nonagle); +	if 
(copied && !(flags & MSG_SENDPAGE_NOTLAST)) +		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);  	return copied;  do_error: @@ -873,26 +1007,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,  					flags);  	lock_sock(sk); -	TCP_CHECK_TIMER(sk); -	res = do_tcp_sendpages(sk, &page, offset, size, flags); -	TCP_CHECK_TIMER(sk); +	res = do_tcp_sendpages(sk, page, offset, size, flags);  	release_sock(sk);  	return res;  }  EXPORT_SYMBOL(tcp_sendpage); -#define TCP_PAGE(sk)	(sk->sk_sndmsg_page) -#define TCP_OFF(sk)	(sk->sk_sndmsg_off) - -static inline int select_size(struct sock *sk, int sg) +static inline int select_size(const struct sock *sk, bool sg)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	int tmp = tp->mss_cache;  	if (sg) { -		if (sk_can_gso(sk)) -			tmp = 0; -		else { +		if (sk_can_gso(sk)) { +			/* Small frames wont use a full page: +			 * Payload will immediately follow tcp header. +			 */ +			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); +		} else {  			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);  			if (tmp >= pgbreak && @@ -904,28 +1036,88 @@ static inline int select_size(struct sock *sk, int sg)  	return tmp;  } +void tcp_free_fastopen_req(struct tcp_sock *tp) +{ +	if (tp->fastopen_req != NULL) { +		kfree(tp->fastopen_req); +		tp->fastopen_req = NULL; +	} +} + +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, +				int *copied, size_t size) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	int err, flags; + +	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) +		return -EOPNOTSUPP; +	if (tp->fastopen_req != NULL) +		return -EALREADY; /* Another Fast Open is in progress */ + +	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), +				   sk->sk_allocation); +	if (unlikely(tp->fastopen_req == NULL)) +		return -ENOBUFS; +	tp->fastopen_req->data = msg; +	tp->fastopen_req->size = size; + +	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; +	err = __inet_stream_connect(sk->sk_socket, msg->msg_name, +				    msg->msg_namelen, flags); +	*copied = tp->fastopen_req->copied; +	tcp_free_fastopen_req(tp); +	return err; +} +  int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		size_t size)  {  	struct iovec *iov;  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb; -	int iovlen, flags; -	int mss_now, size_goal; -	int sg, err, copied; +	int iovlen, flags, err, copied = 0; +	int mss_now = 0, size_goal, copied_syn = 0, offset = 0; +	bool sg;  	long timeo;  	lock_sock(sk); -	TCP_CHECK_TIMER(sk);  	flags = msg->msg_flags; +	if (flags & MSG_FASTOPEN) { +		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); +		if (err == -EINPROGRESS && copied_syn > 0) +			goto out; +		else if (err) +			goto out_err; +		offset = copied_syn; +	} +  	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); -	/* Wait for a connection to finish. */ -	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) +	/* Wait for a connection to finish. One exception is TCP Fast Open +	 * (passive side) where data is allowed to be sent before a connection +	 * is fully established. 
+	 */ +	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && +	    !tcp_passive_fastopen(sk)) {  		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) +			goto do_error; +	} + +	if (unlikely(tp->repair)) { +		if (tp->repair_queue == TCP_RECV_QUEUE) { +			copied = tcp_send_rcvq(sk, msg, size); +			goto out_nopush; +		} + +		err = -EINVAL; +		if (tp->repair_queue == TCP_NO_QUEUE)  			goto out_err; +		/* 'common' sending to sendq */ +	} +  	/* This should be in poll */  	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -940,13 +1132,22 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))  		goto out_err; -	sg = sk->sk_route_caps & NETIF_F_SG; +	sg = !!(sk->sk_route_caps & NETIF_F_SG);  	while (--iovlen >= 0) {  		size_t seglen = iov->iov_len;  		unsigned char __user *from = iov->iov_base;  		iov++; +		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */ +			if (offset >= seglen) { +				offset -= seglen; +				continue; +			} +			seglen -= offset; +			from += offset; +			offset = 0; +		}  		while (seglen > 0) {  			int copy = 0; @@ -974,6 +1175,13 @@ new_segment:  					goto wait_for_memory;  				/* +				 * All packets are restored as if they have +				 * already been sent. +				 */ +				if (tp->repair) +					TCP_SKB_CB(skb)->when = tcp_time_stamp; + +				/*  				 * Check whether we can use HW checksum.  				 */  				if (sk->sk_route_caps & NETIF_F_ALL_CSUM) @@ -989,85 +1197,54 @@ new_segment:  				copy = seglen;  			/* Where to copy to? */ -			if (skb_tailroom(skb) > 0) { +			if (skb_availroom(skb) > 0) {  				/* We have some space in skb head. Superb! */ -				if (copy > skb_tailroom(skb)) -					copy = skb_tailroom(skb); -				if ((err = skb_add_data(skb, from, copy)) != 0) +				copy = min_t(int, copy, skb_availroom(skb)); +				err = skb_add_data_nocache(sk, skb, from, copy); +				if (err)  					goto do_fault;  			} else { -				int merge = 0; +				bool merge = true;  				int i = skb_shinfo(skb)->nr_frags; -				struct page *page = TCP_PAGE(sk); -				int off = TCP_OFF(sk); - -				if (skb_can_coalesce(skb, i, page, off) && -				    off != PAGE_SIZE) { -					/* We can extend the last page -					 * fragment. */ -					merge = 1; -				} else if (i == MAX_SKB_FRAGS || !sg) { -					/* Need to add new fragment and cannot -					 * do this because interface is non-SG, -					 * or because all the page slots are -					 * busy. */ -					tcp_mark_push(tp, skb); -					goto new_segment; -				} else if (page) { -					if (off == PAGE_SIZE) { -						put_page(page); -						TCP_PAGE(sk) = page = NULL; -						off = 0; +				struct page_frag *pfrag = sk_page_frag(sk); + +				if (!sk_page_frag_refill(sk, pfrag)) +					goto wait_for_memory; + +				if (!skb_can_coalesce(skb, i, pfrag->page, +						      pfrag->offset)) { +					if (i == MAX_SKB_FRAGS || !sg) { +						tcp_mark_push(tp, skb); +						goto new_segment;  					} -				} else -					off = 0; +					merge = false; +				} -				if (copy > PAGE_SIZE - off) -					copy = PAGE_SIZE - off; +				copy = min_t(int, copy, pfrag->size - pfrag->offset);  				if (!sk_wmem_schedule(sk, copy))  					goto wait_for_memory; -				if (!page) { -					/* Allocate new cache page. */ -					if (!(page = sk_stream_alloc_page(sk))) -						goto wait_for_memory; -				} - -				/* Time to copy data. We are close to -				 * the end! 
*/ -				err = skb_copy_to_page(sk, from, skb, page, -						       off, copy); -				if (err) { -					/* If this page was new, give it to the -					 * socket so it does not get leaked. -					 */ -					if (!TCP_PAGE(sk)) { -						TCP_PAGE(sk) = page; -						TCP_OFF(sk) = 0; -					} +				err = skb_copy_to_page_nocache(sk, from, skb, +							       pfrag->page, +							       pfrag->offset, +							       copy); +				if (err)  					goto do_error; -				}  				/* Update the skb. */  				if (merge) { -					skb_shinfo(skb)->frags[i - 1].size += -									copy; +					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);  				} else { -					skb_fill_page_desc(skb, i, page, off, copy); -					if (TCP_PAGE(sk)) { -						get_page(page); -					} else if (off + copy < PAGE_SIZE) { -						get_page(page); -						TCP_PAGE(sk) = page; -					} +					skb_fill_page_desc(skb, i, pfrag->page, +							   pfrag->offset, copy); +					get_page(pfrag->page);  				} - -				TCP_OFF(sk) = off + copy; +				pfrag->offset += copy;  			}  			if (!copied) -				TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; +				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;  			tp->write_seq += copy;  			TCP_SKB_CB(skb)->end_seq += copy; @@ -1078,7 +1255,7 @@ new_segment:  			if ((seglen -= copy) == 0 && iovlen == 0)  				goto out; -			if (skb->len < max || (flags & MSG_OOB)) +			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))  				continue;  			if (forced_push(tp)) { @@ -1092,7 +1269,8 @@ wait_for_sndbuf:  			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  wait_for_memory:  			if (copied) -				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); +				tcp_push(sk, flags & ~MSG_MORE, mss_now, +					 TCP_NAGLE_PUSH, size_goal);  			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)  				goto do_error; @@ -1103,10 +1281,10 @@ wait_for_memory:  out:  	if (copied) -		tcp_push(sk, flags, mss_now, tp->nonagle); -	TCP_CHECK_TIMER(sk); +		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush:  	release_sock(sk); -	return copied; +	return copied + copied_syn;  do_fault:  	if (!skb->len) { @@ -1119,11 +1297,10 @@ do_fault:  	}  do_error: -	if (copied) +	if (copied + copied_syn)  		goto out;  out_err:  	err = sk_stream_error(sk, flags, err); -	TCP_CHECK_TIMER(sk);  	release_sock(sk);  	return err;  } @@ -1178,6 +1355,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)  	return -EAGAIN;  } +static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) +{ +	struct sk_buff *skb; +	int copied = 0, err = 0; + +	/* XXX -- need to support SO_PEEK_OFF */ + +	skb_queue_walk(&sk->sk_write_queue, skb) { +		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); +		if (err) +			break; + +		copied += skb->len; +	} + +	return err ?: copied; +} +  /* Clean up the receive buffer for full frames taken by the user,   * then send an ACK if necessary.  
COPIED is the number of bytes   * tcp_recvmsg has given to the user so far, it speeds up the @@ -1187,15 +1382,13 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)  void tcp_cleanup_rbuf(struct sock *sk, int copied)  {  	struct tcp_sock *tp = tcp_sk(sk); -	int time_to_ack = 0; +	bool time_to_ack = false; -#if TCP_DEBUG  	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);  	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),  	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",  	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); -#endif  	if (inet_csk_ack_scheduled(sk)) {  		const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1215,7 +1408,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)  		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&  		       !icsk->icsk_ack.pingpong)) &&  		      !atomic_read(&sk->sk_rmem_alloc))) -			time_to_ack = 1; +			time_to_ack = true;  	}  	/* We send an ACK if we can now advertise a non-zero window @@ -1237,7 +1430,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)  			 * "Lots" means "at least twice" here.  			 */  			if (new_window && new_window >= 2 * rcv_window_now) -				time_to_ack = 1; +				time_to_ack = true;  		}  	}  	if (time_to_ack) @@ -1273,12 +1466,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  		return;  	last_issued = tp->ucopy.dma_cookie; -	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +	dma_async_issue_pending(tp->ucopy.dma_chan);  	do { -		if (dma_async_memcpy_complete(tp->ucopy.dma_chan, +		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,  					      last_issued, &done, -					      &used) == DMA_SUCCESS) { +					      &used) == DMA_COMPLETE) {  			/* Safe to free early-copied skbs now */  			__skb_queue_purge(&sk->sk_async_wait_queue);  			break; @@ -1286,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  			struct sk_buff *skb;  			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&  			       (dma_async_is_complete(skb->dma_cookie, done, -						      used) == DMA_SUCCESS)) { +						      used) == DMA_COMPLETE)) {  				__skb_dequeue(&sk->sk_async_wait_queue);  				kfree_skb(skb);  			} @@ -1295,12 +1488,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  }  #endif -static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)  {  	struct sk_buff *skb;  	u32 offset; -	skb_queue_walk(&sk->sk_receive_queue, skb) { +	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {  		offset = seq - TCP_SKB_CB(skb)->seq;  		if (tcp_hdr(skb)->syn)  			offset--; @@ -1308,6 +1501,11 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)  			*off = offset;  			return skb;  		} +		/* This looks weird, but this can happen if TCP collapsing +		 * splitted a fat GRO packet, while we released socket lock +		 * in skb_splice_bits() +		 */ +		sk_eat_skb(sk, skb, false);  	}  	return NULL;  } @@ -1349,7 +1547,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,  					break;  			}  			used = recv_actor(desc, skb, offset, len); -			if (used < 0) { +			if (used <= 0) {  				if (!copied)  					copied = used;  				break; @@ -1358,22 +1556,26 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,  				copied += used;  				offset += used;  			} -			/* -			 * If recv_actor drops the lock (e.g. TCP splice +			/* If recv_actor drops the lock (e.g. 
TCP splice  			 * receive) the skb pointer might be invalid when  			 * getting here: tcp_collapse might have deleted it  			 * while aggregating skbs from the socket queue.  			 */ -			skb = tcp_recv_skb(sk, seq-1, &offset); -			if (!skb || (offset+1 != skb->len)) +			skb = tcp_recv_skb(sk, seq - 1, &offset); +			if (!skb)  				break; +			/* TCP coalescing might have appended data to the skb. +			 * Try to splice more frags +			 */ +			if (offset + 1 != skb->len) +				continue;  		}  		if (tcp_hdr(skb)->fin) { -			sk_eat_skb(sk, skb, 0); +			sk_eat_skb(sk, skb, false);  			++seq;  			break;  		} -		sk_eat_skb(sk, skb, 0); +		sk_eat_skb(sk, skb, false);  		if (!desc->count)  			break;  		tp->copied_seq = seq; @@ -1383,8 +1585,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,  	tcp_rcv_space_adjust(sk);  	/* Clean up data we have read: This will do ACK frames. */ -	if (copied > 0) +	if (copied > 0) { +		tcp_recv_skb(sk, seq, &offset);  		tcp_cleanup_rbuf(sk, copied); +	}  	return copied;  }  EXPORT_SYMBOL(tcp_read_sock); @@ -1409,13 +1613,15 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	int target;		/* Read at least this many bytes */  	long timeo;  	struct task_struct *user_recv = NULL; -	int copied_early = 0; +	bool copied_early = false;  	struct sk_buff *skb;  	u32 urg_hole = 0; -	lock_sock(sk); +	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && +	    (sk->sk_state == TCP_ESTABLISHED)) +		sk_busy_loop(sk, nonblock); -	TCP_CHECK_TIMER(sk); +	lock_sock(sk);  	err = -ENOTCONN;  	if (sk->sk_state == TCP_LISTEN) @@ -1427,6 +1633,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	if (flags & MSG_OOB)  		goto recv_urg; +	if (unlikely(tp->repair)) { +		err = -EPERM; +		if (!(flags & MSG_PEEK)) +			goto out; + +		if (tp->repair_queue == TCP_SEND_QUEUE) +			goto recv_sndq; + +		err = -EINVAL; +		if (tp->repair_queue == TCP_NO_QUEUE) +			goto out; + +		/* 'common' recv queue MSG_PEEK-ing */ +	} +  	seq = &tp->copied_seq;  	if (flags & MSG_PEEK) {  		peek_seq = tp->copied_seq; @@ -1447,12 +1668,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		if ((available < target) &&  		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&  		    !sysctl_tcp_low_latency && -		    dma_find_channel(DMA_MEMCPY)) { -			preempt_enable_no_resched(); +		    net_dma_find_channel()) { +			preempt_enable();  			tp->ucopy.pinned_list =  					dma_pin_iovec_pages(msg->msg_iov, len);  		} else { -			preempt_enable_no_resched(); +			preempt_enable();  		}  	}  #endif @@ -1588,8 +1809,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		}  #ifdef CONFIG_NET_DMA -		if (tp->ucopy.dma_chan) -			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +		if (tp->ucopy.dma_chan) { +			if (tp->rcv_wnd == 0 && +			    !skb_queue_empty(&sk->sk_async_wait_queue)) { +				tcp_service_net_dma(sk, true); +				tcp_cleanup_rbuf(sk, copied); +			} else +				dma_async_issue_pending(tp->ucopy.dma_chan); +		}  #endif  		if (copied >= target) {  			/* Do not sleep, just process backlog. 
*/ @@ -1628,9 +1855,9 @@ do_prequeue:  		}  		if ((flags & MSG_PEEK) &&  		    (peek_seq - copied - urg_hole != tp->copied_seq)) { -			if (net_ratelimit()) -				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", -				       current->comm, task_pid_nr(current)); +			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", +					    current->comm, +					    task_pid_nr(current));  			peek_seq = tp->copied_seq;  		}  		continue; @@ -1662,7 +1889,7 @@ do_prequeue:  		if (!(flags & MSG_TRUNC)) {  #ifdef CONFIG_NET_DMA  			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) -				tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); +				tp->ucopy.dma_chan = net_dma_find_channel();  			if (tp->ucopy.dma_chan) {  				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( @@ -1672,7 +1899,8 @@ do_prequeue:  				if (tp->ucopy.dma_cookie < 0) { -					printk(KERN_ALERT "dma_cookie < 0\n"); +					pr_alert("%s: dma_cookie < 0\n", +						 __func__);  					/* Exception. Bailout! */  					if (!copied) @@ -1680,10 +1908,10 @@ do_prequeue:  					break;  				} -				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +				dma_async_issue_pending(tp->ucopy.dma_chan);  				if ((offset + used) == skb->len) -					copied_early = 1; +					copied_early = true;  			} else  #endif @@ -1717,7 +1945,7 @@ skip_copy:  			goto found_fin_ok;  		if (!(flags & MSG_PEEK)) {  			sk_eat_skb(sk, skb, copied_early); -			copied_early = 0; +			copied_early = false;  		}  		continue; @@ -1726,7 +1954,7 @@ skip_copy:  		++*seq;  		if (!(flags & MSG_PEEK)) {  			sk_eat_skb(sk, skb, copied_early); -			copied_early = 0; +			copied_early = false;  		}  		break;  	} while (len > 0); @@ -1767,18 +1995,20 @@ skip_copy:  	/* Clean up data we have read: This will do ACK frames. */  	tcp_cleanup_rbuf(sk, copied); -	TCP_CHECK_TIMER(sk);  	release_sock(sk);  	return copied;  out: -	TCP_CHECK_TIMER(sk);  	release_sock(sk);  	return err;  recv_urg:  	err = tcp_recv_urg(sk, msg, len, flags);  	goto out; + +recv_sndq: +	err = tcp_peek_sndq(sk, msg, len); +	goto out;  }  EXPORT_SYMBOL(tcp_recvmsg); @@ -1875,6 +2105,20 @@ void tcp_shutdown(struct sock *sk, int how)  }  EXPORT_SYMBOL(tcp_shutdown); +bool tcp_check_oom(struct sock *sk, int shift) +{ +	bool too_many_orphans, out_of_socket_memory; + +	too_many_orphans = tcp_too_many_orphans(sk, shift); +	out_of_socket_memory = tcp_out_of_memory(sk); + +	if (too_many_orphans) +		net_info_ratelimited("too many orphaned sockets\n"); +	if (out_of_socket_memory) +		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); +	return too_many_orphans || out_of_socket_memory; +} +  void tcp_close(struct sock *sk, long timeout)  {  	struct sk_buff *skb; @@ -1917,7 +2161,9 @@ void tcp_close(struct sock *sk, long timeout)  	 * advertise a zero window, then kill -9 the FTP client, wheee...  	 * Note: timeout is always zero in such a case.  	 */ -	if (data_was_unread) { +	if (unlikely(tcp_sk(sk)->repair)) { +		sk->sk_prot->disconnect(sk, 0); +	} else if (data_was_unread) {  		/* Unread data was tossed, zap the connection. */  		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);  		tcp_set_state(sk, TCP_CLOSE); @@ -1951,6 +2197,10 @@ void tcp_close(struct sock *sk, long timeout)  		 * they look as CLOSING or LAST_ACK for Linux)  		 * Probably, I missed some more holelets.  		 * 						--ANK +		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN +		 * in a single packet! 
(May consider it later but will +		 * probably need API support or TCP_CORK SYN-ACK until +		 * data is written and socket is closed.)  		 */  		tcp_send_fin(sk);  	} @@ -1982,7 +2232,7 @@ adjudge_to_death:  	/*	This is a (useful) BSD violating of the RFC. There is a  	 *	problem with TCP as specified in that the other end could  	 *	keep a socket open forever with no application left this end. -	 *	We use a 3 minute timeout (about the same as BSD) then kill +	 *	We use a 1 minute timeout (about the same as BSD) then kill  	 *	our end. If they send after that then tough - BUT: long enough  	 *	that we won't make the old 4*rto = almost no time - whoops  	 *	reset mistake. @@ -2014,10 +2264,7 @@ adjudge_to_death:  	}  	if (sk->sk_state != TCP_CLOSE) {  		sk_mem_reclaim(sk); -		if (tcp_too_many_orphans(sk, 0)) { -			if (net_ratelimit()) -				printk(KERN_INFO "TCP: too many of orphaned " -				       "sockets\n"); +		if (tcp_check_oom(sk, 0)) {  			tcp_set_state(sk, TCP_CLOSE);  			tcp_send_active_reset(sk, GFP_ATOMIC);  			NET_INC_STATS_BH(sock_net(sk), @@ -2025,8 +2272,16 @@ adjudge_to_death:  		}  	} -	if (sk->sk_state == TCP_CLOSE) +	if (sk->sk_state == TCP_CLOSE) { +		struct request_sock *req = tcp_sk(sk)->fastopen_rsk; +		/* We could get here with a non-NULL req if the socket is +		 * aborted (e.g., closed with unread data) before 3WHS +		 * finishes. +		 */ +		if (req != NULL) +			reqsk_fastopen_remove(sk, req, false);  		inet_csk_destroy_sock(sk); +	}  	/* Otherwise, socket is reprieved until protocol close. */  out: @@ -2038,7 +2293,7 @@ EXPORT_SYMBOL(tcp_close);  /* These states need RST on ABORT according to RFC793 */ -static inline int tcp_need_reset(int state) +static inline bool tcp_need_reset(int state)  {  	return (1 << state) &  	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | @@ -2059,6 +2314,8 @@ int tcp_disconnect(struct sock *sk, int flags)  	/* ABORT function of RFC793 */  	if (old_state == TCP_LISTEN) {  		inet_csk_listen_stop(sk); +	} else if (unlikely(tp->repair)) { +		sk->sk_err = ECONNABORTED;  	} else if (tcp_need_reset(old_state) ||  		   (tp->snd_nxt != tp->write_seq &&  		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -2085,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags)  	sk->sk_shutdown = 0;  	sock_reset_flag(sk, SOCK_DONE); -	tp->srtt = 0; +	tp->srtt_us = 0;  	if ((tp->write_seq += tp->max_window + 2) == 0)  		tp->write_seq = 1;  	icsk->icsk_backoff = 0; @@ -2094,7 +2351,6 @@ int tcp_disconnect(struct sock *sk, int flags)  	tp->packets_out = 0;  	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;  	tp->snd_cwnd_cnt = 0; -	tp->bytes_acked = 0;  	tp->window_clamp = 0;  	tcp_set_ca_state(sk, TCP_CA_Open);  	tcp_clear_retrans(tp); @@ -2110,6 +2366,68 @@ int tcp_disconnect(struct sock *sk, int flags)  }  EXPORT_SYMBOL(tcp_disconnect); +void tcp_sock_destruct(struct sock *sk) +{ +	inet_sock_destruct(sk); + +	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); +} + +static inline bool tcp_can_repair_sock(const struct sock *sk) +{ +	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && +		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); +} + +static int tcp_repair_options_est(struct tcp_sock *tp, +		struct tcp_repair_opt __user *optbuf, unsigned int len) +{ +	struct tcp_repair_opt opt; + +	while (len >= sizeof(opt)) { +		if (copy_from_user(&opt, optbuf, sizeof(opt))) +			return -EFAULT; + +		optbuf++; +		len -= sizeof(opt); + +		switch (opt.opt_code) { +		case TCPOPT_MSS: +			tp->rx_opt.mss_clamp = opt.opt_val; +			break; +		
case TCPOPT_WINDOW: +			{ +				u16 snd_wscale = opt.opt_val & 0xFFFF; +				u16 rcv_wscale = opt.opt_val >> 16; + +				if (snd_wscale > 14 || rcv_wscale > 14) +					return -EFBIG; + +				tp->rx_opt.snd_wscale = snd_wscale; +				tp->rx_opt.rcv_wscale = rcv_wscale; +				tp->rx_opt.wscale_ok = 1; +			} +			break; +		case TCPOPT_SACK_PERM: +			if (opt.opt_val != 0) +				return -EINVAL; + +			tp->rx_opt.sack_ok |= TCP_SACK_SEEN; +			if (sysctl_tcp_fack) +				tcp_enable_fack(tp); +			break; +		case TCPOPT_TIMESTAMP: +			if (opt.opt_val != 0) +				return -EINVAL; + +			tp->rx_opt.tstamp_ok = 1; +			break; +		} +	} + +	return 0; +} +  /*   *	Socket option code for TCP.   */ @@ -2140,92 +2458,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		release_sock(sk);  		return err;  	} -	case TCP_COOKIE_TRANSACTIONS: { -		struct tcp_cookie_transactions ctd; -		struct tcp_cookie_values *cvp = NULL; - -		if (sizeof(ctd) > optlen) -			return -EINVAL; -		if (copy_from_user(&ctd, optval, sizeof(ctd))) -			return -EFAULT; - -		if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || -		    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) -			return -EINVAL; - -		if (ctd.tcpct_cookie_desired == 0) { -			/* default to global value */ -		} else if ((0x1 & ctd.tcpct_cookie_desired) || -			   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || -			   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { -			return -EINVAL; -		} - -		if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { -			/* Supercedes all other values */ -			lock_sock(sk); -			if (tp->cookie_values != NULL) { -				kref_put(&tp->cookie_values->kref, -					 tcp_cookie_values_release); -				tp->cookie_values = NULL; -			} -			tp->rx_opt.cookie_in_always = 0; /* false */ -			tp->rx_opt.cookie_out_never = 1; /* true */ -			release_sock(sk); -			return err; -		} - -		/* Allocate ancillary memory before locking. -		 */ -		if (ctd.tcpct_used > 0 || -		    (tp->cookie_values == NULL && -		     (sysctl_tcp_cookie_size > 0 || -		      ctd.tcpct_cookie_desired > 0 || -		      ctd.tcpct_s_data_desired > 0))) { -			cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, -				      GFP_KERNEL); -			if (cvp == NULL) -				return -ENOMEM; - -			kref_init(&cvp->kref); -		} -		lock_sock(sk); -		tp->rx_opt.cookie_in_always = -			(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); -		tp->rx_opt.cookie_out_never = 0; /* false */ - -		if (tp->cookie_values != NULL) { -			if (cvp != NULL) { -				/* Changed values are recorded by a changed -				 * pointer, ensuring the cookie will differ, -				 * without separately hashing each value later. -				 */ -				kref_put(&tp->cookie_values->kref, -					 tcp_cookie_values_release); -			} else { -				cvp = tp->cookie_values; -			} -		} - -		if (cvp != NULL) { -			cvp->cookie_desired = ctd.tcpct_cookie_desired; - -			if (ctd.tcpct_used > 0) { -				memcpy(cvp->s_data_payload, ctd.tcpct_value, -				       ctd.tcpct_used); -				cvp->s_data_desired = ctd.tcpct_used; -				cvp->s_data_constant = 1; /* true */ -			} else { -				/* No constant payload data. */ -				cvp->s_data_desired = ctd.tcpct_s_data_desired; -				cvp->s_data_constant = 0; /* false */ -			} - -			tp->cookie_values = cvp; -		} -		release_sock(sk); -		return err; -	}  	default:  		/* fallthru */  		break; @@ -2244,7 +2476,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		/* Values greater than interface MTU won't take effect. 
However  		 * at the point when this call is done we typically don't yet  		 * know which interface is going to be used */ -		if (val < 64 || val > MAX_TCP_WINDOW) { +		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {  			err = -EINVAL;  			break;  		} @@ -2278,8 +2510,58 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  	case TCP_THIN_DUPACK:  		if (val < 0 || val > 1)  			err = -EINVAL; -		else +		else {  			tp->thin_dupack = val; +			if (tp->thin_dupack) +				tcp_disable_early_retrans(tp); +		} +		break; + +	case TCP_REPAIR: +		if (!tcp_can_repair_sock(sk)) +			err = -EPERM; +		else if (val == 1) { +			tp->repair = 1; +			sk->sk_reuse = SK_FORCE_REUSE; +			tp->repair_queue = TCP_NO_QUEUE; +		} else if (val == 0) { +			tp->repair = 0; +			sk->sk_reuse = SK_NO_REUSE; +			tcp_send_window_probe(sk); +		} else +			err = -EINVAL; + +		break; + +	case TCP_REPAIR_QUEUE: +		if (!tp->repair) +			err = -EPERM; +		else if (val < TCP_QUEUES_NR) +			tp->repair_queue = val; +		else +			err = -EINVAL; +		break; + +	case TCP_QUEUE_SEQ: +		if (sk->sk_state != TCP_CLOSE) +			err = -EPERM; +		else if (tp->repair_queue == TCP_SEND_QUEUE) +			tp->write_seq = val; +		else if (tp->repair_queue == TCP_RECV_QUEUE) +			tp->rcv_nxt = val; +		else +			err = -EINVAL; +		break; + +	case TCP_REPAIR_OPTIONS: +		if (!tp->repair) +			err = -EINVAL; +		else if (sk->sk_state == TCP_ESTABLISHED) +			err = tcp_repair_options_est(tp, +					(struct tcp_repair_opt __user *)optval, +					optlen); +		else +			err = -EPERM;  		break;  	case TCP_CORK: @@ -2394,7 +2676,28 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		/* Cap the max timeout in ms TCP will retry/retrans  		 * before giving up and aborting (ETIMEDOUT) a connection.  		 */ -		icsk->icsk_user_timeout = msecs_to_jiffies(val); +		if (val < 0) +			err = -EINVAL; +		else +			icsk->icsk_user_timeout = msecs_to_jiffies(val); +		break; + +	case TCP_FASTOPEN: +		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | +		    TCPF_LISTEN))) +			err = fastopen_init_queue(sk, val); +		else +			err = -EINVAL; +		break; +	case TCP_TIMESTAMP: +		if (!tp->repair) +			err = -EPERM; +		else +			tp->tsoffset = val - tcp_time_stamp; +		break; +	case TCP_NOTSENT_LOWAT: +		tp->notsent_lowat = val; +		sk->sk_write_space(sk);  		break;  	default:  		err = -ENOPROTOOPT; @@ -2408,7 +2711,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,  		   unsigned int optlen)  { -	struct inet_connection_sock *icsk = inet_csk(sk); +	const struct inet_connection_sock *icsk = inet_csk(sk);  	if (level != SOL_TCP)  		return icsk->icsk_af_ops->setsockopt(sk, level, optname, @@ -2430,9 +2733,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);  #endif  /* Return information about state of tcp endpoint in API format. 
*/ -void tcp_get_info(struct sock *sk, struct tcp_info *info) +void tcp_get_info(const struct sock *sk, struct tcp_info *info)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk);  	u32 now = tcp_time_stamp; @@ -2454,8 +2757,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;  	} -	if (tp->ecn_flags&TCP_ECN_OK) +	if (tp->ecn_flags & TCP_ECN_OK)  		info->tcpi_options |= TCPI_OPT_ECN; +	if (tp->ecn_flags & TCP_ECN_SEEN) +		info->tcpi_options |= TCPI_OPT_ECN_SEEN; +	if (tp->syn_data_acked) +		info->tcpi_options |= TCPI_OPT_SYN_DATA;  	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);  	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); @@ -2479,8 +2786,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;  	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; -	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; -	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; +	info->tcpi_rtt = tp->srtt_us >> 3; +	info->tcpi_rttvar = tp->mdev_us >> 2;  	info->tcpi_snd_ssthresh = tp->snd_ssthresh;  	info->tcpi_snd_cwnd = tp->snd_cwnd;  	info->tcpi_advmss = tp->advmss; @@ -2490,6 +2797,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  	info->tcpi_rcv_space = tp->rcvq_space.space;  	info->tcpi_total_retrans = tp->total_retrans; + +	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? +					sk->sk_pacing_rate : ~0ULL; +	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? +					sk->sk_max_pacing_rate : ~0ULL;  }  EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2513,6 +2825,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  		val = tp->mss_cache;  		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))  			val = tp->rx_opt.user_mss; +		if (tp->repair) +			val = tp->rx_opt.mss_clamp;  		break;  	case TCP_NODELAY:  		val = !!(tp->nonagle&TCP_NAGLE_OFF); @@ -2573,41 +2887,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  			return -EFAULT;  		return 0; -	case TCP_COOKIE_TRANSACTIONS: { -		struct tcp_cookie_transactions ctd; -		struct tcp_cookie_values *cvp = tp->cookie_values; - -		if (get_user(len, optlen)) -			return -EFAULT; -		if (len < sizeof(ctd)) -			return -EINVAL; - -		memset(&ctd, 0, sizeof(ctd)); -		ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? -				   TCP_COOKIE_IN_ALWAYS : 0) -				| (tp->rx_opt.cookie_out_never ? -				   TCP_COOKIE_OUT_NEVER : 0); - -		if (cvp != NULL) { -			ctd.tcpct_flags |= (cvp->s_data_in ? -					    TCP_S_DATA_IN : 0) -					 | (cvp->s_data_out ? 
-					    TCP_S_DATA_OUT : 0); - -			ctd.tcpct_cookie_desired = cvp->cookie_desired; -			ctd.tcpct_s_data_desired = cvp->s_data_desired; - -			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], -			       cvp->cookie_pair_size); -			ctd.tcpct_used = cvp->cookie_pair_size; -		} - -		if (put_user(sizeof(ctd), optlen)) -			return -EFAULT; -		if (copy_to_user(optval, &ctd, sizeof(ctd))) -			return -EFAULT; -		return 0; -	}  	case TCP_THIN_LINEAR_TIMEOUTS:  		val = tp->thin_lto;  		break; @@ -2615,9 +2894,43 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  		val = tp->thin_dupack;  		break; +	case TCP_REPAIR: +		val = tp->repair; +		break; + +	case TCP_REPAIR_QUEUE: +		if (tp->repair) +			val = tp->repair_queue; +		else +			return -EINVAL; +		break; + +	case TCP_QUEUE_SEQ: +		if (tp->repair_queue == TCP_SEND_QUEUE) +			val = tp->write_seq; +		else if (tp->repair_queue == TCP_RECV_QUEUE) +			val = tp->rcv_nxt; +		else +			return -EINVAL; +		break; +  	case TCP_USER_TIMEOUT:  		val = jiffies_to_msecs(icsk->icsk_user_timeout);  		break; + +	case TCP_FASTOPEN: +		if (icsk->icsk_accept_queue.fastopenq != NULL) +			val = icsk->icsk_accept_queue.fastopenq->max_qlen; +		else +			val = 0; +		break; + +	case TCP_TIMESTAMP: +		val = tcp_time_stamp + tp->tsoffset; +		break; +	case TCP_NOTSENT_LOWAT: +		val = tp->notsent_lowat; +		break;  	default:  		return -ENOPROTOOPT;  	} @@ -2653,313 +2966,62 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,  EXPORT_SYMBOL(compat_tcp_getsockopt);  #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) -{ -	struct sk_buff *segs = ERR_PTR(-EINVAL); -	struct tcphdr *th; -	unsigned thlen; -	unsigned int seq; -	__be32 delta; -	unsigned int oldlen; -	unsigned int mss; - -	if (!pskb_may_pull(skb, sizeof(*th))) -		goto out; - -	th = tcp_hdr(skb); -	thlen = th->doff * 4; -	if (thlen < sizeof(*th)) -		goto out; - -	if (!pskb_may_pull(skb, thlen)) -		goto out; - -	oldlen = (u16)~skb->len; -	__skb_pull(skb, thlen); - -	mss = skb_shinfo(skb)->gso_size; -	if (unlikely(skb->len <= mss)) -		goto out; - -	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { -		/* Packet is from an untrusted source, reset gso_segs. 
*/ -		int type = skb_shinfo(skb)->gso_type; - -		if (unlikely(type & -			     ~(SKB_GSO_TCPV4 | -			       SKB_GSO_DODGY | -			       SKB_GSO_TCP_ECN | -			       SKB_GSO_TCPV6 | -			       0) || -			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) -			goto out; - -		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - -		segs = NULL; -		goto out; -	} - -	segs = skb_segment(skb, features); -	if (IS_ERR(segs)) -		goto out; - -	delta = htonl(oldlen + (thlen + mss)); - -	skb = segs; -	th = tcp_hdr(skb); -	seq = ntohl(th->seq); - -	do { -		th->fin = th->psh = 0; - -		th->check = ~csum_fold((__force __wsum)((__force u32)th->check + -				       (__force u32)delta)); -		if (skb->ip_summed != CHECKSUM_PARTIAL) -			th->check = -			     csum_fold(csum_partial(skb_transport_header(skb), -						    thlen, skb->csum)); - -		seq += mss; -		skb = skb->next; -		th = tcp_hdr(skb); - -		th->seq = htonl(seq); -		th->cwr = 0; -	} while (skb->next); - -	delta = htonl(oldlen + (skb->tail - skb->transport_header) + -		      skb->data_len); -	th->check = ~csum_fold((__force __wsum)((__force u32)th->check + -				(__force u32)delta)); -	if (skb->ip_summed != CHECKSUM_PARTIAL) -		th->check = csum_fold(csum_partial(skb_transport_header(skb), -						   thlen, skb->csum)); - -out: -	return segs; -} -EXPORT_SYMBOL(tcp_tso_segment); - -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ -	struct sk_buff **pp = NULL; -	struct sk_buff *p; -	struct tcphdr *th; -	struct tcphdr *th2; -	unsigned int len; -	unsigned int thlen; -	__be32 flags; -	unsigned int mss = 1; -	unsigned int hlen; -	unsigned int off; -	int flush = 1; -	int i; - -	off = skb_gro_offset(skb); -	hlen = off + sizeof(*th); -	th = skb_gro_header_fast(skb, off); -	if (skb_gro_header_hard(skb, hlen)) { -		th = skb_gro_header_slow(skb, hlen, off); -		if (unlikely(!th)) -			goto out; -	} - -	thlen = th->doff * 4; -	if (thlen < sizeof(*th)) -		goto out; - -	hlen = off + thlen; -	if (skb_gro_header_hard(skb, hlen)) { -		th = skb_gro_header_slow(skb, hlen, off); -		if (unlikely(!th)) -			goto out; -	} - -	skb_gro_pull(skb, thlen); - -	len = skb_gro_len(skb); -	flags = tcp_flag_word(th); - -	for (; (p = *head); head = &p->next) { -		if (!NAPI_GRO_CB(p)->same_flow) -			continue; - -		th2 = tcp_hdr(p); - -		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { -			NAPI_GRO_CB(p)->same_flow = 0; -			continue; -		} - -		goto found; -	} - -	goto out_check_final; - -found: -	flush = NAPI_GRO_CB(p)->flush; -	flush |= (__force int)(flags & TCP_FLAG_CWR); -	flush |= (__force int)((flags ^ tcp_flag_word(th2)) & -		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); -	flush |= (__force int)(th->ack_seq ^ th2->ack_seq); -	for (i = sizeof(*th); i < thlen; i += 4) -		flush |= *(u32 *)((u8 *)th + i) ^ -			 *(u32 *)((u8 *)th2 + i); - -	mss = skb_shinfo(p)->gso_size; - -	flush |= (len - 1) >= mss; -	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - -	if (flush || skb_gro_receive(head, skb)) { -		mss = 1; -		goto out_check_final; -	} - -	p = *head; -	th2 = tcp_hdr(p); -	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); - -out_check_final: -	flush = len < mss; -	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | -					TCP_FLAG_RST | TCP_FLAG_SYN | -					TCP_FLAG_FIN)); - -	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) -		pp = head; - -out: -	NAPI_GRO_CB(skb)->flush |= flush; - -	return pp; -} -EXPORT_SYMBOL(tcp_gro_receive); - -int tcp_gro_complete(struct sk_buff *skb) -{ -	struct tcphdr *th = tcp_hdr(skb); - -	
skb->csum_start = skb_transport_header(skb) - skb->head; -	skb->csum_offset = offsetof(struct tcphdr, check); -	skb->ip_summed = CHECKSUM_PARTIAL; - -	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - -	if (th->cwr) -		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - -	return 0; -} -EXPORT_SYMBOL(tcp_gro_complete); -  #ifdef CONFIG_TCP_MD5SIG -static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; -static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; +static DEFINE_MUTEX(tcp_md5sig_mutex); -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) +static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)  {  	int cpu; -	for_each_possible_cpu(cpu) { -		struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); -		if (p) { -			if (p->md5_desc.tfm) -				crypto_free_hash(p->md5_desc.tfm); -			kfree(p); -		} -	} -	free_percpu(pool); -} -void tcp_free_md5sig_pool(void) -{ -	struct tcp_md5sig_pool * __percpu *pool = NULL; +	for_each_possible_cpu(cpu) { +		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); -	spin_lock_bh(&tcp_md5sig_pool_lock); -	if (--tcp_md5sig_users == 0) { -		pool = tcp_md5sig_pool; -		tcp_md5sig_pool = NULL; +		if (p->md5_desc.tfm) +			crypto_free_hash(p->md5_desc.tfm);  	} -	spin_unlock_bh(&tcp_md5sig_pool_lock); -	if (pool) -		__tcp_free_md5sig_pool(pool); +	free_percpu(pool);  } -EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool * __percpu * -__tcp_alloc_md5sig_pool(struct sock *sk) +static void __tcp_alloc_md5sig_pool(void)  {  	int cpu; -	struct tcp_md5sig_pool * __percpu *pool; +	struct tcp_md5sig_pool __percpu *pool; -	pool = alloc_percpu(struct tcp_md5sig_pool *); +	pool = alloc_percpu(struct tcp_md5sig_pool);  	if (!pool) -		return NULL; +		return;  	for_each_possible_cpu(cpu) { -		struct tcp_md5sig_pool *p;  		struct crypto_hash *hash; -		p = kzalloc(sizeof(*p), sk->sk_allocation); -		if (!p) -			goto out_free; -		*per_cpu_ptr(pool, cpu) = p; -  		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); -		if (!hash || IS_ERR(hash)) +		if (IS_ERR_OR_NULL(hash))  			goto out_free; -		p->md5_desc.tfm = hash; +		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;  	} -	return pool; +	/* before setting tcp_md5sig_pool, we must commit all writes +	 * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool() +	 */ +	smp_wmb(); +	tcp_md5sig_pool = pool; +	return;  out_free:  	__tcp_free_md5sig_pool(pool); -	return NULL;  } -struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) +bool tcp_alloc_md5sig_pool(void)  { -	struct tcp_md5sig_pool * __percpu *pool; -	int alloc = 0; - -retry: -	spin_lock_bh(&tcp_md5sig_pool_lock); -	pool = tcp_md5sig_pool; -	if (tcp_md5sig_users++ == 0) { -		alloc = 1; -		spin_unlock_bh(&tcp_md5sig_pool_lock); -	} else if (!pool) { -		tcp_md5sig_users--; -		spin_unlock_bh(&tcp_md5sig_pool_lock); -		cpu_relax(); -		goto retry; -	} else -		spin_unlock_bh(&tcp_md5sig_pool_lock); - -	if (alloc) { -		/* we cannot hold spinlock here because this may sleep. */ -		struct tcp_md5sig_pool * __percpu *p; - -		p = __tcp_alloc_md5sig_pool(sk); -		spin_lock_bh(&tcp_md5sig_pool_lock); -		if (!p) { -			tcp_md5sig_users--; -			spin_unlock_bh(&tcp_md5sig_pool_lock); -			return NULL; -		} -		pool = tcp_md5sig_pool; -		if (pool) { -			/* oops, it has already been assigned. 
*/ -			spin_unlock_bh(&tcp_md5sig_pool_lock); -			__tcp_free_md5sig_pool(p); -		} else { -			tcp_md5sig_pool = pool = p; -			spin_unlock_bh(&tcp_md5sig_pool_lock); -		} +	if (unlikely(!tcp_md5sig_pool)) { +		mutex_lock(&tcp_md5sig_mutex); + +		if (!tcp_md5sig_pool) +			__tcp_alloc_md5sig_pool(); + +		mutex_unlock(&tcp_md5sig_mutex);  	} -	return pool; +	return tcp_md5sig_pool != NULL;  }  EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -2973,56 +3035,45 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);   */  struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)  { -	struct tcp_md5sig_pool * __percpu *p; +	struct tcp_md5sig_pool __percpu *p;  	local_bh_disable(); - -	spin_lock(&tcp_md5sig_pool_lock); -	p = tcp_md5sig_pool; +	p = ACCESS_ONCE(tcp_md5sig_pool);  	if (p) -		tcp_md5sig_users++; -	spin_unlock(&tcp_md5sig_pool_lock); - -	if (p) -		return *this_cpu_ptr(p); +		return __this_cpu_ptr(p);  	local_bh_enable();  	return NULL;  }  EXPORT_SYMBOL(tcp_get_md5sig_pool); -void tcp_put_md5sig_pool(void) -{ -	local_bh_enable(); -	tcp_free_md5sig_pool(); -} -EXPORT_SYMBOL(tcp_put_md5sig_pool); -  int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, -			struct tcphdr *th) +			const struct tcphdr *th)  {  	struct scatterlist sg; +	struct tcphdr hdr;  	int err; -	__sum16 old_checksum = th->check; -	th->check = 0; +	/* We are not allowed to change tcphdr, make a local copy */ +	memcpy(&hdr, th, sizeof(hdr)); +	hdr.check = 0; +  	/* options aren't included in the hash */ -	sg_init_one(&sg, th, sizeof(struct tcphdr)); -	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); -	th->check = old_checksum; +	sg_init_one(&sg, &hdr, sizeof(hdr)); +	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));  	return err;  }  EXPORT_SYMBOL(tcp_md5_hash_header);  int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, -			  struct sk_buff *skb, unsigned header_len) +			  const struct sk_buff *skb, unsigned int header_len)  {  	struct scatterlist sg;  	const struct tcphdr *tp = tcp_hdr(skb);  	struct hash_desc *desc = &hp->md5_desc; -	unsigned i; -	const unsigned head_data_len = skb_headlen(skb) > header_len ? -				       skb_headlen(skb) - header_len : 0; +	unsigned int i; +	const unsigned int head_data_len = skb_headlen(skb) > header_len ? +					   skb_headlen(skb) - header_len : 0;  	const struct skb_shared_info *shi = skb_shinfo(skb);  	struct sk_buff *frag_iter; @@ -3034,8 +3085,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  	for (i = 0; i < shi->nr_frags; ++i) {  		const struct skb_frag_struct *f = &shi->frags[i]; -		sg_set_page(&sg, f->page, f->size, f->page_offset); -		if (crypto_hash_update(desc, &sg, f->size)) +		unsigned int offset = f->page_offset; +		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); + +		sg_set_page(&sg, page, skb_frag_size(f), +			    offset_in_page(offset)); +		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))  			return 1;  	} @@ -3047,7 +3102,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  }  EXPORT_SYMBOL(tcp_md5_hash_skb_data); -int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)  {  	struct scatterlist sg; @@ -3058,142 +3113,17 @@ EXPORT_SYMBOL(tcp_md5_hash_key);  #endif -/** - * Each Responder maintains up to two secret values concurrently for - * efficient secret rollover.  Each secret value has 4 states: - * - * Generating.  
- *    Generates new Responder-Cookies, but not yet used for primary
- *    verification.  This is a short-term state, typically lasting only
- *    one round trip time (RTT).
- *
- * Primary.  (tcp_secret_generating == tcp_secret_primary)
- *    Used both for generation and primary verification.
- *
- * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
- *    Used for verification, until the first failure that can be
- *    verified by the newer Generating secret.  At that time, this
- *    cookie's state is changed to Secondary, and the Generating
- *    cookie's state is changed to Primary.  This is a short-term state,
- *    typically lasting only one round trip time (RTT).
- *
- * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
- *    Used for secondary verification, after primary verification
- *    failures.  This state lasts no more than twice the Maximum Segment
- *    Lifetime (2MSL).  Then, the secret is discarded.
- */
-struct tcp_cookie_secret {
-	/* The secret is divided into two parts.  The digest part is the
-	 * equivalent of previously hashing a secret and saving the state,
-	 * and serves as an initialization vector (IV).  The message part
-	 * serves as the trailing secret.
-	 */
-	u32				secrets[COOKIE_WORKSPACE_WORDS];
-	unsigned long			expires;
-};
-
-#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
-#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
-#define TCP_SECRET_LIFE (HZ * 600)
-
-static struct tcp_cookie_secret tcp_secret_one;
-static struct tcp_cookie_secret tcp_secret_two;
-
-/* Essentially a circular list, without dynamic allocation. */
-static struct tcp_cookie_secret *tcp_secret_generating;
-static struct tcp_cookie_secret *tcp_secret_primary;
-static struct tcp_cookie_secret *tcp_secret_retiring;
-static struct tcp_cookie_secret *tcp_secret_secondary;
-
-static DEFINE_SPINLOCK(tcp_secret_locker);
-
-/* Select a pseudo-random word in the cookie workspace.
- */
-static inline u32 tcp_cookie_work(const u32 *ws, const int n)
-{
-	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
-}
-
-/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
- * Called in softirq context.
- * Returns: 0 for success.
- */
-int tcp_cookie_generator(u32 *bakery)
-{
-	unsigned long jiffy = jiffies;
-
-	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
-		spin_lock_bh(&tcp_secret_locker);
-		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
-			/* refreshed by another */
-			memcpy(bakery,
-			       &tcp_secret_generating->secrets[0],
-			       COOKIE_WORKSPACE_WORDS);
-		} else {
-			/* still needs refreshing */
-			get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
-
-			/* The first time, paranoia assumes that the
-			 * randomization function isn't as strong.  But,
-			 * this secret initialization is delayed until
-			 * the last possible moment (packet arrival).
-			 * Although that time is observable, it is
-			 * unpredictably variable.  Mash in the most
-			 * volatile clock bits available, and expire the
-			 * secret extra quickly.
-			 */
-			if (unlikely(tcp_secret_primary->expires ==
-				     tcp_secret_secondary->expires)) {
-				struct timespec tv;
-
-				getnstimeofday(&tv);
-				bakery[COOKIE_DIGEST_WORDS+0] ^=
-					(u32)tv.tv_nsec;
-
-				tcp_secret_secondary->expires = jiffy
-					+ TCP_SECRET_1MSL
-					+ (0x0f & tcp_cookie_work(bakery, 0));
-			} else {
-				tcp_secret_secondary->expires = jiffy
-					+ TCP_SECRET_LIFE
-					+ (0xff & tcp_cookie_work(bakery, 1));
-				tcp_secret_primary->expires = jiffy
-					+ TCP_SECRET_2MSL
-					+ (0x1f & tcp_cookie_work(bakery, 2));
-			}
-			memcpy(&tcp_secret_secondary->secrets[0],
-			       bakery, COOKIE_WORKSPACE_WORDS);
-
-			rcu_assign_pointer(tcp_secret_generating,
-					   tcp_secret_secondary);
-			rcu_assign_pointer(tcp_secret_retiring,
-					   tcp_secret_primary);
-			/*
-			 * Neither call_rcu() nor synchronize_rcu() needed.
-			 * Retiring data is not freed.  It is replaced after
-			 * further (locked) pointer updates, and a quiet time
-			 * (minimum 1MSL, maximum LIFE - 2MSL).
-			 */
-		}
-		spin_unlock_bh(&tcp_secret_locker);
-	} else {
-		rcu_read_lock_bh();
-		memcpy(bakery,
-		       &rcu_dereference(tcp_secret_generating)->secrets[0],
-		       COOKIE_WORKSPACE_WORDS);
-		rcu_read_unlock_bh();
-	}
-	return 0;
-}
-EXPORT_SYMBOL(tcp_cookie_generator);
-
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
@@ -3209,19 +3139,34 @@ extern struct tcp_congestion_ops tcp_reno;
 
 static __initdata unsigned long thash_entries;
 static int __init set_thash_entries(char *str)
 {
+	ssize_t ret;
+
 	if (!str)
 		return 0;
-	thash_entries = simple_strtoul(str, &str, 0);
+
+	ret = kstrtoul(str, 0, &thash_entries);
+	if (ret)
+		return 0;
+
 	return 1;
 }
 __setup("thash_entries=", set_thash_entries);
 
+static void tcp_init_mem(void)
+{
+	unsigned long limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	sysctl_tcp_mem[0] = limit / 4 * 3;
+	sysctl_tcp_mem[1] = limit;
+	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+}
+
 void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
-	unsigned long nr_pages, limit;
-	int i, max_share, cnt;
-	unsigned long jiffy = jiffies;
+	unsigned long limit;
+	int max_rshare, max_wshare, cnt;
+	unsigned int i;
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
@@ -3241,29 +3186,28 @@ void __init tcp_init(void)
 		alloc_large_system_hash("TCP established",
 					sizeof(struct inet_ehash_bucket),
 					thash_entries,
-					(totalram_pages >= 128 * 1024) ?
-					13 : 15,
+					17, /* one slot per 128 KB of memory */
 					0,
 					NULL,
 					&tcp_hashinfo.ehash_mask,
+					0,
 					thash_entries ? 0 : 512 * 1024);
-	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
 		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
-		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
-	}
+
 	if (inet_ehash_locks_alloc(&tcp_hashinfo))
 		panic("TCP: failed to alloc ehash_locks");
 	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
 					sizeof(struct inet_bind_hashbucket),
 					tcp_hashinfo.ehash_mask + 1,
-					(totalram_pages >= 128 * 1024) ?
-					13 : 15,
+					17, /* one slot per 128 KB of memory */
 					0,
 					&tcp_hashinfo.bhash_size,
 					NULL,
+					0,
 					64 * 1024);
-	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3276,42 +3220,26 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	/* Set the pressure threshold to be a fraction of global memory that
-	 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
-	 * memory, with a floor of 128 pages.
-	 */
-	nr_pages = totalram_pages - totalhigh_pages;
-	limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
-	limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
-	limit = max(limit, 128UL);
-	sysctl_tcp_mem[0] = limit / 4 * 3;
-	sysctl_tcp_mem[1] = limit;
-	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
-
+	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
-	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
-	max_share = min(4UL*1024*1024, limit);
+	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+	max_wshare = min(4UL*1024*1024, limit);
+	max_rshare = min(6UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
 	sysctl_tcp_wmem[1] = 16*1024;
-	sysctl_tcp_wmem[2] = max(64*1024, max_share);
+	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
 	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
 	sysctl_tcp_rmem[1] = 87380;
-	sysctl_tcp_rmem[2] = max(87380, max_share);
+	sysctl_tcp_rmem[2] = max(87380, max_rshare);
+
+	pr_info("Hash tables configured (established %u bind %u)\n",
+		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
-	printk(KERN_INFO "TCP: Hash tables configured "
-	       "(established %u bind %u)\n",
-	       tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+	tcp_metrics_init();
 
 	tcp_register_congestion_control(&tcp_reno);
-	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
-	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
-	tcp_secret_one.expires = jiffy; /* past due */
-	tcp_secret_two.expires = jiffy; /* past due */
-	tcp_secret_generating = &tcp_secret_one;
-	tcp_secret_primary = &tcp_secret_one;
-	tcp_secret_retiring = &tcp_secret_two;
-	tcp_secret_secondary = &tcp_secret_two;
+	tcp_tasklet_init();
 }
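For readers of the MD5-pool hunks above: tcp_alloc_md5sig_pool() now builds the per-cpu pool at most once, publishing it under a mutex after an smp_wmb(), and tcp_get_md5sig_pool() reads the pointer with ACCESS_ONCE() instead of the old refcounted spinlock scheme. The sketch below is a minimal userspace analogue of that publish-once pattern, using C11 release/acquire atomics and a pthread mutex in place of the kernel primitives; the names (pool_build, pool_alloc, pool_get) are illustrative only and are not kernel APIs.

/* Userspace analogue of the lazy, publish-once pool in the hunks above.
 * Assumptions: C11 release/acquire atomics stand in for smp_wmb()/ACCESS_ONCE(),
 * and a pthread mutex stands in for tcp_md5sig_mutex.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct pool { int ready; };

static _Atomic(struct pool *) global_pool;	/* plays the role of tcp_md5sig_pool */
static pthread_mutex_t pool_mutex = PTHREAD_MUTEX_INITIALIZER;

static void pool_build(void)
{
	struct pool *p = malloc(sizeof(*p));

	if (!p)
		return;
	p->ready = 1;
	/* Release store: commit the pool contents before publishing the
	 * pointer, the job smp_wmb() does before "tcp_md5sig_pool = pool".
	 */
	atomic_store_explicit(&global_pool, p, memory_order_release);
}

static bool pool_alloc(void)	/* analogue of tcp_alloc_md5sig_pool() */
{
	if (!atomic_load_explicit(&global_pool, memory_order_acquire)) {
		pthread_mutex_lock(&pool_mutex);
		/* Re-check under the mutex so only one thread builds it. */
		if (!atomic_load_explicit(&global_pool, memory_order_relaxed))
			pool_build();
		pthread_mutex_unlock(&pool_mutex);
	}
	return atomic_load_explicit(&global_pool, memory_order_acquire) != NULL;
}

static struct pool *pool_get(void)	/* analogue of tcp_get_md5sig_pool() */
{
	/* Acquire load pairs with the release store above, the role played
	 * by ACCESS_ONCE() plus the earlier smp_wmb() in the kernel code.
	 */
	return atomic_load_explicit(&global_pool, memory_order_acquire);
}

int main(void)
{
	if (pool_alloc())
		printf("pool ready: %d\n", pool_get()->ready);
	return 0;
}

Once published, the pool is never freed, which is why the reader needs no reference count and no lock on the fast path.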

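For readers of the tcp_init_mem()/tcp_init() hunks above: the pressure thresholds are now derived from nr_free_buffer_pages() / 8 with a floor of 128 pages, and the per-socket caps from nr_free_buffer_pages() << (PAGE_SHIFT - 7), clamped to 4 MB for send and 6 MB for receive. The standalone program below simply reruns that arithmetic for a hypothetical machine; the 4 KB page size and the 2 GiB of free buffer pages are assumptions for illustration, not values taken from the patch.

/* Rerun of the sizing arithmetic from tcp_init_mem()/tcp_init() above.
 * PAGE_SHIFT and free_pages are assumed values for a hypothetical machine
 * (4 KB pages, roughly 2 GiB of free buffer pages).
 */
#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KB pages (assumption) */

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
	unsigned long free_pages = (2UL * 1024 * 1024 * 1024) >> PAGE_SHIFT;
	unsigned long tcp_mem[3], limit, max_wshare, max_rshare;

	/* tcp_init_mem(): thresholds in pages, floored at 128 pages */
	limit = max_ul(free_pages / 8, 128UL);
	tcp_mem[0] = limit / 4 * 3;
	tcp_mem[1] = limit;
	tcp_mem[2] = tcp_mem[0] * 2;

	/* tcp_init(): per-socket caps in bytes, at most 1/128 of memory */
	limit = free_pages << (PAGE_SHIFT - 7);
	max_wshare = min_ul(4UL * 1024 * 1024, limit);
	max_rshare = min_ul(6UL * 1024 * 1024, limit);

	printf("tcp_mem     = %lu %lu %lu (pages)\n",
	       tcp_mem[0], tcp_mem[1], tcp_mem[2]);
	printf("tcp_wmem[2] = %lu bytes\n", max_ul(64 * 1024, max_wshare));
	printf("tcp_rmem[2] = %lu bytes\n", max_ul(87380, max_rshare));
	return 0;
}

For the assumed 2 GiB machine this prints tcp_mem = 49152 65536 98304 pages and caps tcp_wmem[2]/tcp_rmem[2] at 4 MB and 6 MB respectively, matching the clamps in the hunk above.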