Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	136
1 files changed, 95 insertions, 41 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6e5617b9f9d..9d2118e5fbc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -285,12 +285,16 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
 int sysctl_tcp_min_tso_segs __read_mostly = 2;
 
+int sysctl_tcp_autocorking __read_mostly = 1;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
+long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
+EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
@@ -377,13 +381,13 @@ void tcp_init_sock(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	skb_queue_head_init(&tp->out_of_order_queue);
+	__skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
 	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
-	tp->mdev = TCP_TIMEOUT_INIT;
+	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 
 	/* So many TCP implementations out there (incorrectly) count the
 	 * initial SYN frame in their delayed-ACK and congestion control
@@ -617,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 		tp->snd_up = tp->write_seq;
 }
 
-static inline void tcp_push(struct sock *sk, int flags, int mss_now,
-			    int nonagle)
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+				int size_goal)
 {
-	if (tcp_send_head(sk)) {
-		struct tcp_sock *tp = tcp_sk(sk);
+	return skb->len < size_goal &&
+	       sysctl_tcp_autocorking &&
+	       skb != tcp_write_queue_head(sk) &&
+	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+}
+
+static void tcp_push(struct sock *sk, int flags, int mss_now,
+		     int nonagle, int size_goal)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (!tcp_send_head(sk))
+		return;
+
+	skb = tcp_write_queue_tail(sk);
+	if (!(flags & MSG_MORE) || forced_push(tp))
+		tcp_mark_push(tp, skb);
 
-		if (!(flags & MSG_MORE) || forced_push(tp))
-			tcp_mark_push(tp, tcp_write_queue_tail(sk));
+	tcp_mark_urg(tp, flags);
 
-		tcp_mark_urg(tp, flags);
-		__tcp_push_pending_frames(sk, mss_now,
-					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+	if (tcp_should_autocork(sk, skb, size_goal)) {
+
+		/* avoid atomic op if TSQ_THROTTLED bit is already set */
+		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+		}
+		/* It is possible TX completion already happened
+		 * before we set TSQ_THROTTLED.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+			return;
 	}
+
+	if (flags & MSG_MORE)
+		nonagle = TCP_NAGLE_CORK;
+
+	__tcp_push_pending_frames(sk, mss_now, nonagle);
 }
 
 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
@@ -806,12 +849,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 		xmit_size_goal = min_t(u32, gso_size,
 				       sk->sk_gso_max_size - 1 - hlen);
 
-		/* TSQ : try to have at least two segments in flight
-		 * (one in NIC TX ring, another in Qdisc)
-		 */
-		xmit_size_goal = min_t(u32, xmit_size_goal,
-				       sysctl_tcp_limit_output_bytes >> 1);
-
 		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
 		/* We try hard to avoid divides here */
@@ -938,7 +975,8 @@ new_segment:
 wait_for_sndbuf:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+		tcp_push(sk, flags & ~MSG_MORE, mss_now,
+			 TCP_NAGLE_PUSH, size_goal);
 
 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 			goto do_error;
@@ -948,7 +986,7 @@ wait_for_memory:
 
 out:
 	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
-		tcp_push(sk, flags, mss_now, tp->nonagle);
+		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	return copied;
 
 do_error:
@@ -1006,7 +1044,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp)
 	}
 }
 
-static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
+				int *copied, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err, flags;
@@ -1021,11 +1060,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
 	if (unlikely(tp->fastopen_req == NULL))
 		return -ENOBUFS;
 	tp->fastopen_req->data = msg;
+	tp->fastopen_req->size = size;
 
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
 				    msg->msg_namelen, flags);
-	*size = tp->fastopen_req->copied;
+	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
 	return err;
 }
@@ -1045,7 +1085,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	flags = msg->msg_flags;
 	if (flags & MSG_FASTOPEN) {
-		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
 		else if (err)
@@ -1068,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
 			copied = tcp_send_rcvq(sk, msg, size);
-			goto out;
+			goto out_nopush;
 		}
 
 		err = -EINVAL;
@@ -1229,7 +1269,8 @@ wait_for_sndbuf:
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
 			if (copied)
-				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+				tcp_push(sk, flags & ~MSG_MORE, mss_now,
+					 TCP_NAGLE_PUSH, size_goal);
 
 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 				goto do_error;
@@ -1240,7 +1281,8 @@ wait_for_memory:
 
 out:
 	if (copied)
-		tcp_push(sk, flags, mss_now, tp->nonagle);
+		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+out_nopush:
 	release_sock(sk);
 	return copied + copied_syn;
 
@@ -1429,7 +1471,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
 	do {
 		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
 					      last_issued, &done,
-					      &used) == DMA_SUCCESS) {
+					      &used) == DMA_COMPLETE) {
 			/* Safe to free early-copied skbs now */
 			__skb_queue_purge(&sk->sk_async_wait_queue);
 			break;
@@ -1437,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
 			struct sk_buff *skb;
 			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
 			       (dma_async_is_complete(skb->dma_cookie, done,
-						      used) == DMA_SUCCESS)) {
+						      used) == DMA_COMPLETE)) {
 				__skb_dequeue(&sk->sk_async_wait_queue);
 				kfree_skb(skb);
 			}
@@ -1627,11 +1669,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
 		    !sysctl_tcp_low_latency &&
 		    net_dma_find_channel()) {
-			preempt_enable_no_resched();
+			preempt_enable();
 			tp->ucopy.pinned_list =
 					dma_pin_iovec_pages(msg->msg_iov, len);
 		} else {
-			preempt_enable_no_resched();
+			preempt_enable();
 		}
 	}
 #endif
@@ -2190,7 +2232,7 @@ adjudge_to_death:
 	/*	This is a (useful) BSD violating of the RFC. There is a
 	 *	problem with TCP as specified in that the other end could
 	 *	keep a socket open forever with no application left this end.
-	 *	We use a 3 minute timeout (about the same as BSD) then kill
+	 *	We use a 1 minute timeout (about the same as BSD) then kill
 	 *	our end. If they send after that then tough - BUT: long enough
 	 *	that we won't make the old 4*rto = almost no time - whoops
 	 *	reset mistake.
@@ -2300,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	sk->sk_shutdown = 0;
 	sock_reset_flag(sk, SOCK_DONE);
-	tp->srtt = 0;
+	tp->srtt_us = 0;
 	if ((tp->write_seq += tp->max_window + 2) == 0)
 		tp->write_seq = 1;
 	icsk->icsk_backoff = 0;
@@ -2744,8 +2786,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
-	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
-	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+	info->tcpi_rtt = tp->srtt_us >> 3;
+	info->tcpi_rttvar = tp->mdev_us >> 2;
 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
 	info->tcpi_snd_cwnd = tp->snd_cwnd;
 	info->tcpi_advmss = tp->advmss;
@@ -2755,6 +2797,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 	info->tcpi_rcv_space = tp->rcvq_space.space;
 
 	info->tcpi_total_retrans = tp->total_retrans;
+
+	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
+					sk->sk_pacing_rate : ~0ULL;
+	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
+					sk->sk_max_pacing_rate : ~0ULL;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
@@ -2870,6 +2917,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_USER_TIMEOUT:
 		val = jiffies_to_msecs(icsk->icsk_user_timeout);
 		break;
+
+	case TCP_FASTOPEN:
+		if (icsk->icsk_accept_queue.fastopenq != NULL)
+			val = icsk->icsk_accept_queue.fastopenq->max_qlen;
+		else
+			val = 0;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
@@ -3097,13 +3152,13 @@ static int __init set_thash_entries(char *str)
 }
 __setup("thash_entries=", set_thash_entries);
 
-void tcp_init_mem(struct net *net)
+static void tcp_init_mem(void)
 {
 	unsigned long limit = nr_free_buffer_pages() / 8;
 	limit = max(limit, 128UL);
-	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
-	net->ipv4.sysctl_tcp_mem[1] = limit;
-	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+	sysctl_tcp_mem[0] = limit / 4 * 3;
+	sysctl_tcp_mem[1] = limit;
+	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
 }
 
 void __init tcp_init(void)
@@ -3137,10 +3192,9 @@ void __init tcp_init(void)
 					&tcp_hashinfo.ehash_mask,
 					0,
 					thash_entries ? 0 : 512 * 1024);
-	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
 		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
-		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
-	}
+
 	if (inet_ehash_locks_alloc(&tcp_hashinfo))
 		panic("TCP: failed to alloc ehash_locks");
 	tcp_hashinfo.bhash =
@@ -3166,7 +3220,7 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	tcp_init_mem(&init_net);
+	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
 	max_wshare = min(4UL*1024*1024, limit);
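For reference, the tcp_info fields touched in tcp_get_info() above (tcpi_rtt and tcpi_rttvar, now taken directly from srtt_us/mdev_us, plus the newly reported tcpi_pacing_rate and tcpi_max_pacing_rate) can be read from user space with getsockopt(TCP_INFO). The sketch below is illustrative only and not part of the patch; it assumes an already-connected TCP socket descriptor and a uapi linux/tcp.h recent enough to declare the pacing-rate fields.

/* Illustrative sketch -- not part of this patch.  Assumes a connected TCP
 * socket fd and a linux/tcp.h that already carries tcpi_pacing_rate.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <linux/tcp.h>		/* struct tcp_info, TCP_INFO */

static int print_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0) {
		perror("getsockopt(TCP_INFO)");
		return -1;
	}

	/* With this change both RTT values are plain microseconds. */
	printf("srtt=%u us rttvar=%u us\n", info.tcpi_rtt, info.tcpi_rttvar);
	printf("pacing rate=%llu bytes/sec\n",
	       (unsigned long long)info.tcpi_pacing_rate);
	return 0;
}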
