Diffstat (limited to 'net/ipv4/tcp_output.c')
| Mode | Path | Lines changed |
|------------|-----------------------|-----|
| -rw-r--r-- | net/ipv4/tcp_output.c | 892 |

1 file changed, 476 insertions, 416 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d451593ef1..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,8 +65,8 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ -EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -74,19 +74,21 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - /* Don't override Nagle indefinitely with F-RTO */ - if (tp->frto_counter == 2) - tp->frto_counter = 3; - tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || tp->early_retrans_delayed) + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { tcp_rearm_rto(sk); + } + + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, + tcp_skb_pcount(skb)); } /* SND.NXT, if window was not shrunk. @@ -164,6 +166,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -174,8 +177,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && + (!dst || !dst_metric(dst, RTAX_QUICKACK))) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ @@ -185,6 +189,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } + +u32 tcp_default_init_rwnd(u32 mss) +{ + /* Initial receive window should be twice of TCP_INIT_CWND to + * enable proper sending of new unsent data during fast recovery + * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a + * limit when mss is larger than 1460. + */ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > 1460) + init_rwnd = max((1460 * init_rwnd) / mss, 2U); + return init_rwnd; +} + /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -234,22 +253,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to a value enough for senders starting with - * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place - * a limit on the initial window when mss is larger than 1460. 
- */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = TCP_DEFAULT_INIT_RCVWND; - if (mss > 1460) - init_cwnd = - max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - /* when initializing use the value from init_rcv_wnd - * rather than the default from above - */ - if (init_rcv_wnd) - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - else - *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); + if (!init_rcv_wnd) /* Use default unless specified otherwise */ + init_rcv_wnd = tcp_default_init_rwnd(mss); + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); } /* Set the clamp no higher than max representable value */ @@ -265,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); @@ -277,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; @@ -294,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk) new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ - if (new_win == 0) + if (new_win == 0) { tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } return new_win; } @@ -314,7 +331,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } @@ -359,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + shinfo->gso_segs = 1; + shinfo->gso_size = 0; + shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -384,7 +403,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) -#define OPTION_COOKIE_EXTENSION (1 << 4) #define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { @@ -398,42 +416,12 @@ struct tcp_out_options { struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; -/* The sysctl int routines are generic, so check consistency here. - */ -static u8 tcp_cookie_size_check(u8 desired) -{ - int cookie_size; - - if (desired > 0) - /* previously specified */ - return desired; - - cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); - if (cookie_size <= 0) - /* no default specified */ - return 0; - - if (cookie_size <= TCP_COOKIE_MIN) - /* value too small, specify minimum */ - return TCP_COOKIE_MIN; - - if (cookie_size >= TCP_COOKIE_MAX) - /* value too large, specify maximum */ - return TCP_COOKIE_MAX; - - if (cookie_size & 1) - /* 8-bit multiple, illegal, fix it */ - cookie_size++; - - return (u8)cookie_size; -} - /* Write previously computed TCP options to the packet. 
* * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). @@ -446,27 +434,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, { u16 options = opts->options; /* mungable copy */ - /* Having both authentication and cookies for security is redundant, - * and there's certainly not enough room. Instead, the cookie-less - * extension variant is proposed. - * - * Consider the pessimal case with authentication. The options - * could look like: - * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 - */ if (unlikely(OPTION_MD5 & options)) { - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - *ptr++ = htonl((TCPOPT_COOKIE << 24) | - (TCPOLEN_COOKIE_BASE << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } else { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } - options &= ~OPTION_COOKIE_EXTENSION; + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; @@ -495,44 +465,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - /* Specification requires after timestamp, so do it now. - * - * Consider the pessimal case without authentication. The options - * could look like: - * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 - */ - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - __u8 *cookie_copy = opts->hash_location; - u8 cookie_size = opts->hash_size; - - /* 8-bit multiple handled in tcp_cookie_size_check() above, - * and elsewhere. - */ - if (0x2 & cookie_size) { - __u8 *p = (__u8 *)ptr; - - /* 16-bit multiple */ - *p++ = TCPOPT_COOKIE; - *p++ = TCPOLEN_COOKIE_BASE + cookie_size; - *p++ = *cookie_copy++; - *p++ = *cookie_copy++; - ptr++; - cookie_size -= 2; - } else { - /* 32-bit multiple */ - *ptr++ = htonl(((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_COOKIE << 8) | - TCPOLEN_COOKIE_BASE) + - cookie_size); - } - - if (cookie_size > 0) { - memcpy(ptr, cookie_copy, cookie_size); - ptr += (cookie_size / 4); - } - } - if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -591,11 +523,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 
- tcp_cookie_size_check(cvp->cookie_desired) : - 0; struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG @@ -622,7 +550,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,52 +575,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, tp->syn_fastopen = 1; } } - /* Note that timestamps are required by the specification. - * - * Odd numbers of bytes are prohibited by the specification, ensuring - * that the cookie is 16-bit aligned, and the resulting cookie pair is - * 32-bit aligned. - */ - if (*md5 == NULL && - (OPTION_TS & opts->options) && - cookie_size > 0) { - int need = TCPOLEN_COOKIE_BASE + cookie_size; - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - - if (need > remaining) { - /* try shrinking cookie to fit */ - cookie_size -= 2; - need -= 4; - } - } - while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { - cookie_size -= 4; - need -= 4; - } - if (TCP_COOKIE_MIN <= cookie_size) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; - opts->hash_size = cookie_size; - - /* Remember for future incarnations. */ - cvp->cookie_desired = cookie_size; - - if (cvp->cookie_desired != cvp->cookie_pair_size) { - /* Currently use random bytes as a nonce, - * assuming these are completely unpredictable - * by hostile users of the same system. - */ - get_random_bytes(&cvp->cookie_pair[0], - cookie_size); - cvp->cookie_pair_size = cookie_size; - } - remaining -= need; - } - } return MAX_TCP_OPTION_SPACE - remaining; } @@ -702,14 +585,10 @@ static unsigned int tcp_synack_options(struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5, - struct tcp_extend_values *xvp, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? - xvp->cookie_plus : - 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); @@ -748,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - if (foc != NULL) { + if (foc != NULL && foc->len >= 0) { u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { @@ -757,28 +636,7 @@ static unsigned int tcp_synack_options(struct sock *sk, remaining -= need; } } - /* Similar rationale to tcp_syn_options() applies here, too. - * If the <SYN> options fit, the same options should fit now! - */ - if (*md5 == NULL && - ireq->tstamp_ok && - cookie_plus > TCPOLEN_COOKIE_BASE) { - int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - } - if (need <= remaining) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; - remaining -= need; - } else { - /* There's no error return, so flag it. 
*/ - xvp->cookie_out_never = 1; /* true */ - opts->hash_size = 0; - } - } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -794,6 +652,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb unsigned int size = 0; unsigned int eff_sacks; + opts->options = 0; + #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (unlikely(*md5)) { @@ -806,7 +666,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when : 0; + opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -834,7 +694,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. - * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. @@ -851,12 +711,13 @@ static void tcp_tsq_handler(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); } /* - * One tasklest per cpu tries to send more skbs. + * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when - * transfering tsq->head because tcp_wfree() might + * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(unsigned long data) @@ -919,6 +780,17 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); + /* Here begins the tricky part : + * We are called from release_sock() with : + * 1) BH disabled + * 2) sk_lock.slock spinlock held + * 3) socket owned by us (sk->sk_lock.owned == 1) + * + * But following code is meant to be called from BH handlers, + * so we should keep BH disabled, but early release socket ownership + */ + sock_release_ownership(sk); + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); __sock_put(sk); @@ -950,10 +822,10 @@ void __init tcp_tasklet_init(void) /* * Write buffer destructor automatically called from kfree_skb. - * We cant xmit new skbs from this context, as we might already + * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ -static void tcp_wfree(struct sk_buff *skb) +void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); @@ -1005,19 +877,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. 
- */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); + if (clone_it) { + skb_mstamp_get(&skb->skb_mstamp); - if (likely(clone_it)) { if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -1032,19 +902,20 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) { + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - skb->ooo_okay = 1; - } else - skb->ooo_okay = 0; + + /* if no packet is in qdisc/device queue, then allow XPS to select + * another queue. + */ + skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; - skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? - tcp_wfree : sock_wfree; + skb->destructor = tcp_wfree; atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ @@ -1103,7 +974,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -1133,18 +1004,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || !sk_can_gso(sk) || - skb->ip_summed == CHECKSUM_NONE) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Make sure we own this skb before messing gso_size/gso_segs */ + WARN_ON_ONCE(skb_cloned(skb)); + + if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + shinfo->gso_segs = 1; + shinfo->gso_size = 0; + shinfo->gso_type = 0; } else { - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); - skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type = sk->sk_gso_type; + shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + shinfo->gso_size = mss_now; + shinfo->gso_type = sk->sk_gso_type; } } @@ -1199,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -1214,13 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, nsize, gfp); if (buff == NULL) return -ENOMEM; /* We'll just try again later. 
*/ @@ -1293,35 +1166,36 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, */ static void __pskb_trim_head(struct sk_buff *skb, int len) { + struct skb_shared_info *shinfo; int i, k, eat; eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; } eat = len; k = 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + shinfo = skb_shinfo(skb); + for (i = 0; i < shinfo->nr_frags; i++) { + int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + shinfo->frags[k] = shinfo->frags[i]; if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + shinfo->frags[k].page_offset += eat; + skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } - skb_shinfo(skb)->nr_frags = k; + shinfo->nr_frags = k; skb_reset_tail_pointer(skb); skb->data_len -= len; @@ -1331,7 +1205,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; __pskb_trim_head(skb, len); @@ -1351,8 +1225,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1381,13 +1255,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - return mss_now; } +/* Calculate MSS. Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { @@ -1502,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out >= tp->snd_cwnd) { + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. 
*/ + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } + + if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1522,23 +1431,51 @@ static void tcp_cwnd_validate(struct sock *sk) } } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int max_segs) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) +{ + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. 
+ */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, + int nonagle) +{ + return partial && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, max_len; + u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; @@ -1551,7 +1488,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b if (max_len <= needed) return max_len; - return needed - needed % mss_now; + partial = needed % mss_now; + /* If last segment is not a full MSS, check if Nagle rules allow us + * to include this last segment in this skb. + * Otherwise, we'll split the skb at last MSS boundary + */ + if (tcp_nagle_check(partial != 0, tp, nonagle)) + return needed - partial; + + return needed; } /* Can at least one segment of SKB be sent right now, according to the @@ -1591,28 +1536,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, return tso_segs; } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml, tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int mss_now, int nonagle) -{ - return skb->len < mss_now && - ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} /* Return true if the Nagle test allows this packet to be * sent now. @@ -1629,14 +1552,11 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (nonagle & TCP_NAGLE_PUSH) return true; - /* Don't use the nagle rule for urgent data (or for the final FIN). - * Nagle can be ignored during F-RTO too (see RFC4138). - */ - if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; @@ -1705,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now); + return tcp_fragment(sk, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) @@ -1748,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. 
*/ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1779,7 +1700,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) /* If a full-sized TSO skb can be sent, do it. */ if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - sk->sk_gso_max_segs * tp->mss_cache)) + tp->xmit_size_goal_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1806,8 +1727,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; } - /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies << 1); + /* Ok, it looks like it is advisable to defer. + * Do not rearm the timer if already set to not break TCP ACK clocking. + */ + if (!tp->tso_deferred) + tp->tso_deferred = 1 | (jiffies << 1); + + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; return true; @@ -1955,6 +1882,9 @@ static int tcp_mtu_probe(struct sock *sk) * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. + * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ @@ -1966,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1982,7 +1913,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -1990,8 +1920,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, goto repair; /* Skip network transmission */ cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) - break; + if (!cwnd_quota) { + is_cwnd_limited = true; + if (push_one == 2) + /* Force out a loss probe pkt. */ + cwnd_quota = 1; + else + break; + } if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; @@ -2002,23 +1938,42 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) break; } - /* TSQ : sk_wmem_alloc accounts skb truesize, - * including skb overhead. But thats OK. + /* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates + * Alas, some drivers / subsystems require a fair amount + * of queued bytes to ensure line rate. + * One example is wifi aggregation (802.11 AMPDU) */ - if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { + limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, + sk->sk_pacing_rate >> 10); + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); - break; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. 
+ */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + break; } + limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, - sk->sk_gso_max_segs)); + sk->sk_gso_max_segs), + nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -2045,10 +2000,149 @@ repair: if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; - tcp_cwnd_validate(sk); + + /* Send one loss probe per tail loss episode. */ + if (push_one != 2) + tcp_schedule_loss_probe(sk); + tcp_cwnd_validate(sk, is_cwnd_limited); + return false; + } + return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout, tlp_time_stamp, rto_time_stamp; + u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) + return false; + /* No consecutive loss probes. */ + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { + tcp_rearm_rto(sk); + return false; + } + /* Don't do any loss probe on a Fast Open connection before 3WHS + * finishes. + */ + if (sk->sk_state == TCP_SYN_RECV) + return false; + + /* TLP is only scheduled when next timer event is RTO. */ + if (icsk->icsk_pending != ICSK_TIME_RETRANS) + return false; + + /* Schedule a loss probe in 2*RTT for SACK capable connections + * in Open state, that are either limited by cwnd or application. + */ + if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + return false; + + if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && + tcp_send_head(sk)) return false; + + /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + * for delayed ack when there's one outstanding packet. + */ + timeout = rtt << 1; + if (tp->packets_out == 1) + timeout = max_t(u32, timeout, + (rtt + (rtt >> 1) + TCP_DELACK_MAX)); + timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + + /* If RTO is shorter, just schedule TLP in its place. */ + tlp_time_stamp = tcp_time_stamp + timeout; + rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; + if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { + s32 delta = rto_time_stamp - tcp_time_stamp; + if (delta > 0) + timeout = delta; + } + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, + TCP_RTO_MAX); + return true; +} + +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; } - return !tp->packets_out && tcp_send_head(sk); + return false; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. 
+ */ +void tcp_send_loss_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int pcount; + int mss = tcp_current_mss(sk); + int err = -1; + + if (tcp_send_head(sk) != NULL) { + err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + goto rearm_timer; + } + + /* At most one outstanding TLP retransmission. */ + if (tp->tlp_high_seq) + goto rearm_timer; + + /* Retransmit last segment. */ + skb = tcp_write_queue_tail(sk); + if (WARN_ON(!skb)) + goto rearm_timer; + + if (skb_still_in_host_queue(sk, skb)) + goto rearm_timer; + + pcount = tcp_skb_pcount(skb); + if (WARN_ON(!pcount)) + goto rearm_timer; + + if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + GFP_ATOMIC))) + goto rearm_timer; + skb = tcp_write_queue_tail(sk); + } + + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + + err = __tcp_retransmit_skb(sk, skb); + + /* Record snd_nxt for loss detection. */ + if (likely(!err)) + tp->tlp_high_seq = tp->snd_nxt; + +rearm_timer: + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); + + if (likely(!err)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBES); } /* Push out any pending frames which were held back due to @@ -2146,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk) */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); - int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int allowed_space = tcp_full_space(sk); + int full_space = min_t(int, tp->window_clamp, allowed_space); int window; if (mss > full_space) @@ -2159,7 +2254,19 @@ u32 __tcp_select_window(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); - if (free_space < mss) + /* free_space might become our new window, make sure we don't + * increase it due to wscale. + */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + /* if free space is less than mss estimate, or is below 1/16th + * of the maximum allowed, try to move to zero-window, else + * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and + * new incoming data is dropped due to memory limits. + * With large window, mss test triggers way too late in order + * to announce zero window in time before rmem limit kicks in. + */ + if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } @@ -2314,6 +2421,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss; + int err; /* Inconslusive MTU probe */ if (icsk->icsk_mtup.probe_size) { @@ -2327,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; + if (skb_still_in_host_queue(sk, skb)) + return -EBUSY; + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); @@ -2349,12 +2460,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -EAGAIN; if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss)) + if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. 
*/ } else { int oldpcount = tcp_skb_pcount(skb); if (unlikely(oldpcount > 1)) { + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; tcp_init_tso_segs(sk, skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } @@ -2362,35 +2475,34 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); - /* Some Solaris stacks overoptimize and ignore the FIN on a - * retransmit when old data is attached. So strip it off - * since it is cheap to do so and saves bytes on the network. - */ - if (skb->len > 0 && - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { - if (!pskb_trim(skb, 0)) { - /* Reuse, even though it does some unnecessary work */ - tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, - TCP_SKB_CB(skb)->tcp_flags); - skb->ip_summed = CHECKSUM_NONE; - } - } - /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - /* make sure skb->data is aligned on arches that require it */ - if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { + /* make sure skb->data is aligned on arches that require it + * and check if ack-trimming & collapsing extended the headroom + * beyond what csum_start can cover. + */ + if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || + skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; } else { - return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + + if (likely(!err)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + /* Update global TCP statistics. */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans++; + } + return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) @@ -2399,11 +2511,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err = __tcp_retransmit_skb(sk, skb); if (err == 0) { - /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - - tp->total_retrans++; - #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); @@ -2418,13 +2525,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans += tcp_skb_pcount(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). 
*/ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } else if (err != -EBUSY) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } + + if (tp->undo_retrans < 0) + tp->undo_retrans = 0; + tp->undo_retrans += tcp_skb_pcount(skb); return err; } @@ -2545,10 +2656,9 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + if (tcp_retransmit_skb(sk, skb)) return; - } + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (tcp_in_cwnd_reduction(sk)) @@ -2669,32 +2779,24 @@ int tcp_send_synack(struct sock *sk) * sk: listener socket * dst: dst entry attached to the SYNACK * req: request_sock pointer - * rvp: request_values pointer * * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, struct tcp_fastopen_cookie *foc) { struct tcp_out_options opts; - struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - const struct tcp_cookie_values *cvp = tp->cookie_values; struct tcphdr *th; struct sk_buff *skb; struct tcp_md5sig_key *md5; int tcp_header_size; int mss; - int s_data_desired = 0; - if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) - s_data_desired = cvp->s_data_desired; - skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, - sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2703,32 +2805,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_reserve(skb, MAX_TCP_HEADER); skb_dst_set(skb, dst); + security_skb_owned_by(skb, sk); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - - /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); - - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - mss - (ireq->tstamp_ok ? 
TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; - } - memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) @@ -2736,9 +2818,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5, xvp, foc) - + sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2748,48 +2829,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = ireq->loc_port; - th->dest = ireq->rmt_port; + th->source = htons(ireq->ir_num); + th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is * not even correctly set) */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPHDR_SYN | TCPHDR_ACK); - if (OPTION_COOKIE_EXTENSION & opts.options) { - if (s_data_desired) { - u8 *buf = skb_put(skb, s_data_desired); - - /* copy data directly from the listening socket. */ - memcpy(buf, cvp->s_data_payload, s_data_desired); - TCP_SKB_CB(skb)->end_seq += s_data_desired; - } - - if (opts.hash_size > 0) { - __u32 workspace[SHA_WORKSPACE_WORDS]; - u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; - u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; - - /* Secret recipe depends on the Timestamp, (future) - * Sequence and Acknowledgment Numbers, Initiator - * Cookie, and others handled by IP variant caller. - */ - *tail-- ^= opts.tsval; - *tail-- ^= tcp_rsk(req)->rcv_isn + 1; - *tail-- ^= TCP_SKB_CB(skb)->seq + 1; - - /* recommended */ - *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); - *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ - - sha_transform((__u32 *)&xvp->cookie_bakery[0], - (char *)mess, - &workspace[0]); - opts.hash_location = - (__u8 *)&xvp->cookie_bakery[0]; - } - } - th->seq = htonl(TCP_SKB_CB(skb)->seq); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); @@ -2798,7 +2845,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2813,7 +2860,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. 
*/ -void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2872,6 +2919,8 @@ void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; + else + tp->rcv_tstamp = tcp_time_stamp; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; @@ -2930,10 +2979,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) */ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; - space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; - syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + space = min_t(size_t, space, fo->size); + + /* limit to order-0 allocations */ + space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + + syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, sk->sk_allocation); if (syn_data == NULL) goto fallback; @@ -2963,9 +3017,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) tcp_connect_queue_skb(sk, data); fo->copied = data->len; + /* syn_data is about to be sent, we need to take current time stamps + * for the packets that are in write queue : SYN packet and DATA + */ + skb_mstamp_get(&syn->skb_mstamp); + data->skb_mstamp = syn->skb_mstamp; + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { tp->syn_data = (fo->copied > 0); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } syn_data = NULL; @@ -3053,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk) * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ - if (tp->srtt) { - int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); + if (tp->srtt_us) { + int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), + TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -3152,7 +3213,6 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq; tcp_xmit_probe_skb(sk, 0); } } @@ -3183,7 +3243,7 @@ int tcp_write_wakeup(struct sock *sk) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) + if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); |
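
A note on the TCP Small Queues change in tcp_write_xmit() above: the new per-socket limit is `max_t(unsigned int, sysctl_tcp_limit_output_bytes, sk->sk_pacing_rate >> 10)`, and since sk_pacing_rate is in bytes per second, the shift by 10 approximates one millisecond of data at the current pacing rate. Below is a minimal user-space sketch of that arithmetic only; the function and variable names are illustrative stand-ins rather than kernel symbols, and the 128 KB sysctl value is assumed for the example.

```c
/*
 * User-space sketch (not kernel code) of the per-socket TCP Small Queues
 * limit this patch computes in tcp_write_xmit():
 *
 *	limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
 *		      sk->sk_pacing_rate >> 10);
 *
 * sk_pacing_rate is in bytes per second, so "rate >> 10" is roughly the
 * number of bytes the socket sends in one millisecond.
 */
#include <stdio.h>

static unsigned long tsq_limit(unsigned long limit_output_bytes,
			       unsigned long pacing_rate_bps)
{
	unsigned long one_ms_of_data = pacing_rate_bps >> 10;

	return one_ms_of_data > limit_output_bytes ?
	       one_ms_of_data : limit_output_bytes;
}

int main(void)
{
	/* Assume a 128 KB sysctl_tcp_limit_output_bytes for illustration. */
	unsigned long sysctl_limit = 128 * 1024;
	unsigned long rates_bps[] = {
		125000UL,	/* ~1 Mbit/s   */
		12500000UL,	/* ~100 Mbit/s */
		1250000000UL,	/* ~10 Gbit/s  */
	};

	for (unsigned int i = 0; i < 3; i++)
		printf("pacing %10lu B/s -> queue limit %lu bytes\n",
		       rates_bps[i], tsq_limit(sysctl_limit, rates_bps[i]));
	return 0;
}
```

Only at high pacing rates (roughly 10 Gbit/s with the assumed sysctl) does the rate-derived term exceed the sysctl floor; at lower rates the limit stays at sysctl_tcp_limit_output_bytes.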

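Similarly, the probe timeout chosen by the new tcp_schedule_loss_probe() is 2*SRTT, bumped to at least 1.5*SRTT plus the maximum delayed-ACK time when only one packet is outstanding, floored at 10 ms, and never scheduled later than the pending RTO. The sketch below redoes that calculation in milliseconds rather than jiffies; names are illustrative stand-ins, not kernel symbols.

```c
/*
 * User-space sketch (not kernel code) of the probe timeout (PTO) picked by
 * the new tcp_schedule_loss_probe(). The kernel works in jiffies and takes
 * the RTT from srtt; this sketch works in milliseconds. TCP_DELACK_MAX is
 * 200 ms in the mainline tree.
 */
#include <stdio.h>

#define DELACK_MAX_MS	200U

static unsigned int tlp_timeout_ms(unsigned int srtt_ms,
				   unsigned int packets_out,
				   unsigned int time_to_rto_ms)
{
	unsigned int timeout = 2 * srtt_ms;

	/* With a single packet in flight, leave room for a delayed ACK. */
	if (packets_out == 1) {
		unsigned int min_to = srtt_ms + srtt_ms / 2 + DELACK_MAX_MS;

		if (timeout < min_to)
			timeout = min_to;
	}
	if (timeout < 10)
		timeout = 10;
	/* If the RTO would fire first, let the probe take its slot. */
	if (time_to_rto_ms && timeout > time_to_rto_ms)
		timeout = time_to_rto_ms;
	return timeout;
}

int main(void)
{
	printf("srtt 40 ms, 1 pkt out, RTO in 250 ms -> PTO %u ms\n",
	       tlp_timeout_ms(40, 1, 250));
	printf("srtt 40 ms, 8 pkts out, RTO in 250 ms -> PTO %u ms\n",
	       tlp_timeout_ms(40, 8, 250));
	printf("srtt 2 ms, 8 pkts out, RTO in 200 ms -> PTO %u ms\n",
	       tlp_timeout_ms(2, 8, 200));
	return 0;
}
```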