diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
| -rw-r--r-- | net/ipv4/tcp_input.c | 1055 |
1 files changed, 532 insertions, 523 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c6225780bd..40639c288dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ -static void tcp_fixup_sndbuf(struct sock *sk) +static void tcp_sndbuf_expand(struct sock *sk) { - int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); + const struct tcp_sock *tp = tcp_sk(sk); + int sndmem, per_mss; + u32 nr_segs; + + /* Worst case is non GSO/TSO : each frame consumes one skb + * and skb->head is kmalloced using power of two area of memory + */ + per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + + MAX_TCP_HEADER + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + per_mss = roundup_pow_of_two(per_mss) + + SKB_DATA_ALIGN(sizeof(struct sk_buff)); + + nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); + nr_segs = max_t(u32, nr_segs, tp->reordering + 1); + + /* Fast Recovery (RFC 5681 3.2) : + * Cubic needs 1.7 factor, rounded to 2 to include + * extra cushion (application might react slowly to POLLOUT) + */ + sndmem = 2 * nr_segs * per_mss; - sndmem *= TCP_INIT_CWND; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); } @@ -347,24 +367,19 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) } /* 3. Tuning rcvbuf, when connection enters established state. */ - static void tcp_fixup_rcvbuf(struct sock *sk) { u32 mss = tcp_sk(sk)->advmss; - u32 icwnd = TCP_DEFAULT_INIT_RCVWND; int rcvmem; - /* Limit to 10 segments if mss <= 1460, - * or 14600/mss segments, with a minimum of two segments. - */ - if (mss > 1460) - icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - - rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < mss) - rcvmem += 128; + rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * + tcp_default_init_rwnd(mss); - rcvmem *= icwnd; + /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency + * Allow enough cushion so that sender is not limited by our window + */ + if (sysctl_tcp_moderate_rcvbuf) + rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); @@ -381,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) - tcp_fixup_sndbuf(sk); + tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; + tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -523,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time; - int space; - - if (tp->rcvq_space.time == 0) - goto new_measure; + int copied; time = tcp_time_stamp - tp->rcvq_space.time; if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; - space = 2 * (tp->copied_seq - tp->rcvq_space.seq); + /* Number of bytes copied to user in last RTT */ + copied = tp->copied_seq - tp->rcvq_space.seq; + if (copied <= tp->rcvq_space.space) + goto new_measure; - space = max(tp->rcvq_space.space, space); + /* A bit of theory : + * copied = bytes received in previous RTT, our base window + * To cope with packet losses, we need a 2x factor + * To cope with slow start, and sender growing its cwin by 100 % + * every RTT, we need a 4x factor, because the ACK we are sending + * now is for the next RTT, not the current one : + * <prev RTT . ><current RTT .. ><next RTT .... > + */ - if (tp->rcvq_space.space != space) { - int rcvmem; + if (sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvwin, rcvmem, rcvbuf; - tp->rcvq_space.space = space; + /* minimal window to cope with packet losses, assuming + * steady state. Add some cushion because of small variations. + */ + rcvwin = (copied << 1) + 16 * tp->advmss; - if (sysctl_tcp_moderate_rcvbuf && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int new_clamp = space; + /* If rate increased by 25%, + * assume slow start, rcvwin = 3 * copied + * If rate increased by 50%, + * assume sender can use 2x growth, rcvwin = 4 * copied + */ + if (copied >= + tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { + if (copied >= + tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) + rcvwin <<= 1; + else + rcvwin += (rcvwin >> 1); + } - /* Receive space grows, normalize in order to - * take into account packet headers and sk_buff - * structure overhead. - */ - space /= tp->advmss; - if (!space) - space = 1; - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) - rcvmem += 128; - space *= rcvmem; - space = min(space, sysctl_tcp_rmem[2]); - if (space > sk->sk_rcvbuf) { - sk->sk_rcvbuf = space; - - /* Make the window clamp follow along. */ - tp->window_clamp = new_clamp; - } + rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + + rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + if (rcvbuf > sk->sk_rcvbuf) { + sk->sk_rcvbuf = rcvbuf; + + /* Make the window clamp follow along. */ + tp->window_clamp = rcvwin; } } + tp->rcvq_space.space = copied; new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -636,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); - long m = mrtt; /* RTT */ + long m = mrtt_us; /* RTT */ + u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -657,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -676,33 +706,62 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (m > 0) m >>= 3; } else { - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ } - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev > tp->mdev_max) { - tp->mdev_max = tp->mdev; - if (tp->mdev_max > tp->rttvar) - tp->rttvar = tp->mdev_max; + tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev_us > tp->mdev_max_us) { + tp->mdev_max_us = tp->mdev_us; + if (tp->mdev_max_us > tp->rttvar_us) + tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + if (tp->mdev_max_us < tp->rttvar_us) + tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = tcp_rto_min(sk); + tp->mdev_max_us = tcp_rto_min_us(sk); } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev = m << 1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ + tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); + tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; } + tp->srtt_us = max(1U, srtt); +} + +/* Set the sk_pacing_rate to allow proper sizing of TSO packets. + * Note: TCP stack does not yet implement pacing. + * FQ packet scheduler can be used to implement cheap but effective + * TCP pacing, to smooth the burst on large writes when packets + * in flight is significantly lower than cwnd (or rwin) + */ +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (likely(tp->srtt_us)) + do_div(rate, tp->srtt_us); + + /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate + * without any lock. We want to make sure compiler wont store + * intermediate values in this location. + */ + ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, + sk->sk_max_pacing_rate); } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1047,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1056,9 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; - int flag; + int reord; + int fack_count; + long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + int flag; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1102,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; } - err = tcp_fragment(sk, skb, pkt_len, mss); + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1119,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - bool dup_sack, int pcount) + int dup_sack, int pcount, + const struct skb_mstamp *xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1159,6 +1220,14 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; + /* Pick the earliest sequence sacked for RTT */ + if (state->rtt_us < 0) { + struct skb_mstamp now; + + skb_mstamp_get(&now); + state->rtt_us = skb_mstamp_us_delta(&now, + xmit_time); + } } if (sacked & TCPCB_LOST) { @@ -1216,7 +1285,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount); + start_seq, end_seq, dup_sack, pcount, + &skb->skb_mstamp); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1257,14 +1327,15 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->scoreboard_skb_hint) - tp->scoreboard_skb_hint = prev; if (skb == tp->lost_skb_hint) { tp->lost_skb_hint = prev; tp->lost_cnt_hint -= tcp_skb_pcount(prev); } - TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; + TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + TCP_SKB_CB(prev)->end_seq++; + if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); @@ -1492,7 +1563,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, dup_sack, - tcp_skb_pcount(skb)); + tcp_skb_pcount(skb), + &skb->skb_mstamp); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1549,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una) + u32 prior_snd_una, long *sack_rtt_us) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1567,6 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; + state.rtt_us = -1L; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1750,6 +1823,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif + *sack_rtt_us = state.rtt_us; return state.flag; } @@ -1819,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1870,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -1882,8 +1957,13 @@ void tcp_enter_loss(struct sock *sk, int how) } tcp_verify_left_out(tp); - tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + /* Timeout in disordered state after receiving substantial DUPACKs + * suggests that the degree of reordering is over-estimated. + */ + if (icsk->icsk_ca_state <= TCP_CA_Disorder && + tp->sacked_out >= sysctl_tcp_reordering) + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); @@ -1954,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * available, or RTO is scheduled to fire first. */ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt) + (flag & FLAG_ECE) || !tp->srtt_us) return false; - delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + delay = max(usecs_to_jiffies(tp->srtt_us >> 5), + msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; @@ -1966,20 +2048,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) return true; } -static inline int tcp_skb_timedout(const struct sock *sk, - const struct sk_buff *skb) -{ - return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; -} - -static inline int tcp_head_timedout(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->packets_out && - tcp_skb_timedout(sk, tcp_write_queue_head(sk)); -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2086,12 +2154,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#3 : when we use RFC2988 timer restart, fast - * retransmit can be triggered by timeout of queue head. - */ - if (tcp_is_fack(tp) && tcp_head_timedout(sk)) - return true; - /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? */ @@ -2128,44 +2190,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) return false; } -/* New heuristics: it is possible only after we switched to restart timer - * each time when something is ACKed. Hence, we can detect timed out packets - * during fast retransmit without falling to slow start. - * - * Usefulness of this as is very questionable, since we should know which of - * the segments is the next to timeout which is relatively expensive to find - * in general case unless we add some data structure just for that. The - * current approach certainly won't find the right one too often and when it - * finally does find _something_ it usually marks large part of the window - * right away (because a retransmission with a larger timestamp blocks the - * loop from advancing). -ij - */ -static void tcp_timeout_skbs(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) - return; - - skb = tp->scoreboard_skb_hint; - if (tp->scoreboard_skb_hint == NULL) - skb = tcp_write_queue_head(sk); - - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (!tcp_skb_timedout(sk, skb)) - break; - - tcp_skb_mark_lost(tp, skb); - } - - tp->scoreboard_skb_hint = skb; - - tcp_verify_left_out(tp); -} - /* Detect loss in event "A" above by marking head of queue up as lost. * For FACK or non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it @@ -2217,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = skb_shinfo(skb)->gso_size; - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, + mss, GFP_ATOMIC); if (err < 0) break; cnt = packets; @@ -2251,8 +2276,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) else if (fast_rexmit) tcp_mark_head_lost(sk, 1, 1); } - - tcp_timeout_skbs(sk); } /* CWND moderation, preventing bursts due to too big ACKs @@ -2307,10 +2330,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) +static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); + if (unmark_loss) { + struct sk_buff *skb; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + tp->lost_out = 0; + tcp_clear_all_retrans_hints(tp); + } + if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2319,7 +2354,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { + if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } @@ -2327,6 +2362,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; + tp->undo_marker = 0; } static inline bool tcp_may_undo(const struct tcp_sock *tp) @@ -2346,14 +2382,13 @@ static bool tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, true); + tcp_undo_cwnd_reduction(sk, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS_BH(sock_net(sk), mib_idx); - tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2367,16 +2402,17 @@ static bool tcp_try_undo_recovery(struct sock *sk) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk) +static bool tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, true); - tp->undo_marker = 0; + tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + return true; } + return false; } /* We can clear retrans_stamp when there are no retransmissions in the @@ -2408,60 +2444,20 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } -/* Undo during fast recovery after partial ACK. */ - -static int tcp_try_undo_partial(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); - - if (tcp_may_undo(tp)) { - /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. - */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; - - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - - DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, false); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); - - /* So... Do not make Hoe's retransmit yet. - * If the first packet was delayed, the rest - * ones are most probably delayed as well. - */ - failed = 0; - } - return failed; -} - /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); if (frto_undo || tcp_may_undo(tp)) { - struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - } - - tcp_clear_all_retrans_hints(tp); + tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); - tp->lost_out = 0; - tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - tp->undo_marker = 0; if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return true; @@ -2494,12 +2490,14 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) TCP_ECN_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, +static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); + int newly_acked_sacked = prior_unsacked - + (tp->packets_out - tp->sacked_out); tp->prr_delivered += newly_acked_sacked; if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { @@ -2556,7 +2554,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) +static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2570,10 +2568,8 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) - tcp_moderate_cwnd(tp); } else { - tcp_cwnd_reduction(sk, newly_acked_sacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0); } } @@ -2669,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tp->retrans_out ? : -1; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) @@ -2689,13 +2685,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) bool recovered = !before(tp->snd_una, tp->high_seq); if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - if (flag & FLAG_ORIG_SACK_ACKED) { - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - tcp_try_undo_loss(sk, true); + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) return; - } + if (after(tp->snd_nxt, tp->high_seq) && (flag & FLAG_DATA_SACKED || is_dupack)) { tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ @@ -2731,6 +2726,40 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tcp_xmit_retransmit_queue(sk); } +/* Undo during fast recovery after partial ACK. */ +static bool tcp_try_undo_partial(struct sock *sk, const int acked, + const int prior_unsacked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->undo_marker && tcp_packet_delayed(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + + /* We are getting evidence that the reordering degree is higher + * than we realized. If there are no retransmits out then we + * can undo. Otherwise we clock out new packets but do not + * mark more packets lost or retransmit more. + */ + if (tp->retrans_out) { + tcp_cwnd_reduction(sk, prior_unsacked, 0); + return true; + } + + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + + DBGUNDO(sk, "partial recovery"); + tcp_undo_cwnd_reduction(sk, true); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + tcp_try_keep_open(sk); + return true; + } + return false; +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2742,15 +2771,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int prior_sacked, int prior_packets, +static void tcp_fastretrans_alert(struct sock *sk, const int acked, + const int prior_unsacked, bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int newly_acked_sacked = 0; int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -2802,10 +2830,17 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else - do_lost = tcp_try_undo_partial(sk, pkts_acked); - newly_acked_sacked = prior_packets - tp->packets_out + - tp->sacked_out - prior_sacked; + } else { + if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + return; + /* Partial ACK arrived. Force fast retransmit. */ + do_lost = tcp_is_reno(tp) || + tcp_fackets_out(tp) > tp->reordering; + } + if (tcp_try_undo_dsack(sk)) { + tcp_try_keep_open(sk); + return; + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); @@ -2819,14 +2854,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } - newly_acked_sacked = prior_packets - tp->packets_out + - tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, newly_acked_sacked); + tcp_try_to_open(sk, flag, prior_unsacked); return; } @@ -2846,77 +2879,70 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, fast_rexmit = 1; } - if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); tcp_xmit_retransmit_queue(sk); } -void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, + long seq_rtt_us, long sack_rtt_us) { - tcp_rtt_estimator(sk, seq_rtt); - tcp_set_rto(sk); - inet_csk(sk)->icsk_backoff = 0; -} -EXPORT_SYMBOL(tcp_valid_rtt_meas); + const struct tcp_sock *tp = tcp_sk(sk); + + /* Prefer RTT measured from ACK's timing to TS-ECR. This is because + * broken middle-boxes or peers may corrupt TS-ECR fields. But + * Karn's algorithm forbids taking RTT if some retransmitted data + * is acked (RFC6298). + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + seq_rtt_us = -1L; + + if (seq_rtt_us < 0) + seq_rtt_us = sack_rtt_us; -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Supersedes RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, int flag) -{ /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. - * * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> - * - * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overestimated rto. With timestamps - * samples are accepted even from very old segments: f.e., when rtt=1 - * increases to 8, we retransmit 5 times and after 8 seconds delayed - * answer arrives rto becomes 120 seconds! If at least one of segments - * in window is lost... Voila. --ANK (010210) */ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); -} + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + flag & FLAG_ACKED) + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) -{ - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ + if (seq_rtt_us < 0) + return false; - if (flag & FLAG_RETRANS_DATA_ACKED) - return; + tcp_rtt_estimator(sk, seq_rtt_us); + tcp_set_rto(sk); - tcp_valid_rtt_meas(sk, seq_rtt); + /* RFC6298: only reset backoff on valid RTT measurement. */ + inet_csk(sk)->icsk_backoff = 0; + return true; } -static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt) +/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { - const struct tcp_sock *tp = tcp_sk(sk); - /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, flag); - else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, flag); + struct tcp_sock *tp = tcp_sk(sk); + long seq_rtt_us = -1L; + + if (synack_stamp && !tp->total_retrans) + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + + /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets + * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() + */ + if (!tp->srtt_us) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -2999,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una) + u32 prior_snd_una, long sack_rtt_us) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); + struct skb_mstamp first_ackt, last_ackt, now; + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; + bool fully_acked = true; + long ca_seq_rtt_us = -1L; + long seq_rtt_us = -1L; struct sk_buff *skb; - u32 now = tcp_time_stamp; - int fully_acked = true; - int flag = 0; u32 pkts_acked = 0; - u32 reord = tp->packets_out; - u32 prior_sacked = tp->sacked_out; - s32 seq_rtt = -1; - s32 ca_seq_rtt = -1; - ktime_t last_ackt = net_invalid_timestamp(); + bool rtt_update; + int flag = 0; + + first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 acked_pcount; u8 sacked = scb->sacked; + u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3038,14 +3066,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; - ca_seq_rtt = -1; - seq_rtt = -1; } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } + last_ackt = skb->skb_mstamp; + WARN_ON_ONCE(last_ackt.v64 == 0); + if (!first_ackt.v64) + first_ackt = last_ackt; + if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) @@ -3079,7 +3105,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tp->scoreboard_skb_hint = NULL; if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = NULL; if (skb == tp->lost_skb_hint) @@ -3092,18 +3117,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; + skb_mstamp_get(&now); + if (first_ackt.v64) { + seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); + ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); + if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); } - tcp_ack_update_rtt(sk, flag, seq_rtt); - tcp_rearm_rto(sk); - if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); } else { @@ -3120,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - s32 rtt_us = -1; - - /* Is the ACK triggering packet unambiguous? */ - if (!(flag & FLAG_RETRANS_DATA_ACKED)) { - /* High resolution needed and available? */ - if (ca_ops->flags & TCP_CONG_RTT_STAMP && - !ktime_equal(last_ackt, - net_invalid_timestamp())) - rtt_us = ktime_us_delta(ktime_get_real(), - last_ackt); - else if (ca_seq_rtt >= 0) - rtt_us = jiffies_to_usecs(ca_seq_rtt); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } + } else if (skb && rtt_update && sack_rtt_us >= 0 && + sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + /* Do not re-arm RTO if the sack RTT is measured from data sent + * after when the head was last (re)transmitted. Otherwise the + * timeout may continue to extend in loss recovery. + */ + tcp_rearm_rto(sk); } #if FASTRETRANS_DEBUG > 0 @@ -3191,11 +3215,22 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } +/* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - const struct tcp_sock *tp = tcp_sk(sk); - return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !tcp_in_cwnd_reduction(sk); + if (tcp_in_cwnd_reduction(sk)) + return false; + + /* If reordering is high then always grow cwnd whenever data is + * delivered regardless of its ordering. Otherwise stay conservative + * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ + * new SACK or ECE mark may first advance cwnd here and later reduce + * cwnd in tcp_fastretrans_alert() based on more states. + */ + if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; } /* Check that window update is acceptable. @@ -3314,7 +3349,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tcp_init_cwnd_reduction(sk, true); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); - tcp_set_ca_state(sk, TCP_CA_Open); + tcp_try_keep_open(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY); } @@ -3330,12 +3365,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight; u32 prior_fackets; int prior_packets = tp->packets_out; - int prior_sacked = tp->sacked_out; - int pkts_acked = 0; - int previous_packets_out = 0; + const int prior_unsacked = tp->packets_out - tp->sacked_out; + int acked = 0; /* Number of packets newly acked */ + long sack_rtt_us = -1L; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3363,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= FLAG_SND_UNA_ADVANCED; prior_fackets = tp->fackets_out; - prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3392,7 +3425,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_rtt_us); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; @@ -3410,23 +3444,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - previous_packets_out = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); + acked = tp->packets_out; + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + sack_rtt_us); + acked -= tp->packets_out; - pkts_acked = previous_packets_out - tp->packets_out; + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, acked); if (tcp_ack_is_dubious(sk, flag)) { - /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - prior_packets, is_dupack, flag); - } else { - if (flag & FLAG_DATA_ACKED) - tcp_cong_avoid(sk, ack, prior_in_flight); + tcp_fastretrans_alert(sk, acked, prior_unsacked, + is_dupack, flag); } - if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3438,13 +3469,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); + tcp_update_pacing_rate(sk); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - prior_packets, is_dupack, flag); + tcp_fastretrans_alert(sk, acked, prior_unsacked, + is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3465,9 +3497,10 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - prior_packets, is_dupack, flag); + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_rtt_us); + tcp_fastretrans_alert(sk, acked, prior_unsacked, + is_dupack, flag); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -3598,7 +3631,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + if (*ptr) + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + else + tp->rx_opt.rcv_tsecr = 0; return true; } return false; @@ -3623,7 +3659,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, } tcp_parse_options(skb, &tp->rx_opt, 1, NULL); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; return true; @@ -3646,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int opcode = *ptr++; int opsize; - switch(opcode) { + switch (opcode) { case TCPOPT_EOL: return NULL; case TCPOPT_NOP: @@ -3780,6 +3816,7 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3791,7 +3828,9 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + dst = __sk_dst_get(sk); + if (!dst || !dst_metric(dst, RTAX_QUICKACK)) + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -4003,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ - for (i=this_sack+1; i < num_sacks; i++) + for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; @@ -4159,6 +4198,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; } @@ -4234,8 +4274,10 @@ add_sack: if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: - if (skb) + if (skb) { + tcp_grow_window(sk, skb); skb_set_owner_r(skb, sk); + } } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, @@ -4370,7 +4412,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } @@ -4660,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - /* Limited by application or receiver window. */ - u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; - } - tp->snd_cwnd_used = 0; - } - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4718,15 +4738,7 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk)) { - int sndmem = SKB_TRUESIZE(max_t(u32, - tp->rx_opt.mss_clamp, - tp->mss_cache) + - MAX_TCP_HEADER); - int demanded = max_t(unsigned int, tp->snd_cwnd, - tp->reordering + 1); - sndmem *= 2 * demanded; - if (sndmem > sk->sk_sndbuf) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tcp_sndbuf_expand(sk); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -4879,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -4965,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } out: return copied_early; @@ -5070,8 +5082,8 @@ discard: * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ -int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); @@ -5148,7 +5160,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); - return 0; + return; } else { /* Header too small */ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -5240,8 +5252,8 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk, 0); - return 0; + sk->sk_data_ready(sk); + return; } } @@ -5257,7 +5269,7 @@ slow_path: */ if (!tcp_validate_incoming(sk, skb, th, 1)) - return 0; + return; step5: if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) @@ -5273,7 +5285,7 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - return 0; + return; csum_error: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); @@ -5281,7 +5293,6 @@ csum_error: discard: __kfree_skb(skb); - return 0; } EXPORT_SYMBOL(tcp_rcv_established); @@ -5361,9 +5372,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, break; } tcp_rearm_rto(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; + if (tp->syn_data_acked) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); return false; } @@ -5376,7 +5390,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0, &foc); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { @@ -5601,6 +5615,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; int queued = 0; + bool acceptable; + u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5671,157 +5687,150 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; /* step 5: check the ACK field */ - if (true) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT) > 0; - - switch (sk->sk_state) { - case TCP_SYN_RECV: - if (acceptable) { - /* Once we leave TCP_SYN_RECV, we no longer - * need req so release it. - */ - if (req) { - tcp_synack_rtt_meas(sk, req); - tp->total_retrans = req->num_retrans; - - reqsk_fastopen_remove(sk, req, false); - } else { - /* Make sure socket is routed, for - * correct metrics. - */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_congestion_control(sk); + acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; - tcp_mtup_init(sk); - tcp_init_buffer_space(sk); - tp->copied_seq = tp->rcv_nxt; - } - smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - sk->sk_state_change(sk); - - /* Note, that this wakeup is only for marginal - * crossed SYN case. Passively open sockets - * are not waked up, because sk->sk_sleep == - * NULL and sk->sk_socket == NULL. - */ - if (sk->sk_socket) - sk_wake_async(sk, - SOCK_WAKE_IO, POLL_OUT); - - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = ntohs(th->window) << - tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - - if (req) { - /* Re-arm the timer because data may - * have been sent out. This is similar - * to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more - * aggressive and retranmitting any data - * sooner based on when they were sent - * out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + switch (sk->sk_state) { + case TCP_SYN_RECV: + if (!acceptable) + return 1; - /* Prevent spurious tcp_cwnd_restart() on - * first data packet. - */ - tp->lsndtime = tcp_time_stamp; + /* Once we leave TCP_SYN_RECV, we no longer need req + * so release it. + */ + if (req) { + synack_stamp = tcp_rsk(req)->snt_synack; + tp->total_retrans = req->num_retrans; + reqsk_fastopen_remove(sk, req, false); + } else { + synack_stamp = tp->lsndtime; + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_congestion_control(sk); + + tcp_mtup_init(sk); + tp->copied_seq = tp->rcv_nxt; + tcp_init_buffer_space(sk); + } + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); - tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); - } else { - return 1; - } - break; + /* Note, that this wakeup is only for marginal crossed SYN case. + * Passively open sockets are not waked up, because + * sk->sk_sleep == NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_synack_rtt_meas(sk, synack_stamp); - case TCP_FIN_WAIT1: - /* If we enter the TCP_FIN_WAIT1 state and we are a - * Fast Open socket and this is the first acceptable - * ACK we have received, this would have acknowledged - * our SYNACK so stop the SYNACK timer. + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + if (req) { + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. */ - if (req != NULL) { - /* Return RST if ack_seq is invalid. - * Note that RFC793 only says to generate a - * DUPACK for it but for TCP Fast Open it seems - * better to treat this case like TCP_SYN_RECV - * above. - */ - if (!acceptable) - return 1; - /* We no longer need the request sock. */ - reqsk_fastopen_remove(sk, req, false); - tcp_rearm_rto(sk); - } - if (tp->snd_una == tp->write_seq) { - struct dst_entry *dst; - - tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; - - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - - if (!sock_flag(sk, SOCK_DEAD)) - /* Wake up lingering close() */ - sk->sk_state_change(sk); - else { - int tmo; - - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { - tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; - } + tcp_rearm_rto(sk); + } else + tcp_init_metrics(sk); - tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { - /* Bad case. We could lose such FIN otherwise. - * It is not a big problem, but it looks confusing - * and not so rare event. We still can lose it now, - * if it spins in bh_lock_sock(), but it is really - * marginal case. - */ - inet_csk_reset_keepalive_timer(sk, tmo); - } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; - } - } - } - break; + tcp_update_pacing_rate(sk); - case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk, TCP_TIME_WAIT, 0); - goto discard; - } + /* Prevent spurious tcp_cwnd_restart() on first data packet */ + tp->lsndtime = tcp_time_stamp; + + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); + break; + + case TCP_FIN_WAIT1: { + struct dst_entry *dst; + int tmo; + + /* If we enter the TCP_FIN_WAIT1 state and we are a + * Fast Open socket and this is the first acceptable + * ACK we have received, this would have acknowledged + * our SYNACK so stop the SYNACK timer. + */ + if (req != NULL) { + /* Return RST if ack_seq is invalid. + * Note that RFC793 only says to generate a + * DUPACK for it but for TCP Fast Open it seems + * better to treat this case like TCP_SYN_RECV + * above. + */ + if (!acceptable) + return 1; + /* We no longer need the request sock. */ + reqsk_fastopen_remove(sk, req, false); + tcp_rearm_rto(sk); + } + if (tp->snd_una != tp->write_seq) break; - case TCP_LAST_ACK: - if (tp->snd_una == tp->write_seq) { - tcp_update_metrics(sk); - tcp_done(sk); - goto discard; - } + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + + if (!sock_flag(sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + sk->sk_state_change(sk); break; } + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + break; + } + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; } /* step 6: check the URG bit */ |
