diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1281 |
1 files changed, 700 insertions, 581 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b39f0d86e44..fa2c85ca5bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -105,6 +105,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -120,8 +121,7 @@ int sysctl_tcp_abc __read_mostly; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -132,7 +132,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ - len = skb_shinfo(skb)->gso_size ?: skb->len; + len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { icsk->icsk_ack.rcv_mss = len; } else { @@ -172,8 +172,8 @@ static void tcp_incr_quickack(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); - if (quickacks==0) - quickacks=2; + if (quickacks == 0) + quickacks = 2; if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } @@ -198,7 +198,7 @@ static inline int tcp_in_quickack_mode(const struct sock *sk) static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) { - if (tp->ecn_flags&TCP_ECN_OK) + if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -215,7 +215,7 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) { - if (tp->ecn_flags&TCP_ECN_OK) { + if (tp->ecn_flags & TCP_ECN_OK) { if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) tp->ecn_flags |= TCP_ECN_DEMAND_CWR; /* Funny extension: if ECT is not set on a segment, @@ -228,19 +228,19 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) { - if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return 1; return 0; } @@ -289,8 +289,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; + int truesize = tcp_win_from_space(skb->truesize) >> 1; + int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -302,8 +302,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -317,12 +316,13 @@ static void tcp_grow_window(struct sock *sk, * will fit to rcvbuf in future. */ if (tcp_win_from_space(skb->truesize) <= skb->len) - incr = 2*tp->advmss; + incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); if (incr) { - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, + tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; } } @@ -397,10 +397,9 @@ static void tcp_clamp_window(struct sock *sk) sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) - tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); + tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } - /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. @@ -413,7 +412,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); - hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); @@ -470,16 +469,15 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, - jiffies - tp->rcv_rtt_est.time, - 1); + tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && @@ -502,8 +500,7 @@ void tcp_rcv_space_adjust(struct sock *sk) goto new_measure; time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || - tp->rcv_rtt_est.rtt == 0) + if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; space = 2 * (tp->copied_seq - tp->rcvq_space.seq); @@ -579,7 +576,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } else { int m = now - icsk->icsk_ack.lrcvtime; - if (m <= TCP_ATO_MIN/2) { + if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { @@ -591,7 +588,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -608,7 +605,7 @@ static u32 tcp_rto_min(struct sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst->metrics[RTAX_RTO_MIN-1]; + rto_min = dst->metrics[RTAX_RTO_MIN - 1]; return rto_min; } @@ -671,14 +668,14 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; + tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ - tp->srtt = m<<3; /* take the measured time to be rtt */ - tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } @@ -732,7 +729,7 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); - if (dst && (dst->flags&DST_HOST)) { + if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; @@ -742,7 +739,7 @@ void tcp_update_metrics(struct sock *sk) * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) - dst->metrics[RTAX_RTT-1] = 0; + dst->metrics[RTAX_RTT - 1] = 0; return; } @@ -754,9 +751,9 @@ void tcp_update_metrics(struct sock *sk) */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) - dst->metrics[RTAX_RTT-1] = tp->srtt; + dst->metrics[RTAX_RTT - 1] = tp->srtt; else - dst->metrics[RTAX_RTT-1] -= (m>>3); + dst->metrics[RTAX_RTT - 1] -= (m >> 3); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { @@ -769,7 +766,7 @@ void tcp_update_metrics(struct sock *sk) m = tp->mdev; if (m >= dst_metric(dst, RTAX_RTTVAR)) - dst->metrics[RTAX_RTTVAR-1] = m; + dst->metrics[RTAX_RTTVAR - 1] = m; else dst->metrics[RTAX_RTTVAR-1] -= (dst->metrics[RTAX_RTTVAR-1] - m)>>2; @@ -783,7 +780,7 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; + dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ @@ -863,6 +860,9 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) */ static void tcp_disable_fack(struct tcp_sock *tp) { + /* RFC3517 uses different metric in lost marker => reset on change */ + if (tcp_is_fack(tp)) + tp->lost_skb_hint = NULL; tp->rx_opt.sack_ok &= ~2; } @@ -1112,16 +1112,22 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from - * SACK blocks by the caller). Also calculate the lowest snd_nxt among the - * remaining retransmitted skbs to avoid some costly processing per ACKs. + * highest SACK block). Also calculate the lowest snd_nxt among the remaining + * retransmitted skbs to avoid some costly processing per ACKs. */ -static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) +static void tcp_mark_lost_retrans(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int flag = 0; int cnt = 0; u32 new_low_seq = tp->snd_nxt; + u32 received_upto = tcp_highest_sack_seq(tp); + + if (!tcp_is_fack(tp) || !tp->retrans_out || + !after(received_upto, tp->lost_retrans_low) || + icsk->icsk_ca_state != TCP_CA_Recovery) + return; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; @@ -1149,9 +1155,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - flag |= FLAG_DATA_SACKED; - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1161,8 +1166,6 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (tp->retrans_out) tp->lost_retrans_low = new_low_seq; - - return flag; } static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, @@ -1230,34 +1233,205 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, + int *reord, int dup_sack, int fack_count) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 sacked = TCP_SKB_CB(skb)->sacked; + int flag = 0; + + /* Account D-SACK for retransmitted packet. */ + if (dup_sack && (sacked & TCPCB_RETRANS)) { + if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + if (sacked & TCPCB_SACKED_ACKED) + *reord = min(fack_count, *reord); + } + + /* Nothing to do; acked frame is about to be dropped (was ACKed). */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + return flag; + + if (!(sacked & TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= + ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= tcp_skb_pcount(skb); + tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } else { + if (!(sacked & TCPCB_RETRANS)) { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + *reord = min(fack_count, *reord); + + /* SACK enhanced F-RTO (RFC4138; Appendix B) */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + flag |= FLAG_ONLY_ORIG_SACKED; + } + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out += tcp_skb_pcount(skb); + + fack_count += tcp_skb_pcount(skb); + + /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ + if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq)) + tp->lost_cnt_hint += tcp_skb_pcount(skb); + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + + /* D-SACK. We can detect redundant retransmission in S|R and plain R + * frames and clear it. undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; + } + + return flag; +} + +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, + struct tcp_sack_block *next_dup, + u32 start_seq, u32 end_seq, + int dup_sack_in, int *fack_count, + int *reord, int *flag) +{ + tcp_for_write_queue_from(skb, sk) { + int in_sack = 0; + int dup_sack = dup_sack_in; + + if (skb == tcp_send_head(sk)) + break; + + /* queue is in-order => we can short-circuit the walk early */ + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + if ((next_dup != NULL) && + before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { + in_sack = tcp_match_skb_to_sack(sk, skb, + next_dup->start_seq, + next_dup->end_seq); + if (in_sack > 0) + dup_sack = 1; + } + + if (in_sack <= 0) + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, + end_seq); + if (unlikely(in_sack < 0)) + break; + + if (in_sack) + *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, + *fack_count); + + *fack_count += tcp_skb_pcount(skb); + } + return skb; +} + +/* Avoid all extra work that is being done by sacktag while walking in + * a normal way + */ +static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, + u32 skip_to_seq) +{ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) + break; + } + return skb; +} + +static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, + struct sock *sk, + struct tcp_sack_block *next_dup, + u32 skip_to_seq, + int *fack_count, int *reord, + int *flag) +{ + if (next_dup == NULL) + return skb; + + if (before(next_dup->start_seq, skip_to_seq)) { + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); + tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); + } + + return skb; +} + +static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +{ + return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); +} + static int -tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, + u32 prior_snd_una) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); - struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); - struct sk_buff *cached_skb; - int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); + struct tcp_sack_block sp[4]; + struct tcp_sack_block *cache; + struct sk_buff *skb; + int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3; + int used_sacks; int reord = tp->packets_out; - int prior_fackets; - u32 highest_sack_end_seq = tp->lost_retrans_low; int flag = 0; int found_dup_sack = 0; - int cached_fack_count; - int i; + int fack_count; + int i, j; int first_sack_index; - int force_one_sack; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; - tp->highest_sack = tp->snd_una; + tcp_highest_sack_reset(sk); } - prior_fackets = tp->fackets_out; - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -1272,78 +1446,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (!tp->packets_out) goto out; - /* SACK fastpath: - * if the only SACK change is the increase of the end_seq of - * the first block then only apply that SACK block - * and use retrans queue hinting otherwise slowpath */ - force_one_sack = 1; - for (i = 0; i < num_sacks; i++) { - __be32 start_seq = sp[i].start_seq; - __be32 end_seq = sp[i].end_seq; - - if (i == 0) { - if (tp->recv_sack_cache[i].start_seq != start_seq) - force_one_sack = 0; - } else { - if ((tp->recv_sack_cache[i].start_seq != start_seq) || - (tp->recv_sack_cache[i].end_seq != end_seq)) - force_one_sack = 0; - } - tp->recv_sack_cache[i].start_seq = start_seq; - tp->recv_sack_cache[i].end_seq = end_seq; - } - /* Clear the rest of the cache sack blocks so they won't match mistakenly. */ - for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) { - tp->recv_sack_cache[i].start_seq = 0; - tp->recv_sack_cache[i].end_seq = 0; - } - + used_sacks = 0; first_sack_index = 0; - if (force_one_sack) - num_sacks = 1; - else { - int j; - tp->fastpath_skb_hint = NULL; - - /* order SACK blocks to allow in order walk of the retrans queue */ - for (i = num_sacks-1; i > 0; i--) { - for (j = 0; j < i; j++){ - if (after(ntohl(sp[j].start_seq), - ntohl(sp[j+1].start_seq))){ - struct tcp_sack_block_wire tmp; - - tmp = sp[j]; - sp[j] = sp[j+1]; - sp[j+1] = tmp; - - /* Track where the first SACK block goes to */ - if (j == first_sack_index) - first_sack_index = j+1; - } - - } - } - } - - /* Use SACK fastpath hint if valid */ - cached_skb = tp->fastpath_skb_hint; - cached_fack_count = tp->fastpath_cnt_hint; - if (!cached_skb) { - cached_skb = tcp_write_queue_head(sk); - cached_fack_count = 0; - } - for (i = 0; i < num_sacks; i++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count; - int dup_sack = (found_dup_sack && (i == first_sack_index)); - int next_dup = (found_dup_sack && (i+1 == first_sack_index)); + int dup_sack = !i && found_dup_sack; - sp++; + sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq)); + sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (!tcp_is_sackblock_valid(tp, dup_sack, + sp[used_sacks].start_seq, + sp[used_sacks].end_seq)) { if (dup_sack) { if (!tp->undo_marker) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); @@ -1352,169 +1465,148 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && - !after(end_seq, tp->snd_una)) + !after(sp[used_sacks].end_seq, tp->snd_una)) continue; NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); } + if (i == 0) + first_sack_index = -1; continue; } - skb = cached_skb; - fack_count = cached_fack_count; - - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - flag |= FLAG_DATA_LOST; - - tcp_for_write_queue_from(skb, sk) { - int in_sack = 0; - u8 sacked; - - if (skb == tcp_send_head(sk)) - break; - - cached_skb = skb; - cached_fack_count = fack_count; - if (i == first_sack_index) { - tp->fastpath_skb_hint = skb; - tp->fastpath_cnt_hint = fack_count; - } + /* Ignore very old stuff early */ + if (!after(sp[used_sacks].end_seq, prior_snd_una)) + continue; - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if (!before(TCP_SKB_CB(skb)->seq, end_seq)) - break; + used_sacks++; + } - dup_sack = (found_dup_sack && (i == first_sack_index)); + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = used_sacks - 1; i > 0; i--) { + for (j = 0; j < i; j++) { + if (after(sp[j].start_seq, sp[j + 1].start_seq)) { + struct tcp_sack_block tmp; - /* Due to sorting DSACK may reside within this SACK block! */ - if (next_dup) { - u32 dup_start = ntohl(sp->start_seq); - u32 dup_end = ntohl(sp->end_seq); + tmp = sp[j]; + sp[j] = sp[j + 1]; + sp[j + 1] = tmp; - if (before(TCP_SKB_CB(skb)->seq, dup_end)) { - in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end); - if (in_sack > 0) - dup_sack = 1; - } + /* Track where the first SACK block goes to */ + if (j == first_sack_index) + first_sack_index = j + 1; } + } + } - /* DSACK info lost if out-of-mem, try SACK still */ - if (in_sack <= 0) - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); - if (unlikely(in_sack < 0)) - break; + skb = tcp_write_queue_head(sk); + fack_count = 0; + i = 0; - sacked = TCP_SKB_CB(skb)->sacked; + if (!tp->sacked_out) { + /* It's already past, so skip checking against it */ + cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); + } else { + cache = tp->recv_sack_cache; + /* Skip empty blocks in at head of the cache */ + while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && + !cache->end_seq) + cache++; + } - /* Account D-SACK for retransmitted packet. */ - if ((dup_sack && in_sack) && - (sacked & TCPCB_RETRANS) && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) - tp->undo_retrans--; + while (i < used_sacks) { + u32 start_seq = sp[i].start_seq; + u32 end_seq = sp[i].end_seq; + int dup_sack = (found_dup_sack && (i == first_sack_index)); + struct tcp_sack_block *next_dup = NULL; - /* The frame is ACKed. */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { - if (sacked&TCPCB_RETRANS) { - if ((dup_sack && in_sack) && - (sacked&TCPCB_SACKED_ACKED)) - reord = min(fack_count, reord); - } + if (found_dup_sack && ((i + 1) == first_sack_index)) + next_dup = &sp[i + 1]; - /* Nothing to do; acked frame is about to be dropped. */ - fack_count += tcp_skb_pcount(skb); - continue; - } + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; - if (!in_sack) { - fack_count += tcp_skb_pcount(skb); - continue; + /* Skip too early cached blocks */ + while (tcp_sack_cache_ok(tp, cache) && + !before(start_seq, cache->end_seq)) + cache++; + + /* Can skip some work by looking recv_sack_cache? */ + if (tcp_sack_cache_ok(tp, cache) && !dup_sack && + after(end_seq, cache->start_seq)) { + + /* Head todo? */ + if (before(start_seq, cache->start_seq)) { + skb = tcp_sacktag_skip(skb, sk, start_seq); + skb = tcp_sacktag_walk(skb, sk, next_dup, + start_seq, + cache->start_seq, + dup_sack, &fack_count, + &reord, &flag); } - if (!(sacked&TCPCB_SACKED_ACKED)) { - if (sacked & TCPCB_SACKED_RETRANS) { - /* If the segment is not tagged as lost, - * we do not clear RETRANS, believing - * that retransmission is still in flight. - */ - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out -= tcp_skb_pcount(skb); - tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } else { - if (!(sacked & TCPCB_RETRANS)) { - /* New sack for not retransmitted frame, - * which was in hole. It is reordering. - */ - if (fack_count < prior_fackets) - reord = min(fack_count, reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) - flag |= FLAG_ONLY_ORIG_SACKED; - } - - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out -= tcp_skb_pcount(skb); + /* Rest of the block already fully processed? */ + if (!after(end_seq, cache->end_seq)) + goto advance_sp; - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } + skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, + cache->end_seq, + &fack_count, &reord, + &flag); - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; - flag |= FLAG_DATA_SACKED; - tp->sacked_out += tcp_skb_pcount(skb); - - fack_count += tcp_skb_pcount(skb); - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; - - if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) { - tp->highest_sack = TCP_SKB_CB(skb)->seq; - highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq; - } - } else { - if (dup_sack && (sacked&TCPCB_RETRANS)) - reord = min(fack_count, reord); - - fack_count += tcp_skb_pcount(skb); + /* ...tail remains todo... */ + if (tcp_highest_sack_seq(tp) == cache->end_seq) { + /* ...but better entrypoint exists! */ + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; + cache++; + goto walk; } - /* D-SACK. We can detect redundant retransmission - * in S|R and plain R frames and clear it. - * undo_retrans is decreased above, L|R frames - * are accounted above as well. - */ - if (dup_sack && - (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - tp->retransmit_skb_hint = NULL; - } + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); + /* Check overlap against next cached too (past this one already) */ + cache++; + continue; + } + + if (!before(start_seq, tcp_highest_sack_seq(tp))) { + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; } + skb = tcp_sacktag_skip(skb, sk, start_seq); + +walk: + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, + dup_sack, &fack_count, &reord, &flag); +advance_sp: /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct * due to in-order walk */ if (after(end_seq, tp->frto_highmark)) flag &= ~FLAG_ONLY_ORIG_SACKED; + + i++; } - if (tp->retrans_out && - after(highest_sack_end_seq, tp->lost_retrans_low) && - icsk->icsk_ca_state == TCP_CA_Recovery) - flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); + /* Clear the head of the cache sack blocks so we can skip it next time */ + for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { + tp->recv_sack_cache[i].start_seq = 0; + tp->recv_sack_cache[i].end_seq = 0; + } + for (j = 0; j < used_sacks; j++) + tp->recv_sack_cache[i++] = sp[j]; + + tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + if ((reord < tp->fackets_out) && + ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, tp->fackets_out - reord, 0); @@ -1565,10 +1657,10 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 >= tp->sacked_out) + if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else - tp->sacked_out -= acked-1; + tp->sacked_out -= acked - 1; } tcp_check_reno_reordering(sk, acked); tcp_verify_left_out(tp); @@ -1602,10 +1694,10 @@ int tcp_use_frto(struct sock *sk) tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) break; } return 1; @@ -1715,7 +1807,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... */ - if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { /* For some reason this R-bit might get cleared? */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out += tcp_skb_pcount(skb); @@ -1728,7 +1820,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } /* Don't lost mark skbs that were fwd transmitted after RTO */ - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) && + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) && !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1743,7 +1835,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); @@ -1810,7 +1902,7 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { @@ -1822,7 +1914,7 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); @@ -1830,18 +1922,15 @@ void tcp_enter_loss(struct sock *sk, int how) tp->frto_counter = 0; } -static int tcp_check_sack_reneging(struct sock *sk) +/* If ACK arrived pointing to a remembered SACK, it means that our + * remembered SACKs do not reflect real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * + * Do processing similar to RTO timeout. + */ +static int tcp_check_sack_reneging(struct sock *sk, int flag) { - struct sk_buff *skb; - - /* If ACK arrived pointing to a remembered SACK, - * it means that our remembered SACKs do not reflect - * real state of receiver i.e. - * receiver _host_ is heavily congested (or buggy). - * Do processing similar to RTO timeout. - */ - if ((skb = tcp_write_queue_head(sk)) != NULL && - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); @@ -1857,7 +1946,27 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; +} + +/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs + * counter when SACK is enabled (without SACK, sacked_out is used for + * that purpose). + * + * Instead, with FACK TCP uses fackets_out that includes both SACKed + * segments up to the highest received SACK block so far and holes in + * between them. + * + * With reordering, holes may still be in flight, so RFC3517 recovery + * uses pure sacked_out (total number of SACKed segments) even though + * it violates the RFC that uses duplicate ACKs, often these are equal + * but when e.g. out-of-window ACKs or packet duplication occurs, + * they differ. Since neither occurs due to loss, TCP should really + * ignore them. + */ +static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +{ + return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1980,13 +2089,13 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_fackets_out(tp) > tp->reordering) + if (tcp_dupack_heurestics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk)) + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -2010,17 +2119,18 @@ static int tcp_time_to_recover(struct sock *sk) * retransmitted past LOST markings in the first place? I'm not fully sure * about undo and end of connection cases, which can cause R without L? */ -static void tcp_verify_retransmit_hint(struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((tp->retransmit_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) |