diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2005-11-10 21:24:21 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-11-10 21:24:21 -0800 |
commit | 79ffeeb9e66da8c60de8c8ab676658bcbc47c1f7 (patch) | |
tree | f3c3841e1d4b5c7cd3695fe34ff23e0be08d7dac | |
parent | a5aac37f1cdbbd1e587fc618e778ddae124e5ac3 (diff) | |
parent | 6a438bbe68c7013a42d9c5aee5a40d7dafdbe6ec (diff) |
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 5 | ||||
-rw-r--r-- | include/linux/sysctl.h | 1 | ||||
-rw-r--r-- | include/linux/tcp.h | 16 | ||||
-rw-r--r-- | include/net/sock.h | 6 | ||||
-rw-r--r-- | include/net/tcp.h | 71 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_bic.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 40 | ||||
-rw-r--r-- | net/ipv4/tcp_highspeed.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_htcp.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp_hybla.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 288 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 61 | ||||
-rw-r--r-- | net/ipv4/tcp_scalable.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_vegas.c | 42 |
19 files changed, 413 insertions, 199 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 65895bb5141..ebc09a159f6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER TCP variables: +tcp_abc - INTEGER + Controls Appropriate Byte Count defined in RFC3465. If set to + 0 then does congestion avoid once per ack. 1 is conservative + value, and 2 is more agressive. + tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt will be retransmitted. Should not be higher than 255. Default value diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 22cf5e1ac98..ab2791b3189 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -390,6 +390,7 @@ enum NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_TCP_CONG_CONTROL=110, + NET_TCP_ABC=111, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ac4ca44c75c..0e1da6602e0 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -307,6 +307,21 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ + struct tcp_sack_block recv_sack_cache[4]; + + /* from STCP, retrans queue hinting */ + struct sk_buff* lost_skb_hint; + + struct sk_buff *scoreboard_skb_hint; + struct sk_buff *retransmit_skb_hint; + struct sk_buff *forward_skb_hint; + struct sk_buff *fastpath_skb_hint; + + int fastpath_cnt_hint; + int lost_cnt_hint; + int retransmit_cnt_hint; + int forward_cnt_hint; + __u16 advmss; /* Advertised MSS */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u32 lost_out; /* Lost packets */ @@ -326,6 +341,7 @@ struct tcp_sock { __u32 snd_up; /* Urgent pointer */ __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ diff --git a/include/net/sock.h b/include/net/sock.h index ff13c4cc287..982b4ecd187 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ skb = skb->next) +/*from STCP for fast SACK Process*/ +#define sk_stream_for_retrans_queue_from(skb, sk) \ + for (; (skb != (sk)->sk_send_head) && \ + (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ + skb = skb->next) + /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 96cc3b434e4..0f984801197 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TCP_SYN_RETRIES 5 /* number of times to retry active opening a - * connection: ~180sec is RFC minumum */ + * connection: ~180sec is RFC minimum */ #define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a - * connection: ~180sec is RFC minumum */ + * connection: ~180sec is RFC minimum */ #define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned @@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ -#define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */ +#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ extern struct inet_timewait_death_row tcp_death_row; @@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; +extern int sysctl_tcp_abc; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk); /* TCP timestamps are only 32-bits, this causes a slight * complication on 64-bit systems since we store a snapshot - * of jiffies in the buffer control blocks below. We decidely + * of jiffies in the buffer control blocks below. We decidedly * only use of the low 32-bits of jiffies and hide the ugly * casts with the following macro. */ #define tcp_time_stamp ((__u32)(jiffies)) -/* This is what the send packet queueing engine uses to pass +/* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission * code. We also store the host-order sequence numbers in * here too. This is 36 bytes on 32-bit architectures, @@ -597,7 +598,7 @@ struct tcp_skb_cb { #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS) -#define TCPCB_URG 0x20 /* Urgent pointer advenced here */ +#define TCPCB_URG 0x20 /* Urgent pointer advanced here */ #define TCPCB_AT_TAIL (TCPCB_URG) @@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) (tp->snd_cwnd >> 2))); } +/* + * Linear increase during slow start + */ +static inline void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} + + static inline void tcp_sync_left_out(struct tcp_sock *tp) { if (tp->rx_opt.sack_ok && @@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; + tp->bytes_acked = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { __tcp_enter_cwr(sk); tcp_set_ca_state(sk, TCP_CA_CWR); @@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp) return 3; } +/* RFC2861 Check whether we are limited by application or congestion window + * This is the inverse of cwnd check in tcp_tso_should_defer + */ +static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 left; + + if (in_flight >= tp->snd_cwnd) + return 1; + + if (!(sk->sk_route_caps & NETIF_F_TSO)) + return 0; + + left = tp->snd_cwnd - in_flight; + if (sysctl_tcp_tso_win_divisor) + return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd; + else + return left <= tcp_max_burst(tp); +} + static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, const struct sk_buff *skb) { @@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void) TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1); } +/*from STCP */ +static inline void clear_all_retrans_hints(struct tcp_sock *tp){ + tp->lost_skb_hint = NULL; + tp->scoreboard_skb_hint = NULL; + tp->retransmit_skb_hint = NULL; + tp->forward_skb_hint = NULL; + tp->fastpath_skb_hint = NULL; +} + /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65268562351..01444a02b48 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 72b7c22e1ea..9ac7a4f46bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e060904..1d0cd86621b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..c7cc62c8dc1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde08..82b3c189bd7 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, u32 pkts_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e9..3284cfb993e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623d..40dbb387751 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578d..40a26b7157b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. @@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smoother things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); @@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased fastly, decrease too fastly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; i<num_sacks; i++, sp++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; i<num_sacks; i++, sp++) { + struct sk_buff *skb; + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); |