aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-23 17:26:31 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-23 17:26:31 -0700
commita39451c17f53bbae053555670c7b678d46bcebba (patch)
tree8bb6106ec7812a421c3d3eb9c8c273580cc703fc
parentadb7ee3746b579a7fa7af7c4ec2c8164bc910ed4 (diff)
parent0e57976b6376f7fda6bef8b7dee2a3c8819ec9e9 (diff)
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
-rw-r--r--Documentation/networking/ip-sysctl.txt56
-rw-r--r--Documentation/networking/tcp.txt69
-rw-r--r--include/linux/sysctl.h9
-rw-r--r--include/linux/tcp.h49
-rw-r--r--include/linux/tcp_diag.h4
-rw-r--r--include/net/tcp.h237
-rw-r--r--net/ipv4/Kconfig90
-rw-r--r--net/ipv4/Makefile10
-rw-r--r--net/ipv4/sysctl_net_ipv4.c114
-rw-r--r--net/ipv4/tcp.c2
-rw-r--r--net/ipv4/tcp_bic.c331
-rw-r--r--net/ipv4/tcp_cong.c195
-rw-r--r--net/ipv4/tcp_diag.c34
-rw-r--r--net/ipv4/tcp_highspeed.c181
-rw-r--r--net/ipv4/tcp_htcp.c289
-rw-r--r--net/ipv4/tcp_hybla.c187
-rw-r--r--net/ipv4/tcp_input.c737
-rw-r--r--net/ipv4/tcp_ipv4.c3
-rw-r--r--net/ipv4/tcp_minisocks.c4
-rw-r--r--net/ipv4/tcp_output.c23
-rw-r--r--net/ipv4/tcp_scalable.c68
-rw-r--r--net/ipv4/tcp_vegas.c411
-rw-r--r--net/ipv4/tcp_westwood.c259
-rw-r--r--net/ipv6/tcp_ipv6.c2
24 files changed, 2304 insertions, 1060 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a2c893a7475..ab65714d95f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -304,57 +304,6 @@ tcp_low_latency - BOOLEAN
changed would be a Beowulf compute cluster.
Default: 0
-tcp_westwood - BOOLEAN
- Enable TCP Westwood+ congestion control algorithm.
- TCP Westwood+ is a sender-side only modification of the TCP Reno
- protocol stack that optimizes the performance of TCP congestion
- control. It is based on end-to-end bandwidth estimation to set
- congestion window and slow start threshold after a congestion
- episode. Using this estimation, TCP Westwood+ adaptively sets a
- slow start threshold and a congestion window which takes into
- account the bandwidth used at the time congestion is experienced.
- TCP Westwood+ significantly increases fairness wrt TCP Reno in
- wired networks and throughput over wireless links.
- Default: 0
-
-tcp_vegas_cong_avoid - BOOLEAN
- Enable TCP Vegas congestion avoidance algorithm.
- TCP Vegas is a sender-side only change to TCP that anticipates
- the onset of congestion by estimating the bandwidth. TCP Vegas
- adjusts the sending rate by modifying the congestion
- window. TCP Vegas should provide less packet loss, but it is
- not as aggressive as TCP Reno.
- Default:0
-
-tcp_bic - BOOLEAN
- Enable BIC TCP congestion control algorithm.
- BIC-TCP is a sender-side only change that ensures a linear RTT
- fairness under large windows while offering both scalability and
- bounded TCP-friendliness. The protocol combines two schemes
- called additive increase and binary search increase. When the
- congestion window is large, additive increase with a large
- increment ensures linear RTT fairness as well as good
- scalability. Under small congestion windows, binary search
- increase provides TCP friendliness.
- Default: 0
-
-tcp_bic_low_window - INTEGER
- Sets the threshold window (in packets) where BIC TCP starts to
- adjust the congestion window. Below this threshold BIC TCP behaves
- the same as the default TCP Reno.
- Default: 14
-
-tcp_bic_fast_convergence - BOOLEAN
- Forces BIC TCP to more quickly respond to changes in congestion
- window. Allows two flows sharing the same connection to converge
- more rapidly.
- Default: 1
-
-tcp_default_win_scale - INTEGER
- Sets the minimum window scale TCP will negotiate for on all
- conections.
- Default: 7
-
tcp_tso_win_divisor - INTEGER
This allows control over what percentage of the congestion window
can be consumed by a single TSO frame.
@@ -368,6 +317,11 @@ tcp_frto - BOOLEAN
where packet loss is typically due to random radio interference
rather than intermediate router congestion.
+tcp_congestion_control - STRING
+ Set the congestion control algorithm to be used for new
+ connections. The algorithm "reno" is always available, but
+ additional choices may be available based on kernel configuration.
+
somaxconn - INTEGER
Limit of socket listen() backlog, known in userspace as SOMAXCONN.
Defaults to 128. See also tcp_max_syn_backlog for additional tuning
diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
index 71749007091..0fa30042557 100644
--- a/Documentation/networking/tcp.txt
+++ b/Documentation/networking/tcp.txt
@@ -1,5 +1,72 @@
-How the new TCP output machine [nyi] works.
+TCP protocol
+============
+
+Last updated: 21 June 2005
+
+Contents
+========
+
+- Congestion control
+- How the new TCP output machine [nyi] works
+
+Congestion control
+==================
+
+The following variables are used in the tcp_sock for congestion control:
+snd_cwnd The size of the congestion window
+snd_ssthresh Slow start threshold. We are in slow start if
+ snd_cwnd is less than this.
+snd_cwnd_cnt A counter used to slow down the rate of increase
+ once we exceed slow start threshold.
+snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to.
+snd_cwnd_stamp Timestamp for when congestion window last validated.
+snd_cwnd_used Used as a highwater mark for how much of the
+ congestion window is in use. It is used to adjust
+ snd_cwnd down when the link is limited by the
+ application rather than the network.
+
+As of 2.6.13, Linux supports pluggable congestion control algorithms.
+A congestion control mechanism can be registered through functions in
+tcp_cong.c. The functions used by the congestion control mechanism are
+registered via passing a tcp_congestion_ops struct to
+tcp_register_congestion_control. As a minimum, name, ssthresh,
+cong_avoid and min_cwnd must be valid.
+Private data for a congestion control mechanism is stored in tp->ca_priv.
+tcp_ca(tp) returns a pointer to this space. This is preallocated space - it
+is important to check the size of your private data will fit this space, or
+alternatively space could be allocated elsewhere and a pointer to it could
+be stored here.
+
+There are three kinds of congestion control algorithms currently: The
+simplest ones are derived from TCP reno (highspeed, scalable) and just
+provide an alternative congestion window calculation. More complex
+ones like BIC try to look at other events to provide better
+heuristics. There are also round trip time based algorithms like
+Vegas and Westwood+.
+
+Good TCP congestion control is a complex problem because the algorithm
+needs to maintain fairness and performance. Please review current
+research and RFCs before developing new modules.
+
+The congestion control mechanism that is used is determined by the
+setting of the sysctl net.ipv4.tcp_congestion_control.
+The default congestion control will be the last one registered (LIFO);
+so if you build everything as modules, the default will be reno. If you
+build with the defaults from Kconfig, then BIC will be builtin (not a module)
+and it will end up the default.
+
+If you really want a particular default value then you will need
+to set it with the sysctl. If you use a sysctl, the module will be autoloaded
+if needed and you will get the expected protocol. If you ask for an
+unknown congestion method, then the sysctl attempt will fail.
+
+If you remove a tcp congestion control module, then you will get the next
+available one. Since reno can not be built as a module, and can not be
+deleted, it will always be available.
+
+How the new TCP output machine [nyi] works.
+===========================================
Data is kept on a single queue. The skb->users flag tells us if the frame is
one that has been queued already. To add a frame we throw it on the end. Ack
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 614e939c78a..72965bfe6cf 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -333,21 +333,14 @@ enum
NET_TCP_FRTO=92,
NET_TCP_LOW_LATENCY=93,
NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
- NET_TCP_WESTWOOD=95,
NET_IPV4_IGMP_MAX_MSF=96,
NET_TCP_NO_METRICS_SAVE=97,
- NET_TCP_VEGAS=98,
- NET_TCP_VEGAS_ALPHA=99,
- NET_TCP_VEGAS_BETA=100,
- NET_TCP_VEGAS_GAMMA=101,
- NET_TCP_BIC=102,
- NET_TCP_BIC_FAST_CONVERGENCE=103,
- NET_TCP_BIC_LOW_WINDOW=104,
NET_TCP_DEFAULT_WIN_SCALE=105,
NET_TCP_MODERATE_RCVBUF=106,
NET_TCP_TSO_WIN_DIVISOR=107,
NET_TCP_BIC_BETA=108,
NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
+ NET_TCP_CONG_CONTROL=110,
};
enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 97a7c9e03df..3ea75dd6640 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -203,13 +203,6 @@ struct tcp_sack_block {
__u32 end_seq;
};
-enum tcp_congestion_algo {
- TCP_RENO=0,
- TCP_VEGAS,
- TCP_WESTWOOD,
- TCP_BIC,
-};
-
struct tcp_options_received {
/* PAWS/RTTM data */
long ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -305,7 +298,7 @@ struct tcp_sock {
__u8 reordering; /* Packet reordering metric. */
__u8 frto_counter; /* Number of new acks after RTO */
- __u8 adv_cong; /* Using Vegas, Westwood, or BIC */
+ __u8 unused;
__u8 defer_accept; /* User waits for some data after accept() */
/* RTT measurement */
@@ -401,37 +394,10 @@ struct tcp_sock {
__u32 time;
} rcvq_space;
-/* TCP Westwood structure */
- struct {
- __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
- __u32 bw_est; /* bandwidth estimate */
- __u32 rtt_win_sx; /* here starts a new evaluation... */
- __u32 bk;
- __u32 snd_una; /* used for evaluating the number of acked bytes */
- __u32 cumul_ack;
- __u32 accounted;
- __u32 rtt;
- __u32 rtt_min; /* minimum observed RTT */
- } westwood;
-
-/* Vegas variables */
- struct {
- __u32 beg_snd_nxt; /* right edge during last RTT */
- __u32 beg_snd_una; /* left edge during last RTT */
- __u32 beg_snd_cwnd; /* saves the size of the cwnd */
- __u8 doing_vegas_now;/* if true, do vegas for this RTT */
- __u16 cntRTT; /* # of RTTs measured within last RTT */
- __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
- __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
- } vegas;
-
- /* BI TCP Parameters */
- struct {
- __u32 cnt; /* increase cwnd by 1 after this number of ACKs */
- __u32 last_max_cwnd; /* last maximium snd_cwnd */
- __u32 last_cwnd; /* the last snd_cwnd */
- __u32 last_stamp; /* time when updated last_cwnd */
- } bictcp;
+ /* Pluggable TCP congestion control hook */
+ struct tcp_congestion_ops *ca_ops;
+ u32 ca_priv[16];
+#define TCP_CA_PRIV_SIZE (16*sizeof(u32))
};
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
return (struct tcp_sock *)sk;
}
+static inline void *tcp_ca(const struct tcp_sock *tp)
+{
+ return (void *) tp->ca_priv;
+}
+
#endif
#endif /* _LINUX_TCP_H */
diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h
index ceee962e1d1..7a599674394 100644
--- a/include/linux/tcp_diag.h
+++ b/include/linux/tcp_diag.h
@@ -99,9 +99,10 @@ enum
TCPDIAG_MEMINFO,
TCPDIAG_INFO,
TCPDIAG_VEGASINFO,
+ TCPDIAG_CONG,
};
-#define TCPDIAG_MAX TCPDIAG_VEGASINFO
+#define TCPDIAG_MAX TCPDIAG_CONG
/* TCPDIAG_MEM */
@@ -123,5 +124,4 @@ struct tcpvegas_info {
__u32 tcpv_minrtt;
};
-
#endif /* _TCP_DIAG_H_ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f730935b824..e427cf35915 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
#else
# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
#endif
-
-#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
- * max_cwnd = snd_cwnd * beta
- */
-#define BICTCP_MAX_INCREMENT 32 /*
- * Limit on the amount of
- * increment allowed during
- * binary search.
- */
-#define BICTCP_FUNC_OF_MIN_INCR 11 /*
- * log(B/Smin)/log(B/(B-1))+1,
- * Smin:min increment
- * B:log factor
- */
-#define BICTCP_B 4 /*
- * In binary search,
- * go to point (max+min)/N
- */
-
/*
* TCP option
*/
@@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_tw_reuse;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
-extern int sysctl_tcp_vegas_alpha;
-extern int sysctl_tcp_vegas_beta;
-extern int sysctl_tcp_vegas_gamma;
extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
-extern int sysctl_tcp_bic_fast_convergence;
-extern int sysctl_tcp_bic_low_window;
-extern int sysctl_tcp_bic_beta;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
@@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp,
tp->packets_out -= tcp_skb_pcount(skb);
}
+/* Events passed to congestion control interface */
+enum tcp_ca_event {
+ CA_EVENT_TX_START, /* first transmit when no packets in flight */
+ CA_EVENT_CWND_RESTART, /* congestion window restart */
+ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
+ CA_EVENT_FRTO, /* F-RTO (Forward RTO-Recovery) event */
+ CA_EVENT_LOSS, /* loss timeout */
+ CA_EVENT_FAST_ACK, /* in sequence ack */
+ CA_EVENT_SLOW_ACK, /* other ack */
+};
+
+/*
+ * Interface for adding new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX 16
+struct tcp_congestion_ops {
+ struct list_head list;
+
+ /* initialize private data (optional) */
+ void (*init)(struct tcp_sock *tp);
+ /* cleanup private data (optional) */
+ void (*release)(struct tcp_sock *tp);
+
+ /* return slow start threshold (required) */
+ u32 (*ssthresh)(struct tcp_sock *tp);
+ /* lower bound for congestion window (optional) */
+ u32 (*min_cwnd)(struct tcp_sock *tp);
+ /* do new cwnd calculation (required) */
+ void (*cong_avoid)(struct tcp_sock *tp, u32 ack,
+ u32 rtt, u32 in_flight, int good_ack);
+ /* round trip time sample per acked packet (optional) */
+ void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
+ /* call before changing ca_state (optional) */
+ void (*set_state)(struct tcp_sock *tp, u8 new_state);
+ /* call when cwnd event occurs (optional) */
+ void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
+ /* new value of cwnd after loss (optional) */
+ u32 (*undo_cwnd)(struct tcp_sock *tp);
+ /* hook for packet ack accounting (optional) */
+ void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
+ /* get info for tcp_diag (optional) */
+ void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);
+
+ char name[TCP_CA_NAME_MAX];
+ struct module *owner;
+};
+
+extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
+extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+
+extern void tcp_init_congestion_control(struct tcp_sock *tp);
+extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
+extern int tcp_set_default_congestion_control(const char *name);
+extern void tcp_get_default_congestion_control(char *name);
+
+extern struct tcp_congestion_ops tcp_reno;
+extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
+extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
+ u32 rtt, u32 in_flight, int flag);
+extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
+
+static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
+{
+ if (tp->ca_ops->set_state)
+ tp->ca_ops->set_state(tp, ca_state);
+ tp->ca_state = ca_state;
+}
+
+static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+ if (tp->ca_ops->cwnd_event)
+ tp->ca_ops->cwnd_event(tp, event);
+}
+
/* This determines how many packets are "in the network" to the best
* of our knowledge. In many cases it is conservative, but where
* detailed information is available from the receiver (via SACK
@@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
return (tp->packets_out - tp->left_out + tp->retrans_out);
}
-/*
- * Which congestion algorithim is in use on the connection.
- */
-#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS)
-#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD)
-#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC)
-
-/* Recalculate snd_ssthresh, we want to set it to:
- *
- * Reno:
- * one half the current congestion window, but no
- * less than two segments
- *
- * BIC:
- * behave like Reno until low_window is reached,
- * then increase congestion window slowly
- */
-static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
-{
- if (tcp_is_bic(tp)) {
- if (sysctl_tcp_bic_fast_convergence &&
- tp->snd_cwnd < tp->bictcp.last_max_cwnd)
- tp->bictcp.last_max_cwnd = (tp->snd_cwnd *
- (BICTCP_BETA_SCALE
- + sysctl_tcp_bic_beta))
- / (2 * BICTCP_BETA_SCALE);
- else
- tp->bictcp.last_max_cwnd = tp->snd_cwnd;
-
- if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
- return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
- / BICTCP_BETA_SCALE, 2U);
- }
-
- return max(tp->snd_cwnd >> 1U, 2U);
-}
-
-/* Stop taking Vegas samples for now. */
-#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0)
-
-static inline void tcp_vegas_enable(struct tcp_sock *tp)
-{
- /* There are several situations when we must "re-start" Vegas:
- *
- * o when a connection is established
- * o after an RTO
- * o after fast recovery
- * o when we send a packet and there is no outstanding
- * unacknowledged data (restarting an idle connection)
- *
- * In these circumstances we cannot do a Vegas calculation at the
- * end of the first RTT, because any calculation we do is using
- * stale info -- both the saved cwnd and congestion feedback are
- * stale.
- *
- * Instead we must wait until the completion of an RTT during
- * which we actually receive ACKs.
- */
-
- /* Begin taking Vegas samples next time we send something. */
- tp->vegas.doing_vegas_now = 1;
-
- /* Set the beginning of the next send window. */
- tp->vegas.beg_snd_nxt = tp->snd_nxt;
-
- tp->vegas.cntRTT = 0;
- tp->vegas.minRTT = 0x7fffffff;
-}
-
-/* Should we be taking Vegas samples right now? */
-#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now)
-
-extern void tcp_ca_init(struct tcp_sock *tp);
-
-static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
-{
- if (tcp_is_vegas(tp)) {
- if (ca_state == TCP_CA_Open)
- tcp_vegas_enable(tp);
- else
- tcp_vegas_disable(tp);
- }
- tp->ca_state = ca_state;
-}
-
/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
* The exception is rate halving phase, when cwnd is decreasing towards
* ssthresh.
@@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
static inline void __tcp_enter_cwr(struct tcp_sock *tp)
{
tp->undo_marker = 0;
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp) + 1U);
tp->snd_cwnd_cnt = 0;
@@ -1876,52 +1837,4 @@ struct tcp_iter_state {
extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
-/* TCP Westwood functions and constants */
-
-#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
-#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
-
-static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
-{
- if (tcp_is_westwood(tp))
- tp->westwood.rtt = rtt_seq;
-}
-
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
- return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
- (__u32) (tp->mss_cache_std),
- 2U);
-}
-
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
- return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
-static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
-{
- __u32 ssthresh = 0;
-
- if (tcp_is_westwood(tp)) {
- ssthresh = __tcp_westwood_bw_rttmin(tp);
- if (ssthresh)
- tp->snd_ssthresh = ssthresh;
- }
-
- return (ssthresh != 0);
-}
-
-static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
-{
- __u32 cwnd = 0;
-
- if (tcp_is_westwood(tp)) {
- cwnd = __tcp_westwood_bw_rttmin(tp);
- if (cwnd)
- tp->snd_cwnd = cwnd;
- }
-
- return (cwnd != 0);
-}
#endif /* _TCP_H */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c34..690e88ba248 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -433,5 +433,95 @@ config IP_TCPDIAG
config IP_TCPDIAG_IPV6
def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+# TCP Reno is builtin (required as fallback)
+menu "TCP congestion control"
+ depends on INET
+
+config TCP_CONG_BIC
+ tristate "Binary Increase Congestion (BIC) control"
+ depends on INET
+ default y
+ ---help---
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_WESTWOOD
+ tristate "TCP Westwood+"
+ depends on INET
+ default m
+ ---help---
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+ tristate "H-TCP"
+ depends on INET
+ default m
+ ---help---
+ H-TCP is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+ tristate "High Speed TCP"
+ depends on INET && EXPERIMENTAL
+ default n
+ ---help---
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+ tristate "TCP-Hybla congestion control algorithm"
+ depends on INET && EXPERIMENTAL
+ default n
+ ---help---
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
+
+config TCP_CONG_VEGAS
+ tristate "TCP Vegas"
+ depends on INET && EXPERIMENTAL
+ default n
+ ---help---
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
+config TCP_CONG_SCALABLE
+ tristate "Scalable TCP"
+ depends on INET && EXPERIMENTAL
+ default n
+ ---help---
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+
+endmenu
+
source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1ad..5718cdb3a61 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,8 @@
obj-y := utils.o route.o inetpeer.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
ip_output.o ip_sockglue.o \
- tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+ tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IP_VS) += ipvs/
obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0..e3289453241 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
return 1;
}
+static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char val[TCP_CA_NAME_MAX];
+ ctl_table tbl = {
+ .data = val,
+ .maxlen = TCP_CA_NAME_MAX,
+ };
+ int ret;
+
+ tcp_get_default_congestion_control(val);
+
+ ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+}
+
+int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
+{
+ char val[TCP_CA_NAME_MAX];
+ ctl_table tbl = {
+ .data = val,
+ .maxlen = TCP_CA_NAME_MAX,
+ };
+ int ret;
+
+ tcp_get_default_congestion_control(val);
+ ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
+ context);
+ if (ret == 0 && newval && newlen)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+}
+
+
ctl_table ipv4_table[] = {
{
.ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec,
},
{
- .ctl_name = NET_TCP_WESTWOOD,
- .procname = "tcp_westwood",
- .data = &sysctl_tcp_westwood,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS,
- .procname = "tcp_vegas_cong_avoid",
- .data = &sysctl_tcp_vegas_cong_avoid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS_ALPHA,
- .procname = "tcp_vegas_alpha",
- .data = &sysctl_tcp_vegas_alpha,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS_BETA,
- .procname = "tcp_vegas_beta",
- .data = &sysctl_tcp_vegas_beta,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS_GAMMA,
- .procname = "tcp_vegas_gamma",
- .data = &sysctl_tcp_vegas_gamma,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_BIC,
- .procname = "tcp_bic",
- .data = &sysctl_tcp_bic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
- .procname = "tcp_bic_fast_convergence",
- .data = &sysctl_tcp_bic_fast_convergence,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_BIC_LOW_WINDOW,
- .procname = "tcp_bic_low_window",
- .data = &sysctl_tcp_bic_low_window,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
.ctl_name = NET_TCP_MODERATE_RCVBUF,
.procname = "tcp_moderate_rcvbuf",
.data = &sysctl_tcp_moderate_rcvbuf,
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec,
},
{
- .ctl_name = NET_TCP_BIC_BETA,
- .procname = "tcp_bic_beta",
- .data = &sysctl_tcp_bic_beta,
- .maxlen = sizeof(int),
+ .ctl_name = NET_TCP_CONG_CONTROL,
+ .procname = "tcp_congestion_control",
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = &proc_tcp_congestion_control,
+ .strategy = &sysctl_tcp_congestion_control,
},
+
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd3..f3dbc8dc126 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2333,6 +2333,8 @@ void __init tcp_init(void)
printk(KERN_INFO "TCP: Hash tables configured "
"(established %d bind %d)\n",
tcp_ehash_size << 1, tcp_bhash_size);
+
+ tcp_register_congestion_control(&tcp_reno);
}
EXPORT_SYMBOL(tcp_accept);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 00000000000..ec38d45d664
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
+/*
+ * Binary Increase Congestion control for TCP
+ *
+ * This is from the implementation of BICTCP in
+ * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
+ * "Binary Increase Congestion Control for Fast, Long Dist