diff options
author | Stephen Hemminger <shemminger@osdl.org> | 2005-06-23 12:19:55 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2005-06-23 12:19:55 -0700 |
commit | 317a76f9a44b437d6301718f4e5d08bd93f98da7 (patch) | |
tree | caeba9839dee264f59b035b81c3d13d6c61b638e | |
parent | a8ad86f2dc46356f87be1327dabc18bdbda32f50 (diff) |
[TCP]: Add pluggable congestion control algorithm infrastructure.
Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in
or modules. The legacy "new RENO" algorithm is used as a starting
point and fallback.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/sysctl.h | 9 | ||||
-rw-r--r-- | include/linux/tcp.h | 49 | ||||
-rw-r--r-- | include/net/tcp.h | 237 | ||||
-rw-r--r-- | net/ipv4/Makefile | 3 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 114 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 195 | ||||
-rw-r--r-- | net/ipv4/tcp_diag.c | 20 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 737 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 23 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 |
13 files changed, 399 insertions, 999 deletions
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a..72965bfe6cf 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df..3ea75dd6640 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... */ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f730935b824..e427cf35915 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #else # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) #endif - -#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation - * max_cwnd = snd_cwnd * beta - */ -#define BICTCP_MAX_INCREMENT 32 /* - * Limit on the amount of - * increment allowed during - * binary search. - */ -#define BICTCP_FUNC_OF_MIN_INCR 11 /* - * log(B/Smin)/log(B/(B-1))+1, - * Smin:min increment - * B:log factor - */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ - /* * TCP option */ @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_westwood; -extern int sysctl_tcp_vegas_cong_avoid; -extern int sysctl_tcp_vegas_alpha; -extern int sysctl_tcp_vegas_beta; -extern int sysctl_tcp_vegas_gamma; extern int sysctl_tcp_nometrics_save; -extern int sysctl_tcp_bic; -extern int sysctl_tcp_bic_fast_convergence; -extern int sysctl_tcp_bic_low_window; -extern int sysctl_tcp_bic_beta; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, tp->packets_out -= tcp_skb_pcount(skb); } +/* Events passed to congestion control interface */ +enum tcp_ca_event { + CA_EVENT_TX_START, /* first transmit when no packets in flight */ + CA_EVENT_CWND_RESTART, /* congestion window restart */ + CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ + CA_EVENT_FRTO, /* fast recovery timeout */ + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_FAST_ACK, /* in sequence ack */ + CA_EVENT_SLOW_ACK, /* other ack */ +}; + +/* + * Interface for adding new TCP congestion control handlers + */ +#define TCP_CA_NAME_MAX 16 +struct tcp_congestion_ops { + struct list_head list; + + /* initialize private data (optional) */ + void (*init)(struct tcp_sock *tp); + /* cleanup private data (optional) */ + void (*release)(struct tcp_sock *tp); + + /* return slow start threshold (required) */ + u32 (*ssthresh)(struct tcp_sock *tp); + /* lower bound for congestion window (optional) */ + u32 (*min_cwnd)(struct tcp_sock *tp); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int good_ack); + /* round trip time sample per acked packet (optional) */ + void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + /* call before changing ca_state (optional) */ + void (*set_state)(struct tcp_sock *tp, u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + /* new value of cwnd after loss (optional) */ + u32 (*undo_cwnd)(struct tcp_sock *tp); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); + /* get info for tcp_diag (optional) */ + void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + + char name[TCP_CA_NAME_MAX]; + struct module *owner; +}; + +extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); +extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); + +extern void tcp_init_congestion_control(struct tcp_sock *tp); +extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern int tcp_set_default_congestion_control(const char *name); +extern void tcp_get_default_congestion_control(char *name); + +extern struct tcp_congestion_ops tcp_reno; +extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); +extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int flag); +extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); + +static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +{ + if (tp->ca_ops->set_state) + tp->ca_ops->set_state(tp, ca_state); + tp->ca_state = ca_state; +} + +static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (tp->ca_ops->cwnd_event) + tp->ca_ops->cwnd_event(tp, event); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) return (tp->packets_out - tp->left_out + tp->retrans_out); } -/* - * Which congestion algorithim is in use on the connection. - */ -#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) -#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) -#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) - -/* Recalculate snd_ssthresh, we want to set it to: - * - * Reno: - * one half the current congestion window, but no - * less than two segments - * - * BIC: - * behave like Reno until low_window is reached, - * then increase congestion window slowly - */ -static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) -{ - if (tcp_is_bic(tp)) { - if (sysctl_tcp_bic_fast_convergence && - tp->snd_cwnd < tp->bictcp.last_max_cwnd) - tp->bictcp.last_max_cwnd = (tp->snd_cwnd * - (BICTCP_BETA_SCALE - + sysctl_tcp_bic_beta)) - / (2 * BICTCP_BETA_SCALE); - else - tp->bictcp.last_max_cwnd = tp->snd_cwnd; - - if (tp->snd_cwnd > sysctl_tcp_bic_low_window) - return max((tp->snd_cwnd * sysctl_tcp_bic_beta) - / BICTCP_BETA_SCALE, 2U); - } - - return max(tp->snd_cwnd >> 1U, 2U); -} - -/* Stop taking Vegas samples for now. */ -#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) - -static inline void tcp_vegas_enable(struct tcp_sock *tp) -{ - /* There are several situations when we must "re-start" Vegas: - * - * o when a connection is established - * o after an RTO - * o after fast recovery - * o when we send a packet and there is no outstanding - * unacknowledged data (restarting an idle connection) - * - * In these circumstances we cannot do a Vegas calculation at the - * end of the first RTT, because any calculation we do is using - * stale info -- both the saved cwnd and congestion feedback are - * stale. - * - * Instead we must wait until the completion of an RTT during - * which we actually receive ACKs. - */ - - /* Begin taking Vegas samples next time we send something. */ - tp->vegas.doing_vegas_now = 1; - - /* Set the beginning of the next send window. */ - tp->vegas.beg_snd_nxt = tp->snd_nxt; - - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; -} - -/* Should we be taking Vegas samples right now? */ -#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) - -extern void tcp_ca_init(struct tcp_sock *tp); - -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) -{ - if (tcp_is_vegas(tp)) { - if (ca_state == TCP_CA_Open) - tcp_vegas_enable(tp); - else - tcp_vegas_disable(tp); - } - tp->ca_state = ca_state; -} - /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. @@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static inline void __tcp_enter_cwr(struct tcp_sock *tp) { tp->undo_marker = 0; - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1876,52 +1837,4 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); -/* TCP Westwood functions and constants */ - -#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ -#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ - -static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) -{ - if (tcp_is_westwood(tp)) - tp->westwood.rtt = rtt_seq; -} - -static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache_std), - 2U); -} - -static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; -} - -static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) -{ - __u32 ssthresh = 0; - - if (tcp_is_westwood(tp)) { - ssthresh = __tcp_westwood_bw_rttmin(tp); - if (ssthresh) - tp->snd_ssthresh = ssthresh; - } - - return (ssthresh != 0); -} - -static inline int tcp_westwood_cwnd(struct tcp_sock *tp) -{ - __u32 cwnd = 0; - - if (tcp_is_westwood(tp)) { - cwnd = __tcp_westwood_bw_rttmin(tp); - if (cwnd) - tp->snd_cwnd = cwnd; - } - - return (cwnd != 0); -} #endif /* _TCP_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1ad..89c0b4cb470 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,8 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0..e3289453241 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, + context); + if (ret == 0 && newval && newlen) + ret = tcp_set_default_congestion_control(val); + return ret; +} + + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -612,70 +651,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_WESTWOOD, - .procname = "tcp_westwood", - .data = &sysctl_tcp_westwood, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS, - .procname = "tcp_vegas_cong_avoid", - .data = &sysctl_tcp_vegas_cong_avoid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_ALPHA, - .procname = "tcp_vegas_alpha", - .data = &sysctl_tcp_vegas_alpha, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_BETA, - .procname = "tcp_vegas_beta", - .data = &sysctl_tcp_vegas_beta, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_GAMMA, - .procname = "tcp_vegas_gamma", - .data = &sysctl_tcp_vegas_gamma, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC, - .procname = "tcp_bic", - .data = &sysctl_tcp_bic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, - .procname = "tcp_bic_fast_convergence", - .data = &sysctl_tcp_bic_fast_convergence, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_LOW_WINDOW, - .procname = "tcp_bic_low_window", - .data = &sysctl_tcp_bic_low_window, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_BIC_BETA, - .procname = "tcp_bic_beta", - .data = &sysctl_tcp_bic_beta, - .maxlen = sizeof(int), + .ctl_name = NET_TCP_CONG_CONTROL, + .procname = "tcp_congestion_control", .mode = 0644, - .proc_handler = &proc_dointvec, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = &proc_tcp_congestion_control, + .strategy = &sysctl_tcp_congestion_control, }, + { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd3..f3dbc8dc126 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2333,6 +2333,8 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); + + tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 00000000000..665394a63ae --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,195 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/list.h> +#include <net/tcp.h> + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorthim to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. */ +void tcp_init_congestion_control(struct tcp_sock *tp) +{ + struct tcp_congestion_ops *ca; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + tp->ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (tp->ca_ops->init) + tp->ca_ops->init(tp); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct tcp_sock *tp) +{ + if (tp->ca_ops->release) + tp->ca_ops->release(tp); + module_put(tp->ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. */ +u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +EXPORT_SYMBOL_GPL(tcp_reno); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc0792..867acc0f79d 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -42,7 +42,6 @@ struct tcpdiag_entry static struct sock *tcpnl; - #define TCPDIAG_PUT(skb, attrtype, attrlen) \ ({ int rtalen = RTA_LENGTH(attrlen); \ struct rtattr *rta; \ @@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; - struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) - && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) - vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (vinfo) { - if (tcp_is_vegas(tp)) { - vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; - vinfo->tcpv_rttcnt = tp->vegas.cntRTT; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); - } else { - vinfo->tcpv_enabled = 0; - vinfo->tcpv_rttcnt = 0; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); - } - } + if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) + tp->ca_ops->get_info(tp, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a..7bbbbc33eb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,7 +61,6 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs - * Angelo Dell'Aera: TCP Westwood+ support */ #include <linux/config.h> @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; -int sysctl_tcp_westwood; -int sysctl_tcp_vegas_cong_avoid; int sysctl_tcp_moderate_rcvbuf = 1; -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 -int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; -int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; -int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; -int sysctl_tcp_bic = 1; -int sysctl_tcp_bic_fast_convergence = 1; -int sysctl_tcp_bic_low_window = 14; -int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ - #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ @@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static void init_bictcp(struct tcp_sock *tp) -{ - tp->bictcp.cnt = 0; - - tp->bictcp.last_max_cwnd = 0; - tp->bictcp.last_cwnd = 0; - tp->bictcp.last_stamp = 0; -} - /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ tcp_grow_window(sk, tp, skb); } -/* When starting a new connection, pin down the current choice of - * congestion algorithm. - */ -void tcp_ca_init(struct tcp_sock *tp) -{ - if (sysctl_tcp_westwood) - tp->adv_cong = TCP_WESTWOOD; - else if (sysctl_tcp_bic) - tp->adv_cong = TCP_BIC; - else if (sysctl_tcp_vegas_cong_avoid) { - tp->adv_cong = TCP_VEGAS; - tp->vegas.baseRTT = 0x7fffffff; - tcp_vegas_enable(tp); - } -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) -{ - __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ - - /* Filter to find propagation delay: */ - if (vrtt < tp->vegas.baseRTT) - tp->vegas.baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. delay + queuing delay: - */ - tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); - tp->vegas.cntRTT++; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) { long m = mrtt; /* RTT */ - if (tcp_vegas_enabled(tp)) - vegas_rtt_calc(tp, mrtt); - /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) tp->rtt_seq = tp->snd_nxt; } - tcp_westwood_update_rtt(tp, tp->srtt >> 3); + if (tp->ca_ops->rtt_sample) + tp->ca_ops->rtt_sample(tp, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - if (!tcp_westwood_ssthresh(tp)) - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - - init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ - static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; - __u32 limit; - - /* - * TCP Westwood - * Here limit is evaluated as BWestimation*RTTmin (for obtaining it - * in packets we use mss_cache). If sysctl_tcp_westwood is off - * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is - * still used as usual. It prevents other strange cases in which - * BWE*RTTmin could assume value 0. It should not happen but... - */ - - if (!(limit = tcp_westwood_bw_rttmin(tp))) - limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > limit) + if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_s |