diff options
Diffstat (limited to 'net/ipv4/tcp_cong.c')
| -rw-r--r-- | net/ipv4/tcp_cong.c | 215 |
1 files changed, 166 insertions, 49 deletions
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..7b09d8b49fa 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,16 +1,18 @@ /* * Plugable TCP congestion control support and newReno * congestion control. - * Based on ideas from I/O scheduler suport and Web100. + * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ -#include <linux/config.h> +#define pr_fmt(fmt) "TCP: " fmt + #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> +#include <linux/gfp.h> #include <net/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); @@ -30,7 +32,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* - * Attach new congestion control algorthim to the list + * Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) @@ -38,19 +40,18 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { - printk(KERN_ERR "TCP %s does not implement required ops\n", - ca->name); + if (!ca->ssthresh || !ca->cong_avoid) { + pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } spin_lock(&tcp_cong_list_lock); if (tcp_ca_find(ca->name)) { - printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + pr_notice("%s already registered\n", ca->name); ret = -EEXIST; } else { - list_add_rcu(&ca->list, &tcp_cong_list); - printk(KERN_INFO "TCP %s registered\n", ca->name); + list_add_tail_rcu(&ca->list, &tcp_cong_list); + pr_info("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); @@ -78,18 +79,19 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) - return; + /* if no choice made yet assign the current value set as default */ + if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + icsk->icsk_ca_ops = ca; + break; + } - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - icsk->icsk_ca_ops = ca; - break; + /* fallback to next available */ } - + rcu_read_unlock(); } - rcu_read_unlock(); if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -113,8 +115,8 @@ int tcp_set_default_congestion_control(const char *name) spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); -#ifdef CONFIG_KMOD - if (!ca) { +#ifdef CONFIG_MODULES + if (!ca && capable(CAP_NET_ADMIN)) { spin_unlock(&tcp_cong_list_lock); request_module("tcp_%s", name); @@ -124,6 +126,7 @@ int tcp_set_default_congestion_control(const char *name) #endif if (ca) { + ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ list_move(&ca->list, &tcp_cong_list); ret = 0; } @@ -132,6 +135,30 @@ int tcp_set_default_congestion_control(const char *name) return ret; } +/* Set default value from kernel configuration at bootup */ +static int __init tcp_congestion_default(void) +{ + return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); +} +late_initcall(tcp_congestion_default); + + +/* Build string with list of available congestion control values */ +void tcp_get_available_congestion_control(char *buf, size_t maxlen) +{ + struct tcp_congestion_ops *ca; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", ca->name); + + } + rcu_read_unlock(); +} + /* Get current default congestion control */ void tcp_get_default_congestion_control(char *name) { @@ -145,6 +172,65 @@ void tcp_get_default_congestion_control(char *name) rcu_read_unlock(); } +/* Built list of non-restricted congestion control values */ +void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) +{ + struct tcp_congestion_ops *ca; + size_t offs = 0; + + *buf = '\0'; + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) + continue; + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", ca->name); + + } + rcu_read_unlock(); +} + +/* Change list of non-restricted congestion control */ +int tcp_set_allowed_congestion_control(char *val) +{ + struct tcp_congestion_ops *ca; + char *saved_clone, *clone, *name; + int ret = 0; + + saved_clone = clone = kstrdup(val, GFP_USER); + if (!clone) + return -ENOMEM; + + spin_lock(&tcp_cong_list_lock); + /* pass 1 check for bad entries */ + while ((name = strsep(&clone, " ")) && *name) { + ca = tcp_ca_find(name); + if (!ca) { + ret = -ENOENT; + goto out; + } + } + + /* pass 2 clear old values */ + list_for_each_entry_rcu(ca, &tcp_cong_list, list) + ca->flags &= ~TCP_CONG_NON_RESTRICTED; + + /* pass 3 mark as allowed */ + while ((name = strsep(&val, " ")) && *name) { + ca = tcp_ca_find(name); + WARN_ON(!ca); + if (ca) + ca->flags |= TCP_CONG_NON_RESTRICTED; + } +out: + spin_unlock(&tcp_cong_list_lock); + kfree(saved_clone); + + return ret; +} + + /* Change congestion control for socket */ int tcp_set_congestion_control(struct sock *sk, const char *name) { @@ -154,19 +240,35 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) rcu_read_lock(); ca = tcp_ca_find(name); + + /* no change asking for existing value */ if (ca == icsk->icsk_ca_ops) goto out; +#ifdef CONFIG_MODULES + /* not found attempt to autoload module */ + if (!ca && capable(CAP_NET_ADMIN)) { + rcu_read_unlock(); + request_module("tcp_%s", name); + rcu_read_lock(); + ca = tcp_ca_find(name); + } +#endif if (!ca) err = -ENOENT; + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || + ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) + err = -EPERM; + else if (!try_module_get(ca->owner)) err = -EBUSY; else { tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; - if (icsk->icsk_ca_ops->init) + + if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); } out: @@ -174,6 +276,40 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } +/* Slow start is used when congestion window is no greater than the slow start + * threshold. We base on RFC2581 and also handle stretch ACKs properly. + * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but + * something better;) a packet is only considered (s)acked in its entirety to + * defend the ACK attacks described in the RFC. Slow start processes a stretch + * ACK of degree N as if N acks of degree 1 are received back to back except + * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and + * returns the leftover acks to adjust cwnd in congestion avoidance mode. + */ +int tcp_slow_start(struct tcp_sock *tp, u32 acked) +{ + u32 cwnd = tp->snd_cwnd + acked; + + if (cwnd > tp->snd_ssthresh) + cwnd = tp->snd_ssthresh + 1; + acked -= cwnd - tp->snd_cwnd; + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + return acked; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +{ + if (tp->snd_cwnd_cnt >= w) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); + /* * TCP Reno congestion control * This is special case used for fallback as well. @@ -181,29 +317,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, - int flag) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp, acked); + /* In dangerous area, increase slowly. */ + else + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); @@ -215,20 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); - struct tcp_congestion_ops tcp_reno = { + .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; /* Initial congestion control used (until SYN) @@ -240,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = { .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); |
