aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_cong.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_cong.c')
-rw-r--r--net/ipv4/tcp_cong.c237
1 files changed, 159 insertions, 78 deletions
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index e688c687d62..7b09d8b49fa 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,16 +1,18 @@
/*
* Plugable TCP congestion control support and newReno
* congestion control.
- * Based on ideas from I/O scheduler suport and Web100.
+ * Based on ideas from I/O scheduler support and Web100.
*
* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
*/
-#include <linux/config.h>
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/gfp.h>
#include <net/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -30,7 +32,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
}
/*
- * Attach new congestion control algorthim to the list
+ * Attach new congestion control algorithm to the list
* of available options.
*/
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
@@ -38,19 +40,18 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
int ret = 0;
/* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
- printk(KERN_ERR "TCP %s does not implement required ops\n",
- ca->name);
+ if (!ca->ssthresh || !ca->cong_avoid) {
+ pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
spin_lock(&tcp_cong_list_lock);
if (tcp_ca_find(ca->name)) {
- printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+ pr_notice("%s already registered\n", ca->name);
ret = -EEXIST;
} else {
- list_add_rcu(&ca->list, &tcp_cong_list);
- printk(KERN_INFO "TCP %s registered\n", ca->name);
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ pr_info("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
@@ -78,18 +79,19 @@ void tcp_init_congestion_control(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
- if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
- return;
+ /* if no choice made yet assign the current value set as default */
+ if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (try_module_get(ca->owner)) {
+ icsk->icsk_ca_ops = ca;
+ break;
+ }
- rcu_read_lock();
- list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (try_module_get(ca->owner)) {
- icsk->icsk_ca_ops = ca;
- break;
+ /* fallback to next available */
}
-
+ rcu_read_unlock();
}
- rcu_read_unlock();
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
@@ -113,8 +115,8 @@ int tcp_set_default_congestion_control(const char *name)
spin_lock(&tcp_cong_list_lock);
ca = tcp_ca_find(name);
-#ifdef CONFIG_KMOD
- if (!ca) {
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
spin_unlock(&tcp_cong_list_lock);
request_module("tcp_%s", name);
@@ -124,6 +126,7 @@ int tcp_set_default_congestion_control(const char *name)
#endif
if (ca) {
+ ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
list_move(&ca->list, &tcp_cong_list);
ret = 0;
}
@@ -132,6 +135,30 @@ int tcp_set_default_congestion_control(const char *name)
return ret;
}
+/* Set default value from kernel configuration at bootup */
+static int __init tcp_congestion_default(void)
+{
+ return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+}
+late_initcall(tcp_congestion_default);
+
+
+/* Build string with list of available congestion control values */
+void tcp_get_available_congestion_control(char *buf, size_t maxlen)
+{
+ struct tcp_congestion_ops *ca;
+ size_t offs = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ca->name);
+
+ }
+ rcu_read_unlock();
+}
+
/* Get current default congestion control */
void tcp_get_default_congestion_control(char *name)
{
@@ -145,6 +172,65 @@ void tcp_get_default_congestion_control(char *name)
rcu_read_unlock();
}
+/* Built list of non-restricted congestion control values */
+void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
+{
+ struct tcp_congestion_ops *ca;
+ size_t offs = 0;
+
+ *buf = '\0';
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
+ continue;
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ca->name);
+
+ }
+ rcu_read_unlock();
+}
+
+/* Change list of non-restricted congestion control */
+int tcp_set_allowed_congestion_control(char *val)
+{
+ struct tcp_congestion_ops *ca;
+ char *saved_clone, *clone, *name;
+ int ret = 0;
+
+ saved_clone = clone = kstrdup(val, GFP_USER);
+ if (!clone)
+ return -ENOMEM;
+
+ spin_lock(&tcp_cong_list_lock);
+ /* pass 1 check for bad entries */
+ while ((name = strsep(&clone, " ")) && *name) {
+ ca = tcp_ca_find(name);
+ if (!ca) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ /* pass 2 clear old values */
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list)
+ ca->flags &= ~TCP_CONG_NON_RESTRICTED;
+
+ /* pass 3 mark as allowed */
+ while ((name = strsep(&val, " ")) && *name) {
+ ca = tcp_ca_find(name);
+ WARN_ON(!ca);
+ if (ca)
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
+ }
+out:
+ spin_unlock(&tcp_cong_list_lock);
+ kfree(saved_clone);
+
+ return ret;
+}
+
+
/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
@@ -154,19 +240,35 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
rcu_read_lock();
ca = tcp_ca_find(name);
+
+ /* no change asking for existing value */
if (ca == icsk->icsk_ca_ops)
goto out;
+#ifdef CONFIG_MODULES
+ /* not found attempt to autoload module */
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("tcp_%s", name);
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ }
+#endif
if (!ca)
err = -ENOENT;
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
+ err = -EPERM;
+
else if (!try_module_get(ca->owner))
err = -EBUSY;
else {
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;
- if (icsk->icsk_ca_ops->init)
+
+ if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
}
out:
@@ -174,34 +276,40 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
-
-/*
- * Linear increase during slow start
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- if (sysctl_tcp_abc) {
- /* RFC3465: Slow Start
- * TCP sender SHOULD increase cwnd by the number of
- * previously unacknowledged bytes ACKed by each incoming
- * acknowledgment, provided the increase is not more than L
- */
- if (tp->bytes_acked < tp->mss_cache)
- return;
-
- /* We MAY increase by 2 if discovered delayed ack */
- if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- }
- tp->bytes_acked = 0;
+ u32 cwnd = tp->snd_cwnd + acked;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+{
+ if (tp->snd_cwnd_cnt >= w) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else {
+ tp->snd_cwnd_cnt++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
@@ -209,37 +317,19 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
- int flag)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
-
- /* In dangerous area, increase slowly. */
- else if (sysctl_tcp_abc) {
- /* RFC3465: Apppriate Byte Count
- * increase once for each full cwnd acked
- */
- if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
- tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- } else {
- /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
- }
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ /* In dangerous area, increase slowly. */
+ else
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
@@ -251,20 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-/* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- return tp->snd_ssthresh/2;
-}
-EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
-
struct tcp_congestion_ops tcp_reno = {
+ .flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
};
/* Initial congestion control used (until SYN)
@@ -276,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = {
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);