aboutsummaryrefslogtreecommitdiff
path: root/net/sched/sch_netem.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched/sch_netem.c')
-rw-r--r--net/sched/sch_netem.c977
1 files changed, 747 insertions, 230 deletions
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb9bf8d5003..111d70fddae 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -4,26 +4,32 @@
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * 2 of the License.
*
* Many of the algorithms and ideas for this came from
- * NIST Net which is not copyrighted.
+ * NIST Net which is not copyrighted.
*
* Authors: Stephen Hemminger <shemminger@osdl.org>
* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
*/
-#include <linux/config.h>
+#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/bitops.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
-#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
+#include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
+#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define VERSION "1.3"
/* Network Emulation Queuing algorithm.
====================================
@@ -47,76 +53,285 @@
control either since that can be handled by using token
bucket or other rate control.
- The simulator is limited by the Linux timer resolution
- and will create packet bursts on the HZ boundary (1ms).
+ Correlated Loss Generator models
+
+ Added generation of correlated loss according to the
+ "Gilbert-Elliot" model, a 4-state markov model.
+
+ References:
+ [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+ [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+ and intuitive loss model for packet networks and its implementation
+ in the Netem module in the Linux kernel", available in [1]
+
+ Authors: Stefano Salsano <stefano.salsano at uniroma2.it
+ Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
struct netem_sched_data {
+ /* internal t(ime)fifo qdisc uses t_root and sch->limit */
+ struct rb_root t_root;
+
+ /* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
- struct timer_list timer;
- u32 latency;
+ struct qdisc_watchdog watchdog;
+
+ psched_tdiff_t latency;
+ psched_tdiff_t jitter;
+
u32 loss;
+ u32 ecn;
u32 limit;
u32 counter;
u32 gap;
- u32 jitter;
u32 duplicate;
u32 reorder;
+ u32 corrupt;
+ u64 rate;
+ s32 packet_overhead;
+ u32 cell_size;
+ struct reciprocal_value cell_size_reciprocal;
+ s32 cell_overhead;
struct crndstate {
- unsigned long last;
- unsigned long rho;
- } delay_cor, loss_cor, dup_cor, reorder_cor;
+ u32 last;
+ u32 rho;
+ } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
struct disttable {
u32 size;
s16 table[0];
} *delay_dist;
+
+ enum {
+ CLG_RANDOM,
+ CLG_4_STATES,
+ CLG_GILB_ELL,
+ } loss_model;
+
+ enum {
+ TX_IN_GAP_PERIOD = 1,
+ TX_IN_BURST_PERIOD,
+ LOST_IN_GAP_PERIOD,
+ LOST_IN_BURST_PERIOD,
+ } _4_state_model;
+
+ enum {
+ GOOD_STATE = 1,
+ BAD_STATE,
+ } GE_state_model;
+
+ /* Correlated Loss Generation models */
+ struct clgstate {
+ /* state of the Markov chain */
+ u8 state;
+
+ /* 4-states and Gilbert-Elliot models */
+ u32 a1; /* p13 for 4-states or p for GE */
+ u32 a2; /* p31 for 4-states or r for GE */
+ u32 a3; /* p32 for 4-states or h for GE */
+ u32 a4; /* p14 for 4-states or 1-k for GE */
+ u32 a5; /* p23 used only in 4-states */
+ } clg;
+
};
-/* Time stamp put into socket buffer control block */
+/* Time stamp put into socket buffer control block
+ * Only valid when skbs are in our internal t(ime)fifo queue.
+ */
struct netem_skb_cb {
psched_time_t time_to_send;
+ ktime_t tstamp_save;
};
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+ BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+ offsetof(struct sk_buff, next) + sizeof(skb->next));
+ BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+ offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+ BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+ sizeof(skb->prev) +
+ sizeof(skb->tstamp));
+ return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+ return (struct sk_buff *)rb;
+}
+
+static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
+{
+ /* we assume we can use skb next/prev/tstamp as storage for rb_node */
+ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
+ return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
/* init_crandom - initialize correlated random number generator
* Use entropy source for initial seed.
*/
static void init_crandom(struct crndstate *state, unsigned long rho)
{
state->rho = rho;
- state->last = net_random();
+ state->last = prandom_u32();
}
/* get_crandom - correlated random number generator
* Next number depends on last value.
* rho is scaled to avoid floating point.
*/
-static unsigned long get_crandom(struct crndstate *state)
+static u32 get_crandom(struct crndstate *state)
{
u64 value, rho;
unsigned long answer;
- if (state->rho == 0) /* no correllation */
- return net_random();
+ if (state->rho == 0) /* no correlation */
+ return prandom_u32();
- value = net_random();
+ value = prandom_u32();
rho = (u64)state->rho + 1;
answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
state->last = answer;
return answer;
}
+/* loss_4state - 4-state model loss generator
+ * Generates losses according to the 4-state Markov chain adopted in
+ * the GI (General and Intuitive) loss model.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+ u32 rnd = prandom_u32();
+
+ /*
+ * Makes a comparison between rnd and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state and if the next packet has to be transmitted or lost.
+ * The four states correspond to:
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
+ * LOST_IN_BURST_PERIOD => isolated losses within a gap period
+ * LOST_IN_GAP_PERIOD => lost packets within a burst period
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
+ */
+ switch (clg->state) {
+ case TX_IN_GAP_PERIOD:
+ if (rnd < clg->a4) {
+ clg->state = LOST_IN_BURST_PERIOD;
+ return true;
+ } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else if (clg->a1 + clg->a4 < rnd) {
+ clg->state = TX_IN_GAP_PERIOD;
+ }
+
+ break;
+ case TX_IN_BURST_PERIOD:
+ if (rnd < clg->a5) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else {
+ clg->state = TX_IN_BURST_PERIOD;
+ }
+
+ break;
+ case LOST_IN_GAP_PERIOD:
+ if (rnd < clg->a3)
+ clg->state = TX_IN_BURST_PERIOD;
+ else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ clg->state = TX_IN_GAP_PERIOD;
+ } else if (clg->a2 + clg->a3 < rnd) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ }
+ break;
+ case LOST_IN_BURST_PERIOD:
+ clg->state = TX_IN_GAP_PERIOD;
+ break;
+ }
+
+ return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert)
+ *
+ * Makes a comparison between random number and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state. A second random number is extracted and the comparison
+ * with the loss probability of the current state decides if the next
+ * packet will be transmitted or lost.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+
+ switch (clg->state) {
+ case GOOD_STATE:
+ if (prandom_u32() < clg->a1)
+ clg->state = BAD_STATE;
+ if (prandom_u32() < clg->a4)
+ return true;
+ break;
+ case BAD_STATE:
+ if (prandom_u32() < clg->a2)
+ clg->state = GOOD_STATE;
+ if (prandom_u32() > clg->a3)
+ return true;
+ }
+
+ return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* Random packet drop 0 => none, ~0 => all */
+ return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+ case CLG_4_STATES:
+ /* 4state loss model algorithm (used also for GI model)
+ * Extracts a value from the markov 4 state loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_4state(q);
+
+ case CLG_GILB_ELL:
+ /* Gilbert-Elliot loss model algorithm
+ * Extracts a value from the Gilbert-Elliot loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_gilb_ell(q);
+ }
+
+ return false; /* not reached */
+}
+
+
/* tabledist - return a pseudo-randomly distributed value with mean mu and
* std deviation sigma. Uses table lookup to approximate the desired
* distribution, and a uniformly-distributed pseudo-random source.
*/
-static long tabledist(unsigned long mu, long sigma,
- struct crndstate *state, const struct disttable *dist)
+static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
+ struct crndstate *state,
+ const struct disttable *dist)
{
- long t, x;
- unsigned long rnd;
+ psched_tdiff_t x;
+ long t;
+ u32 rnd;
if (sigma == 0)
return mu;
@@ -124,7 +339,7 @@ static long tabledist(unsigned long mu, long sigma,
rnd = get_crandom(state);
/* default uniform distribution */
- if (dist == NULL)
+ if (dist == NULL)
return (rnd % (2*sigma)) - sigma + mu;
t = dist->table[rnd % dist->size];
@@ -137,6 +352,62 @@ static long tabledist(unsigned long mu, long sigma,
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
+static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
+{
+ u64 ticks;
+
+ len += q->packet_overhead;
+
+ if (q->cell_size) {
+ u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
+
+ if (len > cells * q->cell_size) /* extra cell needed for remainder */
+ cells++;
+ len = cells * (q->cell_size + q->cell_overhead);
+ }
+
+ ticks = (u64)len * NSEC_PER_SEC;
+
+ do_div(ticks, q->rate);
+ return PSCHED_NS2TICKS(ticks);
+}
+
+static void tfifo_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ while ((p = rb_first(&q->t_root))) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ skb->next = NULL;
+ skb->prev = NULL;
+ kfree_skb(skb);
+ }
+}
+
+static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = netem_rb_to_skb(parent);
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(netem_rb_node(nskb), parent, p);
+ rb_insert_color(netem_rb_node(nskb), &q->t_root);
+ sch->q.qlen++;
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -146,95 +417,148 @@ static long tabledist(unsigned long mu, long sigma,
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
+ /* We don't fill cb now as skb_unshare() may invalidate it */
+ struct netem_skb_cb *cb;
struct sk_buff *skb2;
- int ret;
int count = 1;
- pr_debug("netem_enqueue skb=%p\n", skb);
-
/* Random duplication */
if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
++count;
- /* Random packet drop 0 => none, ~0 => all */
- if (q->loss && q->loss >= get_crandom(&q->loss_cor))
- --count;
-
+ /* Drop packet? */
+ if (loss_event(q)) {
+ if (q->ecn && INET_ECN_set_ce(skb))
+ sch->qstats.drops++; /* mark packet */
+ else
+ --count;
+ }
if (count == 0) {
sch->qstats.drops++;
kfree_skb(skb);
- return NET_XMIT_DROP;
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
+ /* If a delay is expected, orphan the skb. (orphaning usually takes
+ * place at TX completion time, so _before_ the link transit delay)
+ */
+ if (q->latency || q->jitter)
+ skb_orphan_partial(skb);
+
/*
* If we need to duplicate packet, then re-insert at top of the
* qdisc tree, since parent queuer expects that only one
* skb will be queued.
*/
if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
- struct Qdisc *rootq = sch->dev->qdisc;
+ struct Qdisc *rootq = qdisc_root(sch);
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
- rootq->enqueue(skb2, rootq);
+ qdisc_enqueue_root(skb2, rootq);
q->duplicate = dupsave;
}
- if (q->gap == 0 /* not doing reordering */
- || q->counter < q->gap /* inside last reordering gap */
- || q->reorder < get_crandom(&q->reorder_cor)) {
+ /*
+ * Randomized packet corruption.
+ * Make copy if needed since we are modifying
+ * If packet is going to be hardware checksummed, then
+ * do it now in software before we mangle it.
+ */
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb)))
+ return qdisc_drop(skb, sch);
+
+ skb->data[prandom_u32() % skb_headlen(skb)] ^=
+ 1<<(prandom_u32() % 8);
+ }
+
+ if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ return qdisc_reshape_fail(skb, sch);
+
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+
+ cb = netem_skb_cb(skb);
+ if (q->gap == 0 || /* not doing reordering */
+ q->counter < q->gap - 1 || /* inside last reordering gap */
+ q->reorder < get_crandom(&q->reorder_cor)) {
psched_time_t now;
- PSCHED_GET_TIME(now);
- PSCHED_TADD2(now, tabledist(q->latency, q->jitter,
- &q->delay_cor, q->delay_dist),
- cb->time_to_send);
+ psched_tdiff_t delay;
+
+ delay = tabledist(q->latency, q->jitter,
+ &q->delay_cor, q->delay_dist);
+
+ now = psched_get_time();
+
+ if (q->rate) {
+ struct sk_buff *last;
+
+ if (!skb_queue_empty(&sch->q))
+ last = skb_peek_tail(&sch->q);
+ else
+ last = netem_rb_to_skb(rb_last(&q->t_root));
+ if (last) {
+ /*
+ * Last packet in queue is reference point (now),
+ * calculate this time bonus and subtract
+ * from delay.
+ */
+ delay -= netem_skb_cb(last)->time_to_send - now;
+ delay = max_t(psched_tdiff_t, 0, delay);
+ now = netem_skb_cb(last)->time_to_send;
+ }
+
+ delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
+ }
+
+ cb->time_to_send = now + delay;
+ cb->tstamp_save = skb->tstamp;
++q->counter;
- ret = q->qdisc->enqueue(skb, q->qdisc);
+ tfifo_enqueue(skb, sch);
} else {
- /*
+ /*
* Do re-ordering by putting one out of N packets at the front
* of the queue.
*/
- PSCHED_GET_TIME(cb->time_to_send);
+ cb->time_to_send = psched_get_time();
q->counter = 0;
- ret = q->qdisc->ops->requeue(skb, q->qdisc);
- }
-
- if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->q.qlen++;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
- } else
- sch->qstats.drops++;
-
- pr_debug("netem: enqueue ret %d\n", ret);
- return ret;
-}
-/* Requeue packets but don't change time stamp */
-static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
-{
- struct netem_sched_data *q = qdisc_priv(sch);
- int ret;
-
- if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
- sch->q.qlen++;
+ __skb_queue_head(&sch->q, skb);
sch->qstats.requeues++;
}
- return ret;
+ return NET_XMIT_SUCCESS;
}
-static unsigned int netem_drop(struct Qdisc* sch)
+static unsigned int netem_drop(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
unsigned int len;
- if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
- sch->q.qlen--;
- sch->qstats.drops++;
+ len = qdisc_queue_drop(sch);
+
+ if (!len) {
+ struct rb_node *p = rb_first(&q->t_root);
+
+ if (p) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= len;
+ kfree_skb(skb);
+ }
}
+ if (!len && q->qdisc && q->qdisc->ops->drop)
+ len = q->qdisc->ops->drop(q->qdisc);
+ if (len)
+ sch->qstats.drops++;
+
return len;
}
@@ -242,145 +566,297 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
+ struct rb_node *p;
+
+ if (qdisc_is_throttled(sch))
+ return NULL;
- skb = q->qdisc->dequeue(q->qdisc);
+tfifo_dequeue:
+ skb = __skb_dequeue(&sch->q);
if (skb) {
- const struct netem_skb_cb *cb
- = (const struct netem_skb_cb *)skb->cb;
- psched_time_t now;
- long delay;
+deliver:
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+ p = rb_first(&q->t_root);
+ if (p) {
+ psched_time_t time_to_send;
+
+ skb = netem_rb_to_skb(p);
/* if more time remaining? */
- PSCHED_GET_TIME(now);
- delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
- pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
- if (delay <= 0) {
- pr_debug("netem_dequeue: return skb=%p\n", skb);
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+ if (time_to_send <= psched_get_time()) {
+ rb_erase(p, &q->t_root);
+
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
- return skb;
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->tstamp = netem_skb_cb(skb)->tstamp_save;
+
+#ifdef CONFIG_NET_CLS_ACT
+ /*
+ * If it's at ingress let's pretend the delay is
+ * from the network (tstamp will be updated).
+ */
+ if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
+ skb->tstamp.tv64 = 0;
+#endif
+
+ if (q->qdisc) {
+ int err = qdisc_enqueue(skb, q->qdisc);
+
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ sch->qstats.drops++;
+ qdisc_tree_decrease_qlen(sch, 1);
+ }
+ }
+ goto tfifo_dequeue;
+ }
+ goto deliver;
}
- mod_timer(&q->timer, jiffies + delay);
- sch->flags |= TCQ_F_THROTTLED;
-
- if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
- sch->qstats.drops++;
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
+ qdisc_watchdog_schedule(&q->watchdog, time_to_send);
}
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
return NULL;
}
-static void netem_watchdog(unsigned long arg)
-{
- struct Qdisc *sch = (struct Qdisc *)arg;
-
- pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
- sch->flags &= ~TCQ_F_THROTTLED;
- netif_schedule(sch->dev);
-}
-
static void netem_reset(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- qdisc_reset(q->qdisc);
- sch->q.qlen = 0;
- sch->flags &= ~TCQ_F_THROTTLED;
- del_timer_sync(&q->timer);
+ qdisc_reset_queue(sch);
+ tfifo_reset(sch);
+ if (q->qdisc)
+ qdisc_reset(q->qdisc);
+ qdisc_watchdog_cancel(&q->watchdog);
}
-static int set_fifo_limit(struct Qdisc *q, int limit)
+static void dist_free(struct disttable *d)
{
- struct rtattr *rta;
- int ret = -ENOMEM;
-
- rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
- if (rta) {
- rta->rta_type = RTM_NEWQDISC;
- rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
- ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
-
- ret = q->ops->change(q, rta);
- kfree(rta);
- }
- return ret;
+ kvfree(d);
}
/*
* Distribution data is a variable size payload containing
* signed 16 bit values.
*/
-static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
+static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
struct netem_sched_data *q = qdisc_priv(sch);
- unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
- const __s16 *data = RTA_DATA(attr);
+ size_t n = nla_len(attr)/sizeof(__s16);
+ const __s16 *data = nla_data(attr);
+ spinlock_t *root_lock;
struct disttable *d;
int i;
+ size_t s;
- if (n > 65536)
+ if (n > NETEM_DIST_MAX)
return -EINVAL;
- d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
+ s = sizeof(struct disttable) + n * sizeof(s16);
+ d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
+ if (!d)
+ d = vmalloc(s);
if (!d)
return -ENOMEM;
d->size = n;
for (i = 0; i < n; i++)
d->table[i] = data[i];
-
- spin_lock_bh(&sch->dev->queue_lock);
- d = xchg(&q->delay_dist, d);
- spin_unlock_bh(&sch->dev->queue_lock);
- kfree(d);
+ root_lock = qdisc_root_sleeping_lock(sch);
+
+ spin_lock_bh(root_lock);
+ swap(q->delay_dist, d);
+ spin_unlock_bh(root_lock);
+
+ dist_free(d);
return 0;
}
-static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
+static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
- struct netem_sched_data *q = qdisc_priv(sch);
- const struct tc_netem_corr *c = RTA_DATA(attr);
-
- if (RTA_PAYLOAD(attr) != sizeof(*c))
- return -EINVAL;
+ const struct tc_netem_corr *c = nla_data(attr);
init_crandom(&q->delay_cor, c->delay_corr);
init_crandom(&q->loss_cor, c->loss_corr);
init_crandom(&q->dup_cor, c->dup_corr);
+}
+
+static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_reorder *r = nla_data(attr);
+
+ q->reorder = r->probability;
+ init_crandom(&q->reorder_cor, r->correlation);
+}
+
+static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_corrupt *r = nla_data(attr);
+
+ q->corrupt = r->probability;
+ init_crandom(&q->corrupt_cor, r->correlation);
+}
+
+static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_rate *r = nla_data(attr);
+
+ q->rate = r->rate;
+ q->packet_overhead = r->packet_overhead;
+ q->cell_size = r->cell_size;
+ q->cell_overhead = r->cell_overhead;
+ if (q->cell_size)
+ q->cell_size_reciprocal = reciprocal_value(q->cell_size);
+ else
+ q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
+}
+
+static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct nlattr *la;
+ int rem;
+
+ nla_for_each_nested(la, attr, rem) {
+ u16 type = nla_type(la);
+
+ switch (type) {
+ case NETEM_LOSS_GI: {
+ const struct tc_netem_gimodel *gi = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_4_STATES;
+
+ q->clg.state = TX_IN_GAP_PERIOD;
+ q->clg.a1 = gi->p13;
+ q->clg.a2 = gi->p31;
+ q->clg.a3 = gi->p32;
+ q->clg.a4 = gi->p14;
+ q->clg.a5 = gi->p23;
+ break;
+ }
+
+ case NETEM_LOSS_GE: {
+ const struct tc_netem_gemodel *ge = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect ge model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_GILB_ELL;
+ q->clg.state = GOOD_STATE;
+ q->clg.a1 = ge->p;
+ q->clg.a2 = ge->r;
+ q->clg.a3 = ge->h;
+ q->clg.a4 = ge->k1;
+ break;
+ }
+
+ default:
+ pr_info("netem: unknown loss type %u\n", type);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
+static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
+ [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
+ [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
+ [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
+ [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
+ [TCA_NETEM_ECN] = { .type = NLA_U32 },
+ [TCA_NETEM_RATE64] = { .type = NLA_U64 },
+};
+
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy, int len)
{
- struct netem_sched_data *q = qdisc_priv(sch);
- const struct tc_netem_reorder *r = RTA_DATA(attr);
+ int nested_len = nla_len(nla) - NLA_ALIGN(len);
- if (RTA_PAYLOAD(attr) != sizeof(*r))
+ if (nested_len < 0) {
+ pr_info("netem: invalid attributes len %d\n", nested_len);
return -EINVAL;
+ }
- q->reorder = r->probability;
- init_crandom(&q->reorder_cor, r->correlation);
+ if (nested_len >= nla_attr_size(0))
+ return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy);
+
+ memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
-static int netem_change(struct Qdisc *sch, struct rtattr *opt)
+/* Parse netlink message to set options */
+static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_NETEM_MAX + 1];
struct tc_netem_qopt *qopt;
+ struct clgstate old_clg;
+ int old_loss_model = CLG_RANDOM;
int ret;
-
- if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
+
+ if (opt == NULL)
return -EINVAL;
- qopt = RTA_DATA(opt);
- ret = set_fifo_limit(q->qdisc, qopt->limit);
- if (ret) {
- pr_debug("netem: can't set fifo limit\n");
+ qopt = nla_data(opt);
+ ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
+ if (ret < 0)
return ret;
+
+ /* backup q->clg and q->loss_model */
+ old_clg = q->clg;
+ old_loss_model = q->loss_model;
+
+ if (tb[TCA_NETEM_LOSS]) {
+ ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
+ if (ret) {
+ q->loss_model = old_loss_model;
+ return ret;
+ }
+ } else {
+ q->loss_model = CLG_RANDOM;
+ }
+
+ if (tb[TCA_NETEM_DELAY_DIST]) {
+ ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
+ if (ret) {
+ /* recover clg and loss_model, in case of
+ * q->clg and q->loss_model were modified
+ * in get_loss_clg()
+ */
+ q->clg = old_clg;
+ q->loss_model = old_loss_model;
+ return ret;
+ }
}
-
+
+ sch->limit = qopt->limit;
+
q->latency = qopt->latency;
q->jitter = qopt->jitter;
q->limit = qopt->limit;
@@ -389,44 +865,35 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
q->loss = qopt->loss;
q->duplicate = qopt->duplicate;
- /* for compatiablity with earlier versions.
- * if gap is set, need to assume 100% probablity
+ /* for compatibility with earlier versions.
+ * if gap is set, need to assume 100% probability
*/
- q->reorder = ~0;
-
- /* Handle nested options after initial queue options.
- * Should have put all options in nested format but too late now.
- */
- if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
- struct rtattr *tb[TCA_NETEM_MAX];
- if (rtattr_parse(tb, TCA_NETEM_MAX,
- RTA_DATA(opt) + sizeof(*qopt),
- RTA_PAYLOAD(opt) - sizeof(*qopt)))
- return -EINVAL;
+ if (q->gap)
+ q->reorder = ~0;
- if (tb[TCA_NETEM_CORR-1]) {
- ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
- if (ret)
- return ret;
- }
+ if (tb[TCA_NETEM_CORR])
+ get_correlation(q, tb[TCA_NETEM_CORR]);
- if (tb[TCA_NETEM_DELAY_DIST-1]) {
- ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
- if (ret)
- return ret;
- }
- if (tb[TCA_NETEM_REORDER-1]) {
- ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
- if (ret)
- return ret;
- }
- }
+ if (tb[TCA_NETEM_REORDER])
+ get_reorder(q, tb[TCA_NETEM_REORDER]);
+ if (tb[TCA_NETEM_CORRUPT])
+ get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
- return 0;
+ if (tb[TCA_NETEM_RATE])
+ get_rate(q, tb[TCA_NETEM_RATE]);
+
+ if (tb[TCA_NETEM_RATE64])
+ q->rate = max_t(u64, q->rate,
+ nla_get_u64(tb[TCA_NETEM_RATE64]));
+
+ if (tb[TCA_NETEM_ECN])
+ q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
+
+ return ret;
}
-static int netem_init(struct Qdisc *sch, struct rtattr *opt)
+static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
int ret;
@@ -434,21 +901,12 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
if (!opt)
return -EINVAL;
- init_timer(&q->timer);
- q->timer.function = netem_watchdog;
- q->timer.data = (unsigned long) sch;
-
- q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
- if (!q->qdisc) {
- pr_debug("netem: qdisc create failed\n");
- return -ENOMEM;
- }
+ qdisc_watchdog_init(&q->watchdog, sch);
+ q->loss_model = CLG_RANDOM;
ret = netem_change(sch, opt);
- if (ret) {
- pr_debug("netem: change failed\n");
- qdisc_destroy(q->qdisc);
- }
+ if (ret)
+ pr_info("netem: change failed\n");
return ret;
}
@@ -456,19 +914,71 @@ static void netem_destroy(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- del_timer_sync(&q->timer);
- qdisc_destroy(q->qdisc);
- kfree(q->delay_dist);
+ qdisc_watchdog_cancel(&q->watchdog);
+ if (q->qdisc)
+ qdisc_destroy(q->qdisc);
+ dist_free(q->delay_dist);
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* legacy loss model */
+ nla_nest_cancel(skb, nest);
+ return 0; /* no data */
+
+ case CLG_4_STATES: {
+ struct tc_netem_gimodel gi = {
+ .p13 = q->clg.a1,
+ .p31 = q->clg.a2,
+ .p32 = q->clg.a3,
+ .p14 = q->clg.a4,
+ .p23 = q->clg.a5,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
+ goto nla_put_failure;
+ break;
+ }
+ case CLG_GILB_ELL: {
+ struct tc_netem_gemodel ge = {
+ .p = q->clg.a1,
+ .r = q->clg.a2,
+ .h = q->clg.a3,
+ .k1 = q->clg.a4,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
+ goto nla_put_failure;
+ break;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
const struct netem_sched_data *q = qdisc_priv(sch);
- unsigned char *b = skb->tail;
- struct rtattr *rta = (struct rtattr *) b;
+ struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
+ struct tc_netem_corrupt corrupt;
+ struct tc_netem_rate rate;
qopt.latency = q->latency;
qopt.jitter = q->jitter;
@@ -476,23 +986,48 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
qopt.loss = q->loss;
qopt.gap = q->gap;
qopt.duplicate = q->duplicate;
- RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
cor.delay_corr = q->delay_cor.rho;
cor.loss_corr = q->loss_cor.rho;
cor.dup_corr = q->dup_cor.rho;
- RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+ if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
+ goto nla_put_failure;
reorder.probability = q->reorder;
reorder.correlation = q->reorder_cor.rho;
- RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+ if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
+ goto nla_put_failure;
+
+ corrupt.probability = q->corrupt;
+ corrupt.correlation = q->corrupt_cor.rho;
+ if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
+ goto nla_put_failure;
+
+ if (q->rate >= (1ULL << 32)) {
+ if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
+ goto nla_put_failure;
+ rate.rate = ~0U;
+ } else {
+ rate.rate = q->rate;
+ }
+ rate.packet_overhead = q->packet_overhead;
+ rate.cell_size = q->cell_size;
+ rate.cell_overhead = q->cell_overhead;
+ if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
+ goto nla_put_failure;
- rta->rta_len = skb->tail - b;
+ if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
+ goto nla_put_failure;
- return skb->len;
+ if (dump_loss_model(q, skb) != 0)
+ goto nla_put_failure;
-rtattr_failure:
- skb_trim(skb, b - skb->data);
+ return nla_nest_end(skb, nla);
+
+nla_put_failure:
+ nlmsg_trim(skb, nla);
return -1;
}
@@ -501,7 +1036,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
{
struct netem_sched_data *q = qdisc_priv(sch);
- if (cl != 1) /* only one class */
+ if (cl != 1 || !q->qdisc) /* only one class */
return -ENOENT;
tcm->tcm_handle |= TC_H_MIN(1);
@@ -515,13 +1050,13 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
{
struct netem_sched_data *q = qdisc_priv(sch);
- if (new == NULL)
- new = &noop_qdisc;
-
sch_tree_lock(sch);
- *old = xchg(&q->qdisc, new);
- qdisc_reset(*old);
- sch->q.qlen = 0;
+ *old = q->qdisc;
+ q->qdisc = new;
+ if (*old) {
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ }
sch_tree_unlock(sch);
return 0;
@@ -542,17 +1077,6 @@ static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}
-static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
- struct rtattr **tca, unsigned long *arg)
-{
- return -ENOSYS;
-}
-
-static int netem_delete(struct Qdisc *sch, unsigned long arg)
-{
- return -ENOSYS;
-}
-
static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
@@ -565,30 +1089,22 @@ static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
}
}
-static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
-{
- return NULL;
-}
-
-static struct Qdisc_class_ops netem_class_ops = {
+static const struct Qdisc_class_ops netem_class_ops = {
.graft = netem_graft,
.leaf = netem_leaf,
.get = netem_get,
.put = netem_put,
- .change = netem_change_class,
- .delete = netem_delete,
.walk = netem_walk,
- .tcf_chain = netem_find_tcf,
.dump = netem_dump_class,
};
-static struct Qdisc_ops netem_qdisc_ops = {
+static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.id = "netem",
.cl_ops = &netem_class_ops,
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
- .requeue = netem_requeue,
+ .peek = qdisc_peek_dequeued,
.drop = netem_drop,
.init = netem_init,
.reset = netem_reset,
@@ -601,6 +1117,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
static int __init netem_module_init(void)
{
+ pr_info("netem: version " VERSION "\n");
return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)