aboutsummaryrefslogtreecommitdiff
path: root/net/dccp
diff options
context:
space:
mode:
Diffstat (limited to 'net/dccp')
-rw-r--r--net/dccp/Kconfig8
-rw-r--r--net/dccp/Makefile4
-rw-r--r--net/dccp/ackvec.c617
-rw-r--r--net/dccp/ackvec.h161
-rw-r--r--net/dccp/ccid.c7
-rw-r--r--net/dccp/ccid.h108
-rw-r--r--net/dccp/ccids/Kconfig36
-rw-r--r--net/dccp/ccids/ccid2.c594
-rw-r--r--net/dccp/ccids/ccid2.h73
-rw-r--r--net/dccp/ccids/ccid3.c298
-rw-r--r--net/dccp/ccids/ccid3.h51
-rw-r--r--net/dccp/ccids/lib/loss_interval.c3
-rw-r--r--net/dccp/ccids/lib/loss_interval.h8
-rw-r--r--net/dccp/ccids/lib/packet_history.c42
-rw-r--r--net/dccp/ccids/lib/packet_history.h47
-rw-r--r--net/dccp/ccids/lib/tfrc.c3
-rw-r--r--net/dccp/ccids/lib/tfrc.h23
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c16
-rw-r--r--net/dccp/dccp.h265
-rw-r--r--net/dccp/diag.c20
-rw-r--r--net/dccp/feat.c233
-rw-r--r--net/dccp/feat.h26
-rw-r--r--net/dccp/input.c159
-rw-r--r--net/dccp/ipv4.c175
-rw-r--r--net/dccp/ipv6.c357
-rw-r--r--net/dccp/ipv6.h2
-rw-r--r--net/dccp/minisocks.c67
-rw-r--r--net/dccp/options.c184
-rw-r--r--net/dccp/output.c293
-rw-r--r--net/dccp/probe.c17
-rw-r--r--net/dccp/proto.c179
-rw-r--r--net/dccp/qpolicy.c137
-rw-r--r--net/dccp/sysctl.c18
-rw-r--r--net/dccp/timer.c34
34 files changed, 2334 insertions, 1931 deletions
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index ad6dffd9070..8c0ef71bed2 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -1,6 +1,6 @@
menuconfig IP_DCCP
- tristate "The DCCP Protocol (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "The DCCP Protocol"
+ depends on INET
---help---
Datagram Congestion Control Protocol (RFC 4340)
@@ -49,7 +49,9 @@ config NET_DCCPPROBE
what was just said, you don't need it: say N.
Documentation on how to use DCCP connection probing can be found
- at http://linux-net.osdl.org/index.php/DccpProbe
+ at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
To compile this code as a module, choose M here: the
module will be called dccp_probe.
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2991efcc8de..5c8362b037e 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
-
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+ qpolicy.o
#
# CCID algorithms to be used by dccp.ko
#
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 01e4d39fa23..ba07824af4c 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,444 +1,376 @@
/*
* net/dccp/ackvec.c
*
- * An implementation of the DCCP protocol
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License;
*/
-
-#include "ackvec.h"
#include "dccp.h"
-
-#include <linux/init.h>
-#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/skbuff.h>
#include <linux/slab.h>
-
-#include <net/sock.h>
+#include <linux/export.h>
static struct kmem_cache *dccp_ackvec_slab;
static struct kmem_cache *dccp_ackvec_record_slab;
-static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
{
- struct dccp_ackvec_record *avr =
- kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+ struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
- if (avr != NULL)
- INIT_LIST_HEAD(&avr->avr_node);
-
- return avr;
+ if (av != NULL) {
+ av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
+ INIT_LIST_HEAD(&av->av_records);
+ }
+ return av;
}
-static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
{
- if (unlikely(avr == NULL))
- return;
- /* Check if deleting a linked record */
- WARN_ON(!list_empty(&avr->avr_node));
- kmem_cache_free(dccp_ackvec_record_slab, avr);
+ struct dccp_ackvec_record *cur, *next;
+
+ list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+ kmem_cache_free(dccp_ackvec_record_slab, cur);
+ INIT_LIST_HEAD(&av->av_records);
}
-static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
+void dccp_ackvec_free(struct dccp_ackvec *av)
{
- /*
- * AVRs are sorted by seqno. Since we are sending them in order, we
- * just add the AVR at the head of the list.
- * -sorbo.
- */
- if (!list_empty(&av->av_records)) {
- const struct dccp_ackvec_record *head =
- list_entry(av->av_records.next,
- struct dccp_ackvec_record,
- avr_node);
- BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
+ if (likely(av != NULL)) {
+ dccp_ackvec_purge_records(av);
+ kmem_cache_free(dccp_ackvec_slab, av);
}
-
- list_add(&avr->avr_node, &av->av_records);
}
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_ackvec_update_records - Record information about sent Ack Vectors
+ * @av: Ack Vector records to update
+ * @seqno: Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
- /* Figure out how many options do we need to represent the ackvec */
- const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
- u16 len = av->av_vec_len + 2 * nr_opts, i;
- u32 elapsed_time;
- const unsigned char *tail, *from;
- unsigned char *to;
struct dccp_ackvec_record *avr;
- suseconds_t delta;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- delta = ktime_us_delta(ktime_get_real(), av->av_time);
- elapsed_time = delta / 10;
- if (elapsed_time != 0 &&
- dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
- return -1;
-
- avr = dccp_ackvec_record_new();
+ avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
if (avr == NULL)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- len = av->av_vec_len;
- from = av->av_buf + av->av_buf_head;
- tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
-
- for (i = 0; i < nr_opts; ++i) {
- int copylen = len;
-
- if (len > DCCP_SINGLE_OPT_MAXLEN)
- copylen = DCCP_SINGLE_OPT_MAXLEN;
-
- *to++ = DCCPO_ACK_VECTOR_0;
- *to++ = copylen + 2;
-
- /* Check if buf_head wraps */
- if (from + copylen > tail) {
- const u16 tailsize = tail - from;
-
- memcpy(to, from, tailsize);
- to += tailsize;
- len -= tailsize;
- copylen -= tailsize;
- from = av->av_buf;
- }
-
- memcpy(to, from, copylen);
- from += copylen;
- to += copylen;
- len -= copylen;
- }
+ return -ENOBUFS;
+ avr->avr_ack_seqno = seqno;
+ avr->avr_ack_ptr = av->av_buf_head;
+ avr->avr_ack_ackno = av->av_buf_ackno;
+ avr->avr_ack_nonce = nonce_sum;
+ avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
/*
- * From RFC 4340, A.2:
- *
- * For each acknowledgement it sends, the HC-Receiver will add an
- * acknowledgement record. ack_seqno will equal the HC-Receiver
- * sequence number it used for the ack packet; ack_ptr will equal
- * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
- * equal buf_nonce.
+ * When the buffer overflows, we keep no more than one record. This is
+ * the simplest way of disambiguating sender-Acks dating from before the
+ * overflow from sender-Acks which refer to after the overflow; a simple
+ * solution is preferable here since we are handling an exception.
*/
- avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- avr->avr_ack_ptr = av->av_buf_head;
- avr->avr_ack_ackno = av->av_buf_ackno;
- avr->avr_ack_nonce = av->av_buf_nonce;
- avr->avr_sent_len = av->av_vec_len;
-
- dccp_ackvec_insert_avr(av, avr);
+ if (av->av_overflow)
+ dccp_ackvec_purge_records(av);
+ /*
+ * Since GSS is incremented for each packet, the list is automatically
+ * arranged in descending order of @ack_seqno.
+ */
+ list_add(&avr->avr_node, &av->av_records);
- dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu\n",
- dccp_role(sk), avr->avr_sent_len,
+ dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
(unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno);
+ (unsigned long long)avr->avr_ack_ackno,
+ avr->avr_ack_runlen);
return 0;
}
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+ const u64 ackno)
{
- struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
-
- if (av != NULL) {
- av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1;
- av->av_buf_ackno = UINT48_MAX + 1;
- av->av_buf_nonce = 0;
- av->av_time = ktime_set(0, 0);
- av->av_vec_len = 0;
- INIT_LIST_HEAD(&av->av_records);
+ struct dccp_ackvec_record *avr;
+ /*
+ * Exploit that records are inserted in descending order of sequence
+ * number, start with the oldest record first. If @ackno is `before'
+ * the earliest ack_ackno, the packet is too old to be considered.
+ */
+ list_for_each_entry_reverse(avr, av_list, avr_node) {
+ if (avr->avr_ack_seqno == ackno)
+ return avr;
+ if (before48(ackno, avr->avr_ack_seqno))
+ break;
}
-
- return av;
+ return NULL;
}
-void dccp_ackvec_free(struct dccp_ackvec *av)
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
{
- if (unlikely(av == NULL))
- return;
-
- if (!list_empty(&av->av_records)) {
- struct dccp_ackvec_record *avr, *next;
-
- list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
- list_del_init(&avr->avr_node);
- dccp_ackvec_record_delete(avr);
- }
- }
-
- kmem_cache_free(dccp_ackvec_slab, av);
+ return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
}
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
- const u32 index)
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
{
- return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
+ return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
}
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
- const u32 index)
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
{
- return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
+ if (unlikely(av->av_overflow))
+ return DCCPAV_MAX_ACKVEC_LEN;
+ return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
}
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
+/**
+ * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
+ * @av: non-empty buffer to update
+ * @distance: negative or zero distance of @seqno from buf_ackno downward
+ * @seqno: the (old) sequence number whose record is to be updated
+ * @state: state in which packet carrying @seqno was received
*/
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
- const unsigned int packets,
- const unsigned char state)
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+ u64 seqno, enum dccp_ackvec_states state)
{
- unsigned int gap;
- long new_head;
+ u16 ptr = av->av_buf_head;
- if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
- return -ENOBUFS;
+ BUG_ON(distance > 0);
+ if (unlikely(dccp_ackvec_is_empty(av)))
+ return;
- gap = packets - 1;
- new_head = av->av_buf_head - packets;
+ do {
+ u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
- if (new_head < 0) {
- if (gap > 0) {
- memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
- gap + new_head + 1);
- gap = -new_head;
+ if (distance + runlen >= 0) {
+ /*
+ * Only update the state if packet has not been received
+ * yet. This is OK as per the second table in RFC 4340,
+ * 11.4.1; i.e. here we are using the following table:
+ * RECEIVED
+ * 0 1 3
+ * S +---+---+---+
+ * T 0 | 0 | 0 | 0 |
+ * O +---+---+---+
+ * R 1 | 1 | 1 | 1 |
+ * E +---+---+---+
+ * D 3 | 0 | 1 | 3 |
+ * +---+---+---+
+ * The "Not Received" state was set by reserve_seats().
+ */
+ if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+ av->av_buf[ptr] = state;
+ else
+ dccp_pr_debug("Not changing %llu state to %u\n",
+ (unsigned long long)seqno, state);
+ break;
}
- new_head += DCCP_MAX_ACKVEC_LEN;
- }
- av->av_buf_head = new_head;
+ distance += runlen + 1;
+ ptr = __ackvec_idx_add(ptr, 1);
- if (gap > 0)
- memset(av->av_buf + av->av_buf_head + 1,
- DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+ } while (ptr != av->av_buf_tail);
+}
- av->av_buf[av->av_buf_head] = state;
- av->av_vec_len += packets;
- return 0;
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+ u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+ len = DCCPAV_MAX_ACKVEC_LEN - start;
+
+ /* check for buffer wrap-around */
+ if (num > len) {
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+ start = 0;
+ num -= len;
+ }
+ if (num)
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
}
-/*
- * Implements the RFC 4340, Appendix A
+/**
+ * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
+ * @av: container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno: sequence number of the first packet in @num_packets
+ * @state: state in which packet carrying @seqno was received
*/
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+ u64 seqno, enum dccp_ackvec_states state)
{
- /*
- * Check at the right places if the buffer is full, if it is, tell the
- * caller to start dropping packets till the HC-Sender acks our ACK
- * vectors, when we will free up space in av_buf.
- *
- * We may well decide to do buffer compression, etc, but for now lets
- * just drop.
- *
- * From Appendix A.1.1 (`New Packets'):
- *
- * Of course, the circular buffer may overflow, either when the
- * HC-Sender is sending data at a very high rate, when the
- * HC-Receiver's acknowledgements are not reaching the HC-Sender,
- * or when the HC-Sender is forgetting to acknowledge those acks
- * (so the HC-Receiver is unable to clean up old state). In this
- * case, the HC-Receiver should either compress the buffer (by
- * increasing run lengths when possible), transfer its state to
- * a larger buffer, or, as a last resort, drop all received
- * packets, without processing them whatsoever, until its buffer
- * shrinks again.
- */
+ u32 num_cells = num_packets;
- /* See if this is the first ackno being inserted */
- if (av->av_vec_len == 0) {
- av->av_buf[av->av_buf_head] = state;
- av->av_vec_len = 1;
- } else if (after48(ackno, av->av_buf_ackno)) {
- const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
+ if (num_packets > DCCPAV_BURST_THRESH) {
+ u32 lost_packets = num_packets - 1;
+ DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
/*
- * Look if the state of this packet is the same as the
- * previous ackno and if so if we can bump the head len.
+ * We received 1 packet and have a loss of size "num_packets-1"
+ * which we squeeze into num_cells-1 rather than reserving an
+ * entire byte for each lost packet.
+ * The reason is that the vector grows in O(burst_length); when
+ * it grows too large there will no room left for the payload.
+ * This is a trade-off: if a few packets out of the burst show
+ * up later, their state will not be changed; it is simply too
+ * costly to reshuffle/reallocate/copy the buffer each time.
+ * Should such problems persist, we will need to switch to a
+ * different underlying data structure.
*/
- if (delta == 1 &&
- dccp_ackvec_state(av, av->av_buf_head) == state &&
- dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
- av->av_buf[av->av_buf_head]++;
- else if (dccp_ackvec_set_buf_head_state(av, delta, state))
- return -ENOBUFS;
- } else {
- /*
- * A.1.2. Old Packets
- *
- * When a packet with Sequence Number S <= buf_ackno
- * arrives, the HC-Receiver will scan the table for
- * the byte corresponding to S. (Indexing structures
- * could reduce the complexity of this scan.)
- */
- u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
- u32 index = av->av_buf_head;
+ for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+ u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
- while (1) {
- const u8 len = dccp_ackvec_len(av, index);
- const u8 av_state = dccp_ackvec_state(av, index);
- /*
- * valid packets not yet in av_buf have a reserved
- * entry, with a len equal to 0.
- */
- if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
- len == 0 && delta == 0) { /* Found our
- reserved seat! */
- dccp_pr_debug("Found %llu reserved seat!\n",
- (unsigned long long)ackno);
- av->av_buf[index] = state;
- goto out;
- }
- /* len == 0 means one packet */
- if (delta < len + 1)
- goto out_duplicate;
-
- delta -= len + 1;
- if (++index == DCCP_MAX_ACKVEC_LEN)
- index = 0;
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+ av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+ lost_packets -= len;
}
}
- av->av_buf_ackno = ackno;
- av->av_time = ktime_get_real();
-out:
- return 0;
+ if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+ DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+ av->av_overflow = true;
+ }
+
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+ if (av->av_overflow)
+ av->av_buf_tail = av->av_buf_head;
-out_duplicate:
- /* Duplicate packet */
- dccp_pr_debug("Received a dup or already considered lost "
- "packet: %llu\n", (unsigned long long)ackno);
- return -EILSEQ;
+ av->av_buf[av->av_buf_head] = state;
+ av->av_buf_ackno = seqno;
+
+ if (num_packets > 1)
+ dccp_ackvec_reserve_seats(av, num_packets - 1);
}
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
+/**
+ * dccp_ackvec_input - Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
{
- struct dccp_ackvec_record *next;
+ u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ enum dccp_ackvec_states state = DCCPAV_RECEIVED;
- /* sort out vector length */
- if (av->av_buf_head <= avr->avr_ack_ptr)
- av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
- else
- av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
- av->av_buf_head + avr->avr_ack_ptr;
+ if (dccp_ackvec_is_empty(av)) {
+ dccp_ackvec_add_new(av, 1, seqno, state);
+ av->av_tail_ackno = seqno;
- /* free records */
- list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
- list_del_init(&avr->avr_node);
- dccp_ackvec_record_delete(avr);
- }
-}
+ } else {
+ s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+ u8 *current_head = av->av_buf + av->av_buf_head;
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
- const u64 ackno)
-{
- struct dccp_ackvec_record *avr;
+ if (num_packets == 1 &&
+ dccp_ackvec_state(current_head) == state &&
+ dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
- /*
- * If we traverse backwards, it should be faster when we have large
- * windows. We will be receiving ACKs for stuff we sent a while back
- * -sorbo.
- */
- list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
- if (ackno == avr->avr_ack_seqno) {
- dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu, ACKED!\n",
- dccp_role(sk), 1,
- (unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- } else if (avr->avr_ack_seqno > ackno)
- break; /* old news */
+ *current_head += 1;
+ av->av_buf_ackno = seqno;
+
+ } else if (num_packets > 0) {
+ dccp_ackvec_add_new(av, num_packets, seqno, state);
+ } else {
+ dccp_ackvec_update_old(av, num_packets, seqno, state);
+ }
}
}
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
- struct sock *sk, u64 *ackno,
- const unsigned char len,
- const unsigned char *vector)
+/**
+ * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
{
- unsigned char i;
- struct dccp_ackvec_record *avr;
+ struct dccp_ackvec_record *avr, *next;
+ u8 runlen_now, eff_runlen;
+ s64 delta;
- /* Check if we actually sent an ACK vector */
- if (list_empty(&av->av_records))
+ avr = dccp_ackvec_lookup(&av->av_records, ackno);
+ if (avr == NULL)
return;
+ /*
+ * Deal with outdated acknowledgments: this arises when e.g. there are
+ * several old records and the acks from the peer come in slowly. In
+ * that case we may still have records that pre-date tail_ackno.
+ */
+ delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+ if (delta < 0)
+ goto free_records;
+ /*
+ * Deal with overlapping Ack Vectors: don't subtract more than the
+ * number of packets between tail_ackno and ack_ackno.
+ */
+ eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
- i = len;
+ runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
/*
- * XXX
- * I think it might be more efficient to work backwards. See comment on
- * rcv_ackno. -sorbo.
+ * The run length of Ack Vector cells does not decrease over time. If
+ * the run length is the same as at the time the Ack Vector was sent, we
+ * free the ack_ptr cell. That cell can however not be freed if the run
+ * length has increased: in this case we need to move the tail pointer
+ * backwards (towards higher indices), to its next-oldest neighbour.
*/
- avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
- while (i--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl;
+ if (runlen_now > eff_runlen) {
- dccp_set_seqno(&ackno_end_rl, *ackno - rl);
+ av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+ av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+ /* This move may not have cleared the overflow flag. */
+ if (av->av_overflow)
+ av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+ } else {
+ av->av_buf_tail = avr->avr_ack_ptr;
/*
- * If our AVR sequence number is greater than the ack, go
- * forward in the AVR list until it is not so.
+ * We have made sure that avr points to a valid cell within the
+ * buffer. This cell is either older than head, or equals head
+ * (empty buffer): in both cases we no longer have any overflow.
*/
- list_for_each_entry_from(avr, &av->av_records, avr_node) {
- if (!after48(avr->avr_ack_seqno, *ackno))
- goto found;
- }
- /* End of the av_records list, not found, exit */
- break;
-found:
- if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
- const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
- dccp_pr_debug("%s ACK vector 0, len=%d, "
- "ack_seqno=%llu, ack_ackno=%llu, "
- "ACKED!\n",
- dccp_role(sk), len,
- (unsigned long long)
- avr->avr_ack_seqno,
- (unsigned long long)
- avr->avr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- }
- /*
- * If it wasn't received, continue scanning... we might
- * find another one.
- */
- }
+ av->av_overflow = 0;
+ }
- dccp_set_seqno(ackno, ackno_end_rl - 1);
- ++vector;
+ /*
+ * The peer has acknowledged up to and including ack_ackno. Hence the
+ * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+ */
+ av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+ list_del(&avr->avr_node);
+ kmem_cache_free(dccp_ackvec_record_slab, avr);
}
}
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt, const u8 *value, const u8 len)
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
{
- if (len > DCCP_SINGLE_OPT_MAXLEN)
- return -1;
+ struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+ if (new == NULL)
+ return -ENOBUFS;
+ new->vec = vec;
+ new->len = len;
+ new->nonce = nonce;
- /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
- dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
- ackno, len, value);
+ list_add_tail(&new->node, head);
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+ struct dccp_ackvec_parsed *cur, *next;
+
+ list_for_each_entry_safe(cur, next, parsed_chunks, node)
+ kfree(cur);
+ INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
int __init dccp_ackvec_init(void)
{
@@ -448,10 +380,9 @@ int __init dccp_ackvec_init(void)
if (dccp_ackvec_slab == NULL)
goto out_err;
- dccp_ackvec_record_slab =
- kmem_cache_create("dccp_ackvec_record",
- sizeof(struct dccp_ackvec_record),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+ sizeof(struct dccp_ackvec_record),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (dccp_ackvec_record_slab == NULL)
goto out_destroy_slab;
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 7ea557b7c6b..3284bfa988c 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,9 +3,9 @@
/*
* net/dccp/ackvec.h
*
- * An implementation of the DCCP protocol
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
@@ -13,99 +13,126 @@
#include <linux/dccp.h>
#include <linux/compiler.h>
-#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/types.h>
-/* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
+ */
+#define DCCPAV_NUM_ACKVECS 2
+#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
/* Estimated minimum average Ack Vector length - used for updating MPS */
#define DCCPAV_MIN_OPTLEN 16
-#define DCCP_ACKVEC_STATE_RECEIVED 0
-#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
-#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
+enum dccp_ackvec_states {
+ DCCPAV_RECEIVED = 0x00,
+ DCCPAV_ECN_MARKED = 0x40,
+ DCCPAV_RESERVED = 0x80,
+ DCCPAV_NOT_RECEIVED = 0xC0
+};
+#define DCCPAV_MAX_RUNLEN 0x3F
-/** struct dccp_ackvec - ack vector
- *
- * This data structure is the one defined in RFC 4340, Appendix A.
- *
- * @av_buf_head - circular buffer head
- * @av_buf_tail - circular buffer tail
- * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
- * buffer (i.e. %av_buf_head)
- * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- * by the buffer with State 0
- *
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+ return *cell & DCCPAV_MAX_RUNLEN;
+}
+
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+ return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/**
+ * struct dccp_ackvec - Ack Vector main data structure
*
- * @av_records - list of dccp_ackvec_record
- * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
*
- * @av_time - the time in usecs
- * @av_buf - circular buffer of acknowledgeable packets
+ * @av_buf: circular buffer storage area
+ * @av_buf_head: head index; begin of live portion in @av_buf
+ * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
+ * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
+ * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
*/
struct dccp_ackvec {
- u64 av_buf_ackno;
- struct list_head av_records;
- ktime_t av_time;
+ u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
u16 av_buf_head;
- u16 av_vec_len;
- u8 av_buf_nonce;
- u8 av_ack_nonce;
- u8 av_buf[DCCP_MAX_ACKVEC_LEN];
+ u16 av_buf_tail;
+ u64 av_buf_ackno:48;
+ u64 av_tail_ackno:48;
+ bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
+ u8 av_overflow:1;
+ struct list_head av_records;
};
-/** struct dccp_ackvec_record - ack vector record
+/**
+ * struct dccp_ackvec_record - Records information about sent Ack Vectors
*
- * ACK vector record as defined in Appendix A of spec.
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
*
- * The list is sorted by avr_ack_seqno
+ * @avr_node: the list node in @av_records
+ * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr: pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
*
- * @avr_node - node in av_records
- * @avr_ack_seqno - sequence number of the packet this record was sent on
- * @avr_ack_ackno - sequence number being acknowledged
- * @avr_ack_ptr - pointer into av_buf where this record starts
- * @avr_ack_nonce - av_ack_nonce at the time this record was sent
- * @avr_sent_len - lenght of the record in av_buf
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
*/
struct dccp_ackvec_record {
struct list_head avr_node;
- u64 avr_ack_seqno;
- u64 avr_ack_ackno;
+ u64 avr_ack_seqno:48;
+ u64 avr_ack_ackno:48;
u16 avr_ack_ptr;
- u16 avr_sent_len;
- u8 avr_ack_nonce;
+ u8 avr_ack_runlen;
+ u8 avr_ack_nonce:1;
};
-struct sock;
-struct sk_buff;
-
-extern int dccp_ackvec_init(void);
-extern void dccp_ackvec_exit(void);
-
-extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
-extern void dccp_ackvec_free(struct dccp_ackvec *av);
-
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state);
+int dccp_ackvec_init(void);
+void dccp_ackvec_exit(void);
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt,
- const u8 *value, const u8 len);
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
+void dccp_ackvec_free(struct dccp_ackvec *av);
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
{
- return av->av_vec_len;
+ return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
}
+
+/**
+ * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
+ * @vec: start of vector (offset into skb)
+ * @len: length of @vec
+ * @nonce: whether @vec had an ECN nonce of 0 or 1
+ * @node: FIFO - arranged in descending order of ack_ackno
+ *
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+ u8 *vec,
+ len,
+ nonce:1;
+ struct list_head node;
+};
+
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 49d27c556be..597557254dd 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -11,6 +11,8 @@
* published by the Free Software Foundation.
*/
+#include <linux/slab.h>
+
#include "ccid.h"
#include "ccids/lib/tfrc.h"
@@ -44,6 +46,7 @@ bool ccid_support_check(u8 const *ccid_array, u8 array_len)
* ccid_get_builtin_ccids - Populate a list of built-in CCIDs
* @ccid_array: pointer to copy into
* @array_len: value to return length into
+ *
* This function allocates memory - caller must see that it is freed after use.
*/
int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
@@ -116,7 +119,7 @@ static int ccid_activate(struct ccid_operations *ccid_ops)
if (ccid_ops->ccid_hc_tx_slab == NULL)
goto out_free_rx_slab;
- pr_info("CCID: Activated CCID %d (%s)\n",
+ pr_info("DCCP: Activated CCID %d (%s)\n",
ccid_ops->ccid_id, ccid_ops->ccid_name);
err = 0;
out:
@@ -134,7 +137,7 @@ static void ccid_deactivate(struct ccid_operations *ccid_ops)
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
ccid_ops->ccid_hc_rx_slab = NULL;
- pr_info("CCID: Deactivated CCID %d (%s)\n",
+ pr_info("DCCP: Deactivated CCID %d (%s)\n",
ccid_ops->ccid_id, ccid_ops->ccid_name);
}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 6df6f8ac963..6eb837a47b5 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -62,22 +62,18 @@ struct ccid_operations {
void (*ccid_hc_tx_exit)(struct sock *sk);
void (*ccid_hc_rx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_rx_insert_options)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_tx_send_packet)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_sent)(struct sock *sk,
- int more, unsigned int len);
+ unsigned int len);
void (*ccid_hc_rx_get_info)(struct sock *sk,
struct tcp_info *info);
void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -97,8 +93,8 @@ extern struct ccid_operations ccid2_ops;
extern struct ccid_operations ccid3_ops;
#endif
-extern int ccid_initialize_builtins(void);
-extern void ccid_cleanup_builtins(void);
+int ccid_initialize_builtins(void);
+void ccid_cleanup_builtins(void);
struct ccid {
struct ccid_operations *ccid_ops;
@@ -110,12 +106,12 @@ static inline void *ccid_priv(const struct ccid *ccid)
return (void *)ccid->ccid_priv;
}
-extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
-extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
-extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
- char __user *, int __user *);
+bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *, int __user *);
-extern struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
{
@@ -135,23 +131,51 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
return ccid->ccid_ops->ccid_id;
}
-extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
-extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+ CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
+ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
+ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
+ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
+ CCID_PACKET_ERR = 0xF0000, /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+ if (return_code < 0)
+ return CCID_PACKET_ERR;
+ if (return_code == 0)
+ return CCID_PACKET_SEND_AT_ONCE;
+ if (return_code <= CCID_PACKET_DELAY_MAX)
+ return CCID_PACKET_DELAY;
+ return return_code;
+}
static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
- int rc = 0;
if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
- return rc;
+ return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+ return CCID_PACKET_SEND_AT_ONCE;
}
static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- int more, unsigned int len)
+ unsigned int len)
{
if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
+ ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
}
static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -168,27 +192,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
}
+/**
+ * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
- value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
}
+/**
+ * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
}
static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
@@ -218,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
- if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
@@ -229,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
- if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 8408398cd44..8ba3fc9d6d1 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,5 +1,4 @@
-menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+menu "DCCP CCIDs Configuration"
config IP_DCCP_CCID2_DEBUG
bool "CCID-2 debugging messages"
@@ -12,7 +11,7 @@ config IP_DCCP_CCID2_DEBUG
If in doubt, say N.
config IP_DCCP_CCID3
- bool "CCID-3 (TCP-Friendly) (EXPERIMENTAL)"
+ bool "CCID-3 (TCP-Friendly)"
def_bool y if (IP_DCCP = y || IP_DCCP = m)
---help---
CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
@@ -47,37 +46,6 @@ config IP_DCCP_CCID3_DEBUG
If in doubt, say N.
-config IP_DCCP_CCID3_RTO
- int "Use higher bound for nofeedback timer"
- default 100
- depends on IP_DCCP_CCID3 && EXPERIMENTAL
- ---help---
- Use higher lower bound for nofeedback timer expiration.
-
- The TFRC nofeedback timer normally expires after the maximum of 4
- RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
- with a small RTT this can mean a high processing load and reduced
- performance, since then the nofeedback timer is triggered very
- frequently.
-
- This option enables to set a higher lower bound for the nofeedback
- value. Values in units of milliseconds can be set here.
-
- A value of 0 disables this feature by enforcing the value specified
- in RFC 3448. The following values have been suggested as bounds for
- experimental use:
- * 16-20ms to match the typical multimedia inter-frame interval
- * 100ms as a reasonable compromise [default]
- * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
-
- The default of 100ms is a compromise between a large value for
- efficient DCCP implementations, and a small value to avoid disrupting
- the network in times of congestion.
-
- The purpose of the nofeedback timer is to slow DCCP down when there
- is serious network congestion: experimenting with larger values should
- therefore not be performed on WANs.
-
config IP_DCCP_TFRC_LIB
def_bool y if IP_DCCP_CCID3
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index a47a8c918ee..f053198e730 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -23,60 +23,16 @@
/*
* This implementation should follow RFC 4341
*/
+#include <linux/slab.h>
#include "../feat.h"
-#include "../ccid.h"
-#include "../dccp.h"
#include "ccid2.h"
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-static int ccid2_debug;
+static bool ccid2_debug;
#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc)
-{
- int len = 0;
- int pipe = 0;
- struct ccid2_seq *seqp = hc->tx_seqh;
-
- /* there is data in the chain */
- if (seqp != hc->tx_seqt) {
- seqp = seqp->ccid2s_prev;
- len++;
- if (!seqp->ccid2s_acked)
- pipe++;
-
- while (seqp != hc->tx_seqt) {
- struct ccid2_seq *prev = seqp->ccid2s_prev;
-
- len++;
- if (!prev->ccid2s_acked)
- pipe++;
-
- /* packets are sent sequentially */
- BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
- prev->ccid2s_seq ) >= 0);
- BUG_ON(time_before(seqp->ccid2s_sent,
- prev->ccid2s_sent));
-
- seqp = prev;
- }
- }
-
- BUG_ON(pipe != hc->tx_pipe);
- ccid2_pr_debug("len of chain=%d\n", len);
-
- do {
- seqp = seqp->ccid2s_prev;
- len++;
- } while (seqp != hc->tx_seqh);
-
- ccid2_pr_debug("total len=%d\n", len);
- BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN);
-}
#else
#define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hc)
#endif
static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
@@ -122,17 +78,13 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- if (hc->tx_pipe < hc->tx_cwnd)
- return 0;
-
- return 1; /* XXX CCID should dequeue when ready instead of polling */
+ if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+ return CCID_PACKET_WILL_DEQUEUE_LATER;
+ return CCID_PACKET_SEND_AT_ONCE;
}
static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
{
- struct dccp_sock *dp = dccp_sk(sk);
u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
/*
@@ -145,29 +97,40 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
val = max_ratio;
}
- if (val > DCCPF_ACK_RATIO_MAX)
- val = DCCPF_ACK_RATIO_MAX;
+ dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
+ min_t(u32, val, DCCPF_ACK_RATIO_MAX));
+}
- if (val == dp->dccps_l_ack_ratio)
- return;
+static void ccid2_check_l_ack_ratio(struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- ccid2_pr_debug("changing local ack ratio to %u\n", val);
- dp->dccps_l_ack_ratio = val;
+ /*
+ * After a loss, idle period, application limited period, or RTO we
+ * need to check that the ack ratio is still less than the congestion
+ * window. Otherwise, we will send an entire congestion window of
+ * packets and got no response because we haven't sent ack ratio
+ * packets yet.
+ * If the ack ratio does need to be reduced, we reduce it to half of
+ * the congestion window (or 1 if that's zero) instead of to the
+ * congestion window. This prevents problems if one ack is lost.
+ */
+ if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
+ ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
}
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val)
+static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
{
- ccid2_pr_debug("change SRTT to %ld\n", val);
- hc->tx_srtt = val;
+ dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
+ clamp_val(val, DCCPF_SEQ_WMIN,
+ DCCPF_SEQ_WMAX));
}
-static void ccid2_start_rto_timer(struct sock *sk);
-
static void ccid2_hc_tx_rto_expire(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- long s;
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
@@ -177,23 +140,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
ccid2_pr_debug("RTO_EXPIRE\n");
- ccid2_hc_tx_check_sanity(hc);
-
/* back-off timer */
hc->tx_rto <<= 1;
-
- s = hc->tx_rto / HZ;
- if (s > 60)
- hc->tx_rto = 60 * HZ;
-
- ccid2_start_rto_timer(sk);
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
/* adjust pipe, cwnd etc */
hc->tx_ssthresh = hc->tx_cwnd / 2;
if (hc->tx_ssthresh < 2)
hc->tx_ssthresh = 2;
- hc->tx_cwnd = 1;
- hc->tx_pipe = 0;
+ hc->tx_cwnd = 1;
+ hc->tx_pipe = 0;
/* clear state about stuff we sent */
hc->tx_seqt = hc->tx_seqh;
@@ -203,33 +160,108 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
hc->tx_rpseq = 0;
hc->tx_rpdupack = -1;
ccid2_change_l_ack_ratio(sk, 1);
- ccid2_hc_tx_check_sanity(hc);
+
+ /* if we were blocked before, we may now send cwnd=1 packet */
+ if (sender_was_blocked)
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ /* restart backed-off timer */
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
out:
bh_unlock_sock(sk);
sock_put(sk);
}
-static void ccid2_start_rto_timer(struct sock *sk)
+/*
+ * Congestion window validation (RFC 2861).
+ */
+static bool ccid2_do_cwv = true;
+module_param(ccid2_do_cwv, bool, 0644);
+MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
+
+/**
+ * ccid2_update_used_window - Track how much of cwnd is actually used
+ * This is done in addition to CWV. The sender needs to have an idea of how many
+ * packets may be in flight, to set the local Sequence Window value accordingly
+ * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
+ * maximum-used window. We use an EWMA low-pass filter to filter out noise.
+ */
+static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
+{
+ hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4;
+}
+
+/* This borrows the code of tcp_cwnd_application_limited() */
+static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ /* don't reduce cwnd below the initial window (IW) */
+ u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache),
+ win_used = max(hc->tx_cwnd_used, init_win);
+
+ if (win_used < hc->tx_cwnd) {
+ hc->tx_ssthresh = max(hc->tx_ssthresh,
+ (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2));
+ hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1;
+ }
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
- ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto);
+ ccid2_check_l_ack_ratio(sk);
+}
- BUG_ON(timer_pending(&hc->tx_rtotimer));
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+/* This borrows the code of tcp_cwnd_restart() */
+static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ u32 cwnd = hc->tx_cwnd, restart_cwnd,
+ iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
+
+ hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
+
+ /* don't reduce cwnd below the initial window (IW) */
+ restart_cwnd = min(cwnd, iwnd);
+ cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto;
+ hc->tx_cwnd = max(cwnd, restart_cwnd);
+
+ hc->tx_cwnd_stamp = now;
+ hc->tx_cwnd_used = 0;
+
+ ccid2_check_l_ack_ratio(sk);
}
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const u32 now = ccid2_time_stamp;
struct ccid2_seq *next;
- hc->tx_pipe++;
+ /* slow-start after idle periods (RFC 2581, RFC 2861) */
+ if (ccid2_do_cwv && !hc->tx_pipe &&
+ (s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
+ ccid2_cwnd_restart(sk, now);
+
+ hc->tx_lsndtime = now;
+ hc->tx_pipe += 1;
+
+ /* see whether cwnd was fully used (RFC 2861), update expected window */
+ if (ccid2_cwnd_network_limited(hc)) {
+ ccid2_update_used_window(hc, hc->tx_cwnd);
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
+ } else {
+ if (hc->tx_pipe > hc->tx_cwnd_used)
+ hc->tx_cwnd_used = hc->tx_pipe;
+
+ ccid2_update_used_window(hc, hc->tx_cwnd_used);
+
+ if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
+ ccid2_cwnd_application_limited(sk, now);
+ }
hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
hc->tx_seqh->ccid2s_acked = 0;
- hc->tx_seqh->ccid2s_sent = jiffies;
+ hc->tx_seqh->ccid2s_sent = now;
next = hc->tx_seqh->ccid2s_next;
/* check if we need to alloc more space */
@@ -295,221 +327,190 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
}
#endif
- /* setup RTO timer */
- if (!timer_pending(&hc->tx_rtotimer))
- ccid2_start_rto_timer(sk);
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
do {
struct ccid2_seq *seqp = hc->tx_seqt;
while (seqp != hc->tx_seqh) {
- ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
+ ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
(unsigned long long)seqp->ccid2s_seq,
seqp->ccid2s_acked, seqp->ccid2s_sent);
seqp = seqp->ccid2s_next;
}
} while (0);
ccid2_pr_debug("=========\n");
- ccid2_hc_tx_check_sanity(hc);
#endif
}
-/* XXX Lame code duplication!
- * returns -1 if none was found.
- * else returns the next offset to use in the function call.
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ * which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
*/
-static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
- unsigned char **vec, unsigned char *veclen)
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- unsigned char opt, len;
- unsigned char *value;
-
- BUG_ON(offset < 0);
- options += offset;
- opt_ptr = options;
- if (opt_ptr >= opt_end)
- return -1;
-
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_invalid_option;
-
- len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ long m = mrtt ? : 1;
+
+ if (hc->tx_srtt == 0) {
+ /* First measurement m */
+ hc->tx_srtt = m << 3;
+ hc->tx_mdev = m << 1;
+
+ hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+ hc->tx_rttvar = hc->tx_mdev_max;
+
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ } else {
+ /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+ m -= (hc->tx_srtt >> 3);
+ hc->tx_srtt += m;
+
+ /* Similarly, update scaled mdev with regard to |m| */
+ if (m < 0) {
+ m = -m;
+ m -= (hc->tx_mdev >> 2);
/*
- * Remove the type and len fields, leaving
- * just the value size
+ * This neutralises RTO increase when RTT < SRTT - mdev
+ * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+ * in Linux TCP", USENIX 2002, pp. 49-62).
*/
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (hc->tx_mdev >> 2);
+ }
+ hc->tx_mdev += m;
- if (opt_ptr > opt_end)
- goto out_invalid_option;
+ if (hc->tx_mdev > hc->tx_mdev_max) {
+ hc->tx_mdev_max = hc->tx_mdev;
+ if (hc->tx_mdev_max > hc->tx_rttvar)
+ hc->tx_rttvar = hc->tx_mdev_max;
}
- switch (opt) {
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- *vec = value;
- *veclen = len;
- return offset + (opt_ptr - options);
+ /*
+ * Decay RTTVAR at most once per flight, exploiting that
+ * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
+ * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
+ * GAR is a useful bound for FlightSize = pipe.
+ * AWL is probably too low here, as it over-estimates pipe.
+ */
+ if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+ if (hc->tx_mdev_max < hc->tx_rttvar)
+ hc->tx_rttvar -= (hc->tx_rttvar -
+ hc->tx_mdev_max) >> 2;
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ hc->tx_mdev_max = tcp_rto_min(sk);
}
}
- return -1;
-
-out_invalid_option:
- DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
- return -1;
-}
-
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ /*
+ * Set RTO from SRTT and RTTVAR
+ * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+ * This agrees with RFC 4341, 5:
+ * "Because DCCP does not retransmit data, DCCP does not require
+ * TCP's recommended minimum timeout of one second".
+ */
+ hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
- sk_stop_timer(sk, &hc->tx_rtotimer);
- ccid2_pr_debug("deleted RTO timer\n");
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
}
-static inline void ccid2_new_ack(struct sock *sk,
- struct ccid2_seq *seqp,
- unsigned int *maxincr)
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+ unsigned int *maxincr)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- if (hc->tx_cwnd < hc->tx_ssthresh) {
- if (*maxincr > 0 && ++hc->tx_packets_acked == 2) {
- hc->tx_cwnd += 1;
- *maxincr -= 1;
- hc->tx_packets_acked = 0;
- }
- } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
+
+ if (hc->tx_cwnd < dp->dccps_l_seq_win &&
+ r_seq_used < dp->dccps_r_seq_win) {
+ if (hc->tx_cwnd < hc->tx_ssthresh) {
+ if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
+ hc->tx_cwnd += 1;
+ *maxincr -= 1;
+ hc->tx_packets_acked = 0;
+ }
+ } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
hc->tx_cwnd += 1;
hc->tx_packets_acked = 0;
- }
-
- /* update RTO */
- if (hc->tx_srtt == -1 ||
- time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) {
- unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
- int s;
-
- /* first measurement */
- if (hc->tx_srtt == -1) {
- ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
- r, jiffies,
- (unsigned long long)seqp->ccid2s_seq);
- ccid2_change_srtt(hc, r);
- hc->tx_rttvar = r >> 1;
- } else {
- /* RTTVAR */
- long tmp = hc->tx_srtt - r;
- long srtt;
-
- if (tmp < 0)
- tmp *= -1;
-
- tmp >>= 2;
- hc->tx_rttvar *= 3;
- hc->tx_rttvar >>= 2;
- hc->tx_rttvar += tmp;
-
- /* SRTT */
- srtt = hc->tx_srtt;
- srtt *= 7;
- srtt >>= 3;
- tmp = r >> 3;
- srtt += tmp;
- ccid2_change_srtt(hc, srtt);
}
- s = hc->tx_rttvar << 2;
- /* clock granularity is 1 when based on jiffies */
- if (!s)
- s = 1;
- hc->tx_rto = hc->tx_srtt + s;
-
- /* must be at least a second */
- s = hc->tx_rto / HZ;
- /* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
- if (s < 1)
- hc->tx_rto = HZ;
-#endif
- /* max 60 seconds */
- if (s > 60)
- hc->tx_rto = HZ * 60;
-
- hc->tx_lastrtt = jiffies;
-
- ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
- hc->tx_srtt, hc->tx_rttvar,
- hc->tx_rto, HZ, r);
}
- /* we got a new ack, so re-start RTO timer */
- ccid2_hc_tx_kill_rto_timer(sk);
- ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ /*
+ * Adjust the local sequence window and the ack ratio to allow about
+ * 5 times the number of packets in the network (RFC 4340 7.5.2)
+ */
+ if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
+ else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
- if (hc->tx_pipe == 0)
- DCCP_BUG("pipe == 0");
- else
- hc->tx_pipe--;
+ if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
+ else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
- if (hc->tx_pipe == 0)
- ccid2_hc_tx_kill_rto_timer(sk);
+ /*
+ * FIXME: RTT is sampled several times per acknowledgment (for each
+ * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+ * This causes the RTT to be over-estimated, since the older entries
+ * in the Ack Vector have earlier sending times.
+ * The cleanest solution is to not use the ccid2s_sent field at all
+ * and instead use DCCP timestamps: requires changes in other places.
+ */
+ ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
}
static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) {
+ if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
return;
}
- hc->tx_last_cong = jiffies;
+ hc->tx_last_cong = ccid2_time_stamp;
hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
- /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
- if (dccp_sk(sk)->dccps_l_ack_ratio > hc->tx_cwnd)
- ccid2_change_l_ack_ratio(sk, hc->tx_cwnd);
+ ccid2_check_l_ack_ratio(sk);
+}
+
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+ switch (option) {
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+ option - DCCPO_ACK_VECTOR_0);
+ }
+ return 0;
}
static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+ struct dccp_ackvec_parsed *avp;
u64 ackno, seqno;
struct ccid2_seq *seqp;
- unsigned char *vector;
- unsigned char veclen;
- int offset = 0;
int done = 0;
unsigned int maxincr = 0;
- ccid2_hc_tx_check_sanity(hc);
/* check reverse path congestion */
seqno = DCCP_SKB_CB(skb)->dccpd_seq;
@@ -533,24 +534,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
if (hc->tx_rpdupack >= NUMDUPACK) {
hc->tx_rpdupack = -1; /* XXX lame */
hc->tx_rpseq = 0;
-
+#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
+ /*
+ * FIXME: Ack Congestion Control is broken; in
+ * the current state instabilities occurred with
+ * Ack Ratios greater than 1; causing hang-ups
+ * and long RTO timeouts. This needs to be fixed
+ * before opening up dynamic changes. -- gerrit
+ */
ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
+#endif
}
}
}
/* check forward path congestion */
- /* still didn't send out new data packets */
- if (hc->tx_seqh == hc->tx_seqt)
+ if (dccp_packet_without_ack(skb))
return;
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_ACK:
- case DCCP_PKT_DATAACK:
- break;
- default:
- return;
- }
+ /* still didn't send out new data packets */
+ if (hc->tx_seqh == hc->tx_seqt)
+ goto done;
ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
if (after48(ackno, hc->tx_high_ack))
@@ -574,16 +578,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
/* go through all ack vectors */
- while ((offset = ccid2_ackvector(sk, skb, offset,
- &vector, &veclen)) != -1) {
+ list_for_each_entry(avp, &hc->tx_av_chunks, node) {
/* go through this ack vector */
- while (veclen--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl = SUB48(ackno, rl);
+ for (; avp->len--; avp->vec++) {
+ u64 ackno_end_rl = SUB48(ackno,
+ dccp_ackvec_runlen(avp->vec));
- ccid2_pr_debug("ackvec start:%llu end:%llu\n",
+ ccid2_pr_debug("ackvec %llu |%u,%u|\n",
(unsigned long long)ackno,
- (unsigned long long)ackno_end_rl);
+ dccp_ackvec_state(avp->vec) >> 6,
+ dccp_ackvec_runlen(avp->vec));
/* if the seqno we are analyzing is larger than the
* current ackno, then move towards the tail of our
* seqnos.
@@ -602,24 +606,22 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* run length
*/
while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
- const u8 state = *vector &
- DCCP_ACKVEC_STATE_MASK;
+ const u8 state = dccp_ackvec_state(avp->vec);
/* new packet received or marked */
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+ if (state != DCCPAV_NOT_RECEIVED &&
!seqp->ccid2s_acked) {
- if (state ==
- DCCP_ACKVEC_STATE_ECN_MARKED) {
+ if (state == DCCPAV_ECN_MARKED)
ccid2_congestion_event(sk,
seqp);
- } else
+ else
ccid2_new_ack(sk, seqp,
&maxincr);
seqp->ccid2s_acked = 1;
ccid2_pr_debug("Got ack for %llu\n",
(unsigned long long)seqp->ccid2s_seq);
- ccid2_hc_tx_dec_pipe(sk);
+ hc->tx_pipe--;
}
if (seqp == hc->tx_seqt) {
done = 1;
@@ -631,7 +633,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
break;
ackno = SUB48(ackno_end_rl, 1);
- vector++;
}
if (done)
break;
@@ -676,7 +677,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* one ack vector.
*/
ccid2_congestion_event(sk, seqp);
- ccid2_hc_tx_dec_pipe(sk);
+ hc->tx_pipe--;
}
if (seqp == hc->tx_seqt)
break;
@@ -694,7 +695,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
hc->tx_seqt = hc->tx_seqt->ccid2s_next;
}
- ccid2_hc_tx_check_sanity(hc);
+ /* restart RTO timer if not all outstanding data has been acked */
+ if (hc->tx_pipe == 0)
+ sk_stop_timer(sk, &hc->tx_rtotimer);
+ else
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+done:
+ /* check if incoming Acks allow pending packets to be sent */
+ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -706,12 +716,9 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
hc->tx_ssthresh = ~0U;
- /*
- * RFC 4341, 5: "The cwnd parameter is initialized to at most four
- * packets for new connections, following the rules from [RFC3390]".
- * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
- */
- hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+ /* Use larger initial windows (RFC 4341, section 5). */
+ hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+ hc->tx_expected_wnd = hc->tx_cwnd;
/* Make sure that Ack Ratio is enabled and within bounds. */
max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
@@ -722,15 +729,13 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
if (ccid2_hc_tx_alloc_seq(hc))
return -ENOMEM;
- hc->tx_rto = 3 * HZ;
- ccid2_change_srtt(hc, -1);
- hc->tx_rttvar = -1;
+ hc->tx_rto = DCCP_TIMEOUT_INIT;
hc->tx_rpdupack = -1;
- hc->tx_last_cong = jiffies;
+ hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
+ hc->tx_cwnd_used = 0;
setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
(unsigned long)sk);
-
- ccid2_hc_tx_check_sanity(hc);
+ INIT_LIST_HEAD(&hc->tx_av_chunks);
return 0;
}
@@ -739,7 +744,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
int i;
- ccid2_hc_tx_kill_rto_timer(sk);
+ sk_stop_timer(sk, &hc->tx_rtotimer);
for (i = 0; i < hc->tx_seqbufc; i++)
kfree(hc->tx_seqbuf[i]);
@@ -748,32 +753,29 @@ static void ccid2_hc_tx_exit(struct sock *sk)
static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
- const struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_DATA:
- case DCCP_PKT_DATAACK:
- hc->rx_data++;
- if (hc->rx_data >= dp->dccps_r_ack_ratio) {
- dccp_send_ack(sk);
- hc->rx_data = 0;
- }
- break;
+ if (!dccp_data_packet(skb))
+ return;
+
+ if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
+ dccp_send_ack(sk);
+ hc->rx_num_data_pkts = 0;
}
}
struct ccid_operations ccid2_ops = {
- .ccid_id = DCCPC_CCID2,
- .ccid_name = "TCP-like",
- .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
- .ccid_hc_tx_init = ccid2_hc_tx_init,
- .ccid_hc_tx_exit = ccid2_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
- .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
- .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
+ .ccid_id = DCCPC_CCID2,
+ .ccid_name = "TCP-like",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
+ .ccid_hc_tx_init = ccid2_hc_tx_init,
+ .ccid_hc_tx_exit = ccid2_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
+ .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+ .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
+ .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
};
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 1ec6a30103b..18c97543e52 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -18,18 +18,23 @@
#ifndef _DCCP_CCID2_H_
#define _DCCP_CCID2_H_
-#include <linux/dccp.h>
#include <linux/timer.h>
#include <linux/types.h>
#include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible.
+ */
+#define ccid2_time_stamp tcp_time_stamp
+
/* NUMDUPACK parameter from RFC 4341, p. 6 */
#define NUMDUPACK 3
-struct sock;
-
struct ccid2_seq {
u64 ccid2s_seq;
- unsigned long ccid2s_sent;
+ u32 ccid2s_sent;
int ccid2s_acked;
struct ccid2_seq *ccid2s_prev;
struct ccid2_seq *ccid2s_next;
@@ -38,13 +43,29 @@ struct ccid2_seq {
#define CCID2_SEQBUF_LEN 1024
#define CCID2_SEQBUF_MAX 128
+/*
+ * Multiple of congestion window to keep the sequence window at
+ * (RFC 4340 7.5.2)
+ */
+#define CCID2_WIN_CHANGE_FACTOR 5
+
/**
* struct ccid2_hc_tx_sock - CCID2 TX half connection
* @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
* @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @tx_lastrtt: time RTT was last measured
+ * @tx_srtt: smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev: smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max: maximum of @mdev during one flight
+ * @tx_rttvar: moving average/maximum of @mdev_max
+ * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq: to decay RTTVAR at most once per flight
+ * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
+ * @tx_expected_wnd: moving average of @tx_cwnd_used
+ * @tx_cwnd_stamp: to track idle periods in CWV
+ * @tx_lsndtime: last time (in jiffies) a data packet was sent
* @tx_rpseq: last consecutive seqno
* @tx_rpdupack: dupacks since rpseq
+ * @tx_av_chunks: list of Ack Vectors received on current skb
*/
struct ccid2_hc_tx_sock {
u32 tx_cwnd;
@@ -55,19 +76,49 @@ struct ccid2_hc_tx_sock {
int tx_seqbufc;
struct ccid2_seq *tx_seqh;
struct ccid2_seq *tx_seqt;
- long tx_rto;
- long tx_srtt;
- long tx_rttvar;
- unsigned long tx_lastrtt;
+
+ /* RTT measurement: variables/principles are the same as in TCP */
+ u32 tx_srtt,
+ tx_mdev,
+ tx_mdev_max,
+ tx_rttvar,
+ tx_rto;
+ u64 tx_rtt_seq:48;
struct timer_list tx_rtotimer;
+
+ /* Congestion Window validation (optional, RFC 2861) */
+ u32 tx_cwnd_used,
+ tx_expected_wnd,
+ tx_cwnd_stamp,
+ tx_lsndtime;
+
u64 tx_rpseq;
int tx_rpdupack;
- unsigned long tx_last_cong;
+ u32 tx_last_cong;
u64 tx_high_ack;
+ struct list_head tx_av_chunks;
};
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
+{
+ return hc->tx_pipe >= hc->tx_cwnd;
+}
+
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ * This is based on the numbers specified in RFC 5681, 3.1.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+ return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
+}
+
+/**
+ * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
+ * @rx_num_data_pkts: number of data packets received since last feedback
+ */
struct ccid2_hc_rx_sock {
- int rx_data;
+ u32 rx_num_data_pkts;
};
static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index bcd7632299f..119c04317d4 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -38,7 +38,7 @@
#include <asm/unaligned.h>
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static int ccid3_debug;
+static bool ccid3_debug;
#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)
@@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
[TFRC_SSTATE_NO_SENT] = "NO_SENT",
[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
[TFRC_SSTATE_FBACK] = "FBACK",
- [TFRC_SSTATE_TERM] = "TERM",
};
return ccid3_state_names[state];
@@ -91,19 +90,17 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
return scaled_div(w_init << 6, hc->tx_rtt);
}
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
*/
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
{
- /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
- /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
- hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN);
-
- ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi,
- hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6));
+ DCCP_BUG_ON(hc->tx_t_ipi == 0);
+ ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+ hc->tx_s, (unsigned int)(hc->tx_x >> 6));
}
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
@@ -116,6 +113,7 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
/**
* ccid3_hc_tx_update_x - Update allowed sending rate X
* @stamp: most recent time if available - can be left NULL.
+ *
* This function tracks draft rfc3448bis, check there for latest details.
*
* Note: X and X_recv are both stored in units of 64 * bytes/second, to support
@@ -156,17 +154,19 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
if (hc->tx_x != old_x) {
ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
- "X_recv=%u\n", (unsigned)(old_x >> 6),
- (unsigned)(hc->tx_x >> 6), hc->tx_x_calc,
- (unsigned)(hc->tx_x_recv >> 6));
+ "X_recv=%u\n", (unsigned int)(old_x >> 6),
+ (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6));
ccid3_update_send_interval(hc);
}
}
-/*
- * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1)
+/**
+ * ccid3_hc_tx_update_s - Track the mean packet size `s'
* @len: DCCP packet payload size in bytes
+ *
+ * cf. RFC 4342, 5.3 and RFC 3448, 4.1
*/
static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
{
@@ -208,19 +208,22 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
goto restart_timer;
}
- ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
+ ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
ccid3_tx_state_name(hc->tx_state));
+ /* Ignore and do not restart after leaving the established state */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+ goto out;
+
+ /* Reset feedback state to "no feedback received" */
if (hc->tx_state == TFRC_SSTATE_FBACK)
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- else if (hc->tx_state != TFRC_SSTATE_NO_FBACK)
- goto out;
/*
* Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+ * RTO is 0 if and only if no feedback has been received yet.
*/
- if (hc->tx_t_rto == 0 || /* no feedback received yet */
- hc->tx_p == 0) {
+ if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
/* halve send rate directly */
hc->tx_x = max(hc->tx_x / 2,
@@ -237,8 +240,6 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
*
* Note that X_recv is scaled by 2^6 while X_calc is not
*/
- BUG_ON(hc->tx_p && !hc->tx_x_calc);
-
if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
hc->tx_x_recv =
max(hc->tx_x_recv / 2,
@@ -256,7 +257,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
* Set new timeout for the nofeedback timer.
* See comments in packet_recv() regarding the value of t_RTO.
*/
- if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */
+ if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
t_nfb = TFRC_INITIAL_TIMEOUT;
else
t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
@@ -269,11 +270,12 @@ out:
sock_put(sk);
}
-/*
- * returns
- * > 0: delay (in msecs) that should pass before actually sending
- * = 0: can send immediately
- * < 0: error condition; do not send packet
+/**
+ * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ *
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
*/
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
@@ -290,8 +292,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
if (unlikely(skb->len == 0))
return -EBADMSG;
- switch (hc->tx_state) {
- case TFRC_SSTATE_NO_SENT:
+ if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
hc->tx_last_win_count = 0;
@@ -326,27 +327,22 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
ccid3_update_send_interval(hc);
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- break;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
+
+ } else {
delay = ktime_us_delta(hc->tx_t_nom, now);
ccid3_pr_debug("delay=%ld\n", (long)delay);
/*
- * Scheduling of packet transmissions [RFC 3448, 4.6]
+ * Scheduling of packet transmissions (RFC 5348, 8.3)
*
* if (t_now > t_nom - delta)
* // send the packet now
* else
* // send the packet in (t_nom - t_now) milliseconds.
*/
- if (delay - (s64)hc->tx_delta >= 1000)
- return (u32)delay / 1000L;
+ if (delay >= TFRC_T_DELTA)
+ return (u32)delay / USEC_PER_MSEC;
ccid3_hc_tx_update_win_count(hc, now);
- break;
- case TFRC_SSTATE_TERM:
- DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
- return -EINVAL;
}
/* prepare to send now (add options etc.) */
@@ -355,11 +351,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
/* set the nominal send time for the next following packet */
hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
- return 0;
+ return CCID_PACKET_SEND_AT_ONCE;
}
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
- unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
@@ -372,48 +367,34 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
+ struct tfrc_tx_hist_entry *acked;
ktime_t now;
unsigned long t_nfb;
- u32 pinv, r_sample;
+ u32 r_sample;
/* we are only interested in ACKs */
if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
return;
- /* ... and only in the established state */
- if (hc->tx_state != TFRC_SSTATE_FBACK &&
- hc->tx_state != TFRC_SSTATE_NO_FBACK)
- return;
-
- opt_recv = &hc->tx_options_received;
- now = ktime_get_real();
-
- /* Estimate RTT from history if ACK number is valid */
- r_sample = tfrc_tx_hist_rtt(hc->tx_hist,
- DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
- if (r_sample == 0) {
- DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
- dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
- return;
- }
-
- /* Update receive rate in units of 64 * bytes/second */
- hc->tx_x_recv = opt_recv->ccid3or_receive_rate;
- hc->tx_x_recv <<= 6;
-
- /* Update loss event rate (which is scaled by 1e6) */
- pinv = opt_recv->ccid3or_loss_event_rate;
- if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
- hc->tx_p = 0;
- else /* can not exceed 100% */
- hc->tx_p = scaled_div(1, pinv);
/*
- * Validate new RTT sample and update moving average
+ * Locate the acknowledged packet in the TX history.
+ *
+ * Returning "entry not found" here can for instance happen when
+ * - the host has not sent out anything (e.g. a passive server),
+ * - the Ack is outdated (packet with higher Ack number was received),
+ * - it is a bogus Ack (for a packet not sent on this connection).
*/
- r_sample = dccp_sample_rtt(sk, r_sample);
+ acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+ if (acked == NULL)
+ return;
+ /* For the sake of RTT sampling, ignore/remove all older entries */
+ tfrc_tx_hist_purge(&acked->next);
+
+ /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+ now = ktime_get_real();
+ r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
+
/*
* Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
*/
@@ -448,8 +429,8 @@ done_computing_x:
"p=%u, X_calc=%u, X_recv=%u, X=%u\n",
dccp_role(sk), sk, hc->tx_rtt, r_sample,
hc->tx_s, hc->tx_p, hc->tx_x_calc,
- (unsigned)(hc->tx_x_recv >> 6),
- (unsigned)(hc->tx_x >> 6));
+ (unsigned int)(hc->tx_x_recv >> 6),
+ (unsigned int)(hc->tx_x >> 6));
/* unschedule no feedback timer */
sk_stop_timer(sk, &hc->tx_no_feedback_timer);
@@ -461,13 +442,12 @@ done_computing_x:
sk->sk_write_space(sk);
/*
- * Update timeout interval for the nofeedback timer.
- * We use a configuration option to increase the lower bound.
- * This can help avoid triggering the nofeedback timer too
- * often ('spinning') on LANs with small RTTs.
+ * Update timeout interval for the nofeedback timer. In order to control
+ * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+ * tunable RTAX_RTO_MIN value as the lower bound.
*/
- hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO *
- (USEC_PER_SEC / 1000)));
+ hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+ USEC_PER_SEC/HZ * tcp_rto_min(sk));
/*
* Schedule no feedback timer to expire in
* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
@@ -482,66 +462,41 @@ done_computing_x:
jiffies + usecs_to_jiffies(t_nfb));
}
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
- unsigned char len, u16 idx,
- unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
{
- int rc = 0;
- const struct dccp_sock *dp = dccp_sk(sk);
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
__be32 opt_val;
- opt_recv = &hc->tx_options_received;
-
- if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
- opt_recv->ccid3or_seqno = dp->dccps_gsr;
- opt_recv->ccid3or_loss_event_rate = ~0;
- opt_recv->ccid3or_loss_intervals_idx = 0;
- opt_recv->ccid3or_loss_intervals_len = 0;
- opt_recv->ccid3or_receive_rate = 0;
- }
-
switch (option) {
+ case TFRC_OPT_RECEIVE_RATE:
case TFRC_OPT_LOSS_EVENT_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s(%p), invalid len %d "
- "for TFRC_OPT_LOSS_EVENT_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
- ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_event_rate);
+ /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+ if (packet_type == DCCP_PKT_DATA)
+ break;
+ if (unlikely(optlen != 4)) {
+ DCCP_WARN("%s(%p), invalid len %d for %u\n",
+ dccp_role(sk), sk, optlen, option);
+ return -EINVAL;
}
- break;
- case TFRC_OPT_LOSS_INTERVALS:
- opt_recv->ccid3or_loss_intervals_idx = idx;
- opt_recv->ccid3or_loss_intervals_len = len;
- ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_intervals_idx,
- opt_recv->ccid3or_loss_intervals_len);
- break;
- case TFRC_OPT_RECEIVE_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s(%p), invalid len %d "
- "for TFRC_OPT_RECEIVE_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+ opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+ if (option == TFRC_OPT_RECEIVE_RATE) {
+ /* Receive Rate is kept in units of 64 bytes/second */
+ hc->tx_x_recv = opt_val;
+ hc->tx_x_recv <<= 6;
+
ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_receive_rate);
+ dccp_role(sk), sk, opt_val);
+ } else {
+ /* Update the fixpoint Loss Event Rate fraction */
+ hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+ ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
}
- break;
}
-
- return rc;
+ return 0;
}
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -559,42 +514,37 @@ static void ccid3_hc_tx_exit(struct sock *sk)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-
tfrc_tx_hist_purge(&hc->tx_hist);
}
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
- struct ccid3_hc_tx_sock *hc;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- hc = ccid3_hc_tx_sk(sk);
- info->tcpi_rto = hc->tx_t_rto;
- info->tcpi_rtt = hc->tx_rtt;
+ info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+ info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
}
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_tx_sock *hc;
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_info tfrc;
const void *val;
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- hc = ccid3_hc_tx_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(hc->tx_tfrc))
+ if (len < sizeof(tfrc))
return -EINVAL;
- len = sizeof(hc->tx_tfrc);
- val = &hc->tx_tfrc;
+ memset(&tfrc, 0, sizeof(tfrc));
+ tfrc.tfrctx_x = hc->tx_x;
+ tfrc.tfrctx_x_recv = hc->tx_x_recv;
+ tfrc.tfrctx_x_calc = hc->tx_x_calc;
+ tfrc.tfrctx_rtt = hc->tx_rtt;
+ tfrc.tfrctx_p = hc->tx_p;
+ tfrc.tfrctx_rto = hc->tx_t_rto;
+ tfrc.tfrctx_ipi = hc->tx_t_ipi;
+ len = sizeof(tfrc);
+ val = &tfrc;
break;
default:
return -ENOPROTOOPT;
@@ -624,7 +574,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
static const char *const ccid3_rx_state_names[] = {
[TFRC_RSTATE_NO_DATA] = "NO_DATA",
[TFRC_RSTATE_DATA] = "DATA",
- [TFRC_RSTATE_TERM] = "TERM",
};
return ccid3_rx_state_names[state];
@@ -650,14 +599,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- ktime_t now;
+ ktime_t now = ktime_get_real();
s64 delta = 0;
- if (unlikely(hc->rx_state == TFRC_RSTATE_TERM))
- return;
-
- now = ktime_get_real();
-
switch (fbtype) {
case CCID3_FBACK_INITIAL:
hc->rx_x_recv = 0;
@@ -701,23 +645,21 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
- const struct ccid3_hc_rx_sock *hc;
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
__be32 x_recv, pinv;
if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
return 0;
- hc = ccid3_hc_rx_sk(sk);
-
if (dccp_packet_without_ack(skb))
return 0;
x_recv = htonl(hc->rx_x_recv);
pinv = htonl(hc->rx_pinv);
- if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
+ if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
&pinv, sizeof(pinv)) ||
- dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
+ dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
&x_recv, sizeof(x_recv)))
return -1;
@@ -749,10 +691,11 @@ static u32 ccid3_first_li(struct sock *sk)
x_recv = scaled_div32(hc->rx_bytes_recv, delta);
if (x_recv == 0) { /* would also trigger divide-by-zero */
DCCP_WARN("X_recv==0\n");
- if ((x_recv = hc->rx_x_recv) == 0) {
+ if (hc->rx_x_recv == 0) {
DCCP_BUG("stored value of X_recv is zero");
return ~0U;
}
+ x_recv = hc->rx_x_recv;
}
fval = scaled_div(hc->rx_s, hc->rx_rtt);
@@ -862,46 +805,31 @@ static void ccid3_hc_rx_exit(struct sock *sk)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
tfrc_rx_hist_purge(&hc->rx_hist);
tfrc_lh_cleanup(&hc->rx_li_hist);
}
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct ccid3_hc_rx_sock *hc;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- hc = ccid3_hc_rx_sk(sk);
- info->tcpi_ca_state = hc->rx_state;
+ info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = hc->rx_rtt;
+ info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
}
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_rx_sock *hc;
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct tfrc_rx_info rx_info;
const void *val;
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- hc = ccid3_hc_rx_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_CCID_RX_INFO:
if (len < sizeof(rx_info))
return -EINVAL;
rx_info.tfrcrx_x_recv = hc->rx_x_recv;
rx_info.tfrcrx_rtt = hc->rx_rtt;
- rx_info.tfrcrx_p = hc->rx_pinv == 0 ? ~0U :
- scaled_div(1, hc->rx_pinv);
+ rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
len = sizeof(rx_info);
val = &rx_info;
break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 03263577665..1a9933c2967 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -42,35 +42,36 @@
#include "lib/tfrc.h"
#include "../ccid.h"
-/* Two seconds as per RFC 3448 4.2 */
+/* Two seconds as per RFC 5348, 4.2 */
#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
-
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
+#endif
+
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
TFRC_OPT_LOSS_INTERVALS = 193,
TFRC_OPT_RECEIVE_RATE = 194,
};
-struct ccid3_options_received {
- u64 ccid3or_seqno:48,
- ccid3or_loss_intervals_idx:16;
- u16 ccid3or_loss_intervals_len;
- u32 ccid3or_loss_event_rate;
- u32 ccid3or_receive_rate;
-};
-
/* TFRC sender states */
enum ccid3_hc_tx_states {
TFRC_SSTATE_NO_SENT = 1,
TFRC_SSTATE_NO_FBACK,
TFRC_SSTATE_FBACK,
- TFRC_SSTATE_TERM,
};
/**
@@ -90,19 +91,16 @@ enum ccid3_hc_tx_states {
* @tx_no_feedback_timer: Handle to no feedback timer
* @tx_t_ld: Time last doubled during slow start
* @tx_t_nom: Nominal send time of next packet
- * @tx_delta: Send timer delta (RFC 3448, 4.6) in usecs
* @tx_hist: Packet history
- * @tx_options_received: Parsed set of retrieved options
*/
struct ccid3_hc_tx_sock {
- struct tfrc_tx_info tx_tfrc;
-#define tx_x tx_tfrc.tfrctx_x
-#define tx_x_recv tx_tfrc.tfrctx_x_recv
-#define tx_x_calc tx_tfrc.tfrctx_x_calc
-#define tx_rtt tx_tfrc.tfrctx_rtt
-#define tx_p tx_tfrc.tfrctx_p
-#define tx_t_rto tx_tfrc.tfrctx_rto
-#define tx_t_ipi tx_tfrc.tfrctx_ipi
+ u64 tx_x;
+ u64 tx_x_recv;
+ u32 tx_x_calc;
+ u32 tx_rtt;
+ u32 tx_p;
+ u32 tx_t_rto;
+ u32 tx_t_ipi;
u16 tx_s;
enum ccid3_hc_tx_states tx_state:8;
u8 tx_last_win_count;
@@ -110,9 +108,7 @@ struct ccid3_hc_tx_sock {
struct timer_list tx_no_feedback_timer;
ktime_t tx_t_ld;
ktime_t tx_t_nom;
- u32 tx_delta;
struct tfrc_tx_hist_entry *tx_hist;
- struct ccid3_options_received tx_options_received;
};
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -126,21 +122,16 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
enum ccid3_hc_rx_states {
TFRC_RSTATE_NO_DATA = 1,
TFRC_RSTATE_DATA,
- TFRC_RSTATE_TERM = 127,
};
/**
* struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3)
- * @rx_rtt: Receiver estimate of rtt (non-standard)
- * @rx_p: Current loss event rate (RFC 3448 5.4)
* @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
* @rx_state: Receiver state, one of %ccid3_hc_rx_states
* @rx_bytes_recv: Total sum of DCCP payload bytes
* @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
* @rx_rtt: Receiver estimate of RTT
* @rx_tstamp_last_feedback: Time at which last feedback was sent
- * @rx_tstamp_last_ack: Time at which last feedback was sent
* @rx_hist: Packet history (loss detection + RTT sampling)
* @rx_li_hist: Loss Interval database
* @rx_s: Received packet size in bytes
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 8fc3cbf7907..57f9fd78c4d 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
cur->li_length = len;
tfrc_lh_calc_i_mean(lh);
- return (lh->i_mean < old_i_mean);
+ return lh->i_mean < old_i_mean;
}
/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
@@ -133,6 +133,7 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
* @rh: Receive history containing a fresh loss event
* @calc_first_li: Caller-dependent routine to compute length of first interval
* @sk: Used by @calc_first_li in caller-specific way (subtyping)
+ *
* Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
*/
int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index d1d2f5383b7..57f631a86cc 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -65,9 +65,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
struct tfrc_rx_hist;
-extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
- u32 (*first_li)(struct sock *), struct sock *);
-extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
-extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
+int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+ u32 (*first_li)(struct sock *), struct sock *);
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 3a4f414e94a..08df7a3acb3 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -38,18 +38,6 @@
#include "packet_history.h"
#include "../../dccp.h"
-/**
- * tfrc_tx_hist_entry - Simple singly-linked TX history list
- * @next: next oldest entry (LIFO order)
- * @seqno: sequence number of this entry
- * @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
- struct tfrc_tx_hist_entry *next;
- u64 seqno;
- ktime_t stamp;
-};
-
/*
* Transmitter History Routines
*/
@@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void)
}
}
-static struct tfrc_tx_hist_entry *
- tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
- while (head != NULL && head->seqno != seqno)
- head = head->next;
-
- return head;
-}
-
int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
{
struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
*headp = NULL;
}
-u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
- const ktime_t now)
-{
- u32 rtt = 0;
- struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
-
- if (packet != NULL) {
- rtt = ktime_us_delta(now, packet->stamp);
- /*
- * Garbage-collect older (irrelevant) entries:
- */
- tfrc_tx_hist_purge(&packet->next);
- }
-
- return rtt;
-}
-
-
/*
* Receiver History Routines
*/
@@ -354,6 +315,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
* @ndp: The NDP count belonging to @skb
* @calc_first_li: Caller-dependent computation of first loss interval in @lh
* @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
+ *
* Chooses action according to pending loss, updates LI database when a new
* loss was detected, and does required post-processing. Returns 1 when caller
* should send feedback, 0 otherwise.
@@ -426,7 +388,7 @@ static inline struct tfrc_rx_hist_entry *
}
/**
- * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
+ * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 7df6c529999..ee362b0b630 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
#include <linux/slab.h>
#include "tfrc.h"
-struct tfrc_tx_hist_entry;
+/**
+ * tfrc_tx_hist_entry - Simple singly-linked TX history list
+ * @next: next oldest entry (LIFO order)
+ * @seqno: sequence number of this entry
+ * @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+ struct tfrc_tx_hist_entry *next;
+ u64 seqno;
+ ktime_t stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+ tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+ while (head != NULL && head->seqno != seqno)
+ head = head->next;
+ return head;
+}
-extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
-extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
- const u64 seqno, const ktime_t now);
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
/* Subtraction a-b modulo-16, respects circular wrap-around */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
@@ -123,20 +139,17 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
return h->loss_count > 0;
}
-extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
- const struct sk_buff *skb, const u64 ndp);
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
+ const u64 ndp);
-extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
struct tfrc_loss_hist;
-extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
- struct tfrc_loss_hist *lh,
- struct sk_buff *skb, const u64 ndp,
- u32 (*first_li)(struct sock *sk),
- struct sock *sk);
-extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
- const struct sk_buff *skb);
-extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
-extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
+ struct sk_buff *skb, const u64 ndp,
+ u32 (*first_li)(struct sock *sk), struct sock *sk);
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
index 4902029854d..62b5828acde 100644
--- a/net/dccp/ccids/lib/tfrc.c
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -4,10 +4,11 @@
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
*/
+#include <linux/moduleparam.h>
#include "tfrc.h"
#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-int tfrc_debug;
+bool tfrc_debug;
module_param(tfrc_debug, bool, 0644);
MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
#endif
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 01bb48e96c2..40ee7d62b65 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -21,7 +21,7 @@
#include "packet_history.h"
#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-extern int tfrc_debug;
+extern bool tfrc_debug;
#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
#else
#define tfrc_pr_debug(format, a...)
@@ -55,20 +55,21 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}
-extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
-extern int tfrc_tx_packet_history_init(void);
-extern void tfrc_tx_packet_history_exit(void);
-extern int tfrc_rx_packet_history_init(void);
-extern void tfrc_rx_packet_history_exit(void);
+int tfrc_tx_packet_history_init(void);
+void tfrc_tx_packet_history_exit(void);
+int tfrc_rx_packet_history_init(void);
+void tfrc_rx_packet_history_exit(void);
-extern int tfrc_li_init(void);
-extern void tfrc_li_exit(void);
+int tfrc_li_init(void);
+void tfrc_li_exit(void);
#ifdef CONFIG_IP_DCCP_TFRC_LIB
-extern int tfrc_lib_init(void);
-extern void tfrc_lib_exit(void);
+int tfrc_lib_init(void);
+void tfrc_lib_exit(void);
#else
#define tfrc_lib_init() (0)
#define tfrc_lib_exit()
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 22ca1cf0eb5..88ef98285be 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -611,6 +611,7 @@ static inline u32 tfrc_binsearch(u32 fval, u8 small)
* @s: packet size in bytes
* @R: RTT scaled by 1000000 (i.e., microseconds)
* @p: loss ratio estimate scaled by 1000000
+ *
* Returns X_calc in bytes per second (not scaled).
*/
u32 tfrc_calc_x(u16 s, u32 R, u32 p)
@@ -659,6 +660,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
/**
* tfrc_calc_x_reverse_lookup - try to find p given f(p)
* @fvalue: function value to match, scaled by 1000000
+ *
* Returns closest match for p, also scaled by 1000000
*/
u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
@@ -687,3 +689,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
index = tfrc_binsearch(fvalue, 0);
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
+
+/**
+ * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+ if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
+ return 0;
+ if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
+ return 1000000;
+ return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 5ef32c2f0d6..c67816647cc 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -39,7 +39,7 @@
"%s: " fmt, __func__, ##a)
#ifdef CONFIG_IP_DCCP_DEBUG
-extern int dccp_debug;
+extern bool dccp_debug;
#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
@@ -53,7 +53,7 @@ extern struct inet_hashinfo dccp_hashinfo;
extern struct percpu_counter dccp_orphan_count;
-extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+void dccp_time_wait(struct sock *sk, int state, int timeo);
/*
* Set safe upper bounds for header and option length. Since Data Offset is 8
@@ -75,7 +75,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* state, about 60 seconds */
/* RFC 1122, 4.2.3.1 initial RTO value */
-#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
/*
* The maximum back-off value for retransmissions. This is needed for
@@ -84,7 +84,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* - feature-negotiation retransmission (sec. 6.6.3),
* - Acks in client-PARTOPEN state (sec. 8.1.5).
*/
-#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
+#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
/*
* RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
@@ -93,9 +93,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
-/* Maximal interval between probes for local resources. */
-#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
-
/* sysctl variables for DCCP */
extern int sysctl_dccp_request_retries;
extern int sysctl_dccp_retries1;
@@ -153,18 +150,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
}
/**
- * dccp_loss_free - Evaluates condition for data loss from RFC 4340, 7.7.1
- * @s1: start sequence number
- * @s2: end sequence number
+ * dccp_loss_count - Approximate the number of lost data packets in a burst loss
+ * @s1: last known sequence number before the loss ('hole')
+ * @s2: first sequence number seen after the 'hole'
* @ndp: NDP count on packet with sequence number @s2
- * Returns true if the sequence range s1...s2 has no data loss.
*/
-static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
{
s64 delta = dccp_delta_seqno(s1, s2);
WARN_ON(delta < 0);
- return (u64)delta <= ndp + 1;
+ delta -= ndp + 1;
+
+ return delta > 0 ? delta : 0;
+}
+
+/**
+ * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+ return dccp_loss_count(s1, s2, ndp) == 0;
}
enum {
@@ -189,17 +195,12 @@ enum {
#define DCCP_MIB_MAX __DCCP_MIB_MAX
struct dccp_mib {
unsigned long mibs[DCCP_MIB_MAX];
-} __SNMP_MIB_ALIGN__;
+};
DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-#define DCCP_ADD_STATS_BH(field, val) \
- SNMP_ADD_STATS_BH(dccp_statistics, field, val)
-#define DCCP_ADD_STATS_USER(field, val) \
- SNMP_ADD_STATS_USER(dccp_statistics, field, val)
/*
* Checksumming routines
@@ -223,101 +224,108 @@ static inline void dccp_csum_outgoing(struct sk_buff *skb)
skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
}
-extern void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
-extern int dccp_retransmit_skb(struct sock *sk);
+int dccp_retransmit_skb(struct sock *sk);
-extern void dccp_send_ack(struct sock *sk);
-extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
- struct request_sock *rsk);
+void dccp_send_ack(struct sock *sk);
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk);
-extern void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type);
+void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type);
-extern void dccp_write_xmit(struct sock *sk, int block);
-extern void dccp_write_space(struct sock *sk);
+/*
+ * TX Packet Dequeueing Interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+bool dccp_qpolicy_full(struct sock *sk);
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+struct sk_buff *dccp_qpolicy_top(struct sock *sk);
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
+
+/*
+ * TX Packet Output and TX Timers
+ */
+void dccp_write_xmit(struct sock *sk);
+void dccp_write_space(struct sock *sk);
+void dccp_flush_write_queue(struct sock *sk, long *time_budget);
-extern void dccp_init_xmit_timers(struct sock *sk);
+void dccp_init_xmit_timers(struct sock *sk);
static inline void dccp_clear_xmit_timers(struct sock *sk)
{
inet_csk_clear_xmit_timers(sk);
}
-extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
-extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
+const char *dccp_packet_name(const int type);
-extern void dccp_set_state(struct sock *sk, const int state);
-extern void dccp_done(struct sock *sk);
+void dccp_set_state(struct sock *sk, const int state);
+void dccp_done(struct sock *sk);
-extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
- struct sk_buff const *skb);
+int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+ struct sk_buff const *skb);
-extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-extern struct sock *dccp_create_openreq_child(struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb);
+struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
-extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst);
-extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct request_sock **prev);
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev);
-extern int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb);
-extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len);
-extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len);
+int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb);
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned int len);
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len);
-extern int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
-extern void dccp_destroy_sock(struct sock *sk);
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
+void dccp_destroy_sock(struct sock *sk);
-extern void dccp_close(struct sock *sk, long timeout);
-extern struct sk_buff *dccp_make_response(struct sock *sk,
- struct dst_entry *dst,
- struct request_sock *req);
+void dccp_close(struct sock *sk, long timeout);
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req);
-extern int dccp_connect(struct sock *sk);
-extern int dccp_disconnect(struct sock *sk, int flags);
-extern int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-extern int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
+int dccp_connect(struct sock *sk);
+int dccp_disconnect(struct sock *sk, int flags);
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
#ifdef CONFIG_COMPAT
-extern int compat_dccp_getsockopt(struct sock *sk,
- int level, int optname,
- char __user *optval, int __user *optlen);
-extern int compat_dccp_setsockopt(struct sock *sk,
- int level, int optname,
- char __user *optval, unsigned int optlen);
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
#endif
-extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
-extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t size);
-extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len);
-extern void dccp_shutdown(struct sock *sk, int how);
-extern int inet_dccp_listen(struct socket *sock, int backlog);
-extern unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
-extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len);
-
-extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
- struct sk_buff *skb);
-extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
-extern void dccp_send_close(struct sock *sk, const int active);
-extern int dccp_invalid_packet(struct sk_buff *skb);
-extern u32 dccp_sample_rtt(struct sock *sk, long delta);
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t size);
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t len, int nonblock, int flags,
+ int *addr_len);
+void dccp_shutdown(struct sock *sk, int how);
+int inet_dccp_listen(struct socket *sock, int backlog);
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait);
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
+void dccp_send_close(struct sock *sk, const int active);
+int dccp_invalid_packet(struct sk_buff *skb);
+u32 dccp_sample_rtt(struct sock *sk, long delta);
static inline int dccp_bad_service_code(const struct sock *sk,
const __be32 service)
@@ -338,12 +346,13 @@ static inline int dccp_bad_service_code(const struct sock *sk,
* @dccpd_opt_len: total length of all options (5.8) in the packet
* @dccpd_seq: sequence number
* @dccpd_ack_seq: acknowledgment number subheader field value
+ *
* This is used for transmission as well as for reception.
*/
struct dccp_skb_cb {
union {
struct inet_skb_parm h4;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
struct inet6_skb_parm h6;
#endif
} header;
@@ -412,9 +421,27 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
- dp->dccps_gsr = seq;
+ if (after48(seq, dp->dccps_gsr))
+ dp->dccps_gsr = seq;
/* Sequence validity window depends on remote Sequence Window (7.5.1) */
dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+ /*
+ * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+ * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+ * always > 32, so for the first W/W' packets in the lifetime of a
+ * connection we always have to adjust SWL.
+ * A second reason why we are doing this is that the window depends on
+ * the feature-remote value of Sequence Window: nothing stops the peer
+ * from updating this value while we are busy adjusting SWL for the
+ * first W packets (we would have to count from scratch again then).
+ * Therefore it is safer to always make sure that the Sequence Window
+ * is not artificially extended by a peer who grows SWL downwards by
+ * continually updating the feature-remote Sequence-Window.
+ * If sequence numbers wrap it is bad luck. But that will take a while
+ * (48 bit), and this measure prevents Sequence-number attacks.
+ */
+ if (before48(dp->dccps_swl, dp->dccps_isr))
+ dp->dccps_swl = dp->dccps_isr;
dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
}
@@ -425,41 +452,41 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
dp->dccps_gss = seq;
/* Ack validity window depends on local Sequence Window value (7.5.1) */
dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+ /* Adjust AWL so that it is not below ISS - see comment above for SWL */
+ if (before48(dp->dccps_awl, dp->dccps_iss))
+ dp->dccps_awl = dp->dccps_iss;
dp->dccps_awh = dp->dccps_gss;
}
+static inline int dccp_ackvec_pending(const struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
+ !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
+}
+
static inline int dccp_ack_pending(const struct sock *sk)
{
- const struct dccp_sock *dp = dccp_sk(sk);
- return dp->dccps_timestamp_echo != 0 ||
- (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
- inet_csk_ack_scheduled(sk);
+ return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
}
-extern int dccp_feat_finalise_settings(struct dccp_sock *dp);
-extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
-extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
- struct sk_buff *skb);
-extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
-extern void dccp_feat_list_purge(struct list_head *fn_list);
-
-extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
-extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
-extern int dccp_insert_option_elapsed_time(struct sock *sk,
- struct sk_buff *skb,
- u32 elapsed_time);
-extern u32 dccp_timestamp(void);
-extern void dccp_timestamping_init(void);
-extern int dccp_insert_option_timestamp(struct sock *sk,
- struct sk_buff *skb);
-extern int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- unsigned char option,
- const void *value, unsigned char len);
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
+int dccp_feat_finalise_settings(struct dccp_sock *dp);
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
+int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
+ struct sk_buff *skb);
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
+void dccp_feat_list_purge(struct list_head *fn_list);
+
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
+u32 dccp_timestamp(void);
+void dccp_timestamping_init(void);
+int dccp_insert_option(struct sk_buff *skb, unsigned char option,
+ const void *value, unsigned char len);
#ifdef CONFIG_SYSCTL
-extern int dccp_sysctl_init(void);
-extern void dccp_sysctl_exit(void);
+int dccp_sysctl_init(void);
+void dccp_sysctl_exit(void);
#else
static inline int dccp_sysctl_init(void)
{
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index b21f261da75..028fc43aacb 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -48,11 +48,23 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
dccp_get_info(sk, _info);
}
+static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
+}
+
+static int dccp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
+}
+
static const struct inet_diag_handler dccp_diag_handler = {
- .idiag_hashinfo = &dccp_hashinfo,
+ .dump = dccp_diag_dump,
+ .dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
- .idiag_type = DCCPDIAG_GETSOCK,
- .idiag_info_size = sizeof(struct tcp_info),
+ .idiag_type = IPPROTO_DCCP,
};
static int __init dccp_diag_init(void)
@@ -71,4 +83,4 @@ module_exit(dccp_diag_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCP inet_diag handler");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, DCCPDIAG_GETSOCK);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 972b8dc918d..9733ddbc96c 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -12,6 +12,7 @@
* -----------
* o Feature negotiation is coordinated with connection setup (as in TCP), wild
* changes of parameters of an established connection are not supported.
+ * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
* o All currently known SP features have 1-byte quantities. If in the future
* extensions of RFCs 4340..42 define features with item lengths larger than
* one byte, a feature-specific extension of the code will be required.
@@ -22,6 +23,7 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
+#include <linux/slab.h>
#include "ccid.h"
#include "feat.h"
@@ -342,6 +344,21 @@ static int __dccp_feat_activate(struct sock *sk, const int idx,
return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
}
+/**
+ * dccp_feat_activate - Activate feature value on socket
+ * @sk: fully connected DCCP socket (after handshake is complete)
+ * @feat_num: feature to activate, one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ *
+ * For general use this function is preferable over __dccp_feat_activate().
+ */
+static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
+ dccp_feat_val const *fval)
+{
+ return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
+}
+
/* Test for "Req'd" feature (RFC 4340, 6.4) */
static inline int dccp_feat_must_be_understood(u8 feat_num)
{
@@ -430,6 +447,7 @@ static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
* @head: list to add to
* @feat: feature number
* @local: whether the local (1) or remote feature with number @feat is meant
+ *
* This is the only constructor and serves to ensure the above invariants.
*/
static struct dccp_feat_entry *
@@ -474,8 +492,8 @@ static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
new->feat_num = feat;
new->is_local = local;
new->state = FEAT_INITIALISING;
- new->needs_confirm = 0;
- new->empty_confirm = 0;
+ new->needs_confirm = false;
+ new->empty_confirm = false;
new->val = *fval;
new->needs_mandatory = mandatory;
@@ -488,6 +506,7 @@ static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
* @feat: one of %dccp_feature_numbers
* @local: whether local (1) or remote (0) @feat_num is being confirmed
* @fval: pointer to NN/SP value to be inserted or NULL
+ *
* Returns 0 on success, a Reset code for further processing otherwise.
*/
static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
@@ -501,12 +520,12 @@ static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
new->feat_num = feat;
new->is_local = local;
new->state = FEAT_STABLE; /* transition in 6.6.2 */
- new->needs_confirm = 1;
+ new->needs_confirm = true;
new->empty_confirm = (fval == NULL);
new->val.nn = 0; /* zeroes the whole structure */
if (!new->empty_confirm)
new->val = *fval;
- new->needs_mandatory = 0;
+ new->needs_mandatory = false;
return 0;
}
@@ -649,11 +668,22 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
return -1;
if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
return -1;
- /*
- * Enter CHANGING after transmitting the Change option (6.6.2).
- */
- if (pos->state == FEAT_INITIALISING)
- pos->state = FEAT_CHANGING;
+
+ if (skb->sk->sk_state == DCCP_OPEN &&
+ (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
+ /*
+ * Confirms don't get retransmitted (6.6.3) once the
+ * connection is in state OPEN
+ */
+ dccp_feat_list_pop(pos);
+ } else {
+ /*
+ * Enter CHANGING after transmitting the Change
+ * option (6.6.2).
+ */
+ if (pos->state == FEAT_INITIALISING)
+ pos->state = FEAT_CHANGING;
+ }
}
return 0;
}
@@ -664,6 +694,7 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
* @feat: an NN feature from %dccp_feature_numbers
* @mandatory: use Mandatory option if 1
* @nn_val: value to register (restricted to 4 bytes)
+ *
* Note that NN features are local by definition (RFC 4340, 6.3.2).
*/
static int __feat_register_nn(struct list_head *fn, u8 feat,
@@ -729,16 +760,72 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
0, list, len);
}
-/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
-int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
+/**
+ * dccp_feat_nn_get - Query current/pending value of NN feature
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ *
+ * For a known NN feature, returns value currently being negotiated, or
+ * current (confirmed) value if no negotiation is going on.
+ */
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
{
- /* any changes must be registered before establishing the connection */
- if (sk->sk_state != DCCP_CLOSED)
- return -EISCONN;
- if (dccp_feat_type(feat) != FEAT_NN)
+ if (dccp_feat_type(feat) == FEAT_NN) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *entry;
+
+ entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
+ if (entry != NULL)
+ return entry->val.nn;
+
+ switch (feat) {
+ case DCCPF_ACK_RATIO:
+ return dp->dccps_l_ack_ratio;
+ case DCCPF_SEQUENCE_WINDOW:
+ return dp->dccps_l_seq_win;
+ }
+ }
+ DCCP_BUG("attempt to look up unsupported feature %u", feat);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
+
+/**
+ * dccp_feat_signal_nn_change - Update NN values for an established connection
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * @nn_val: the new value to use
+ *
+ * This function is used to communicate NN updates out-of-band.
+ */
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ dccp_feat_val fval = { .nn = nn_val };
+ struct dccp_feat_entry *entry;
+
+ if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+ return 0;
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
return -EINVAL;
- return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
+
+ if (nn_val == dccp_feat_nn_get(sk, feat))
+ return 0; /* already set or negotiation under way */
+
+ entry = dccp_feat_list_lookup(fn, feat, 1);
+ if (entry != NULL) {
+ dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
+ (unsigned long long)entry->val.nn,
+ (unsigned long long)nn_val);
+ dccp_feat_list_pop(entry);
+ }
+
+ inet_csk_schedule_ack(sk);
+ return dccp_feat_push_change(fn, feat, 1, 0, &fval);
}
+EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
/*
* Tracking features whose value depend on the choice of CCID
@@ -849,6 +936,7 @@ static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
* @fn: feature-negotiation list to update
* @id: CCID number to track
* @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ *
* This function needs to be called after registering all other features.
*/
static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
@@ -872,6 +960,7 @@ static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
/**
* dccp_feat_finalise_settings - Finalise settings before starting negotiation
* @dp: client or listening socket (settings will be inherited)
+ *
* This is called after all registrations (socket initialisation, sysctls, and
* sockopt calls), and before sending the first packet containing Change options
* (ie. client-Request or server-Response), to ensure internal consistency.
@@ -1074,7 +1163,7 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
}
if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
- entry->empty_confirm = 0;
+ entry->empty_confirm = false;
} else if (is_mandatory) {
return DCCP_RESET_CODE_MANDATORY_ERROR;
} else if (entry->state == FEAT_INITIALISING) {
@@ -1090,10 +1179,10 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
defval = dccp_feat_default_value(feat);
if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
return DCCP_RESET_CODE_OPTION_ERROR;
- entry->empty_confirm = 1;
+ entry->empty_confirm = true;
}
- entry->needs_confirm = 1;
- entry->needs_mandatory = 0;
+ entry->needs_confirm = true;
+ entry->needs_mandatory = false;
entry->state = FEAT_STABLE;
return 0;
@@ -1196,6 +1285,101 @@ confirmation_failed:
}
/**
+ * dccp_feat_handle_nn_established - Fast-path reception of NN options
+ * @sk: socket of an established DCCP connection
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
+ * @feat: NN number, one of %dccp_feature_numbers
+ * @val: NN value
+ * @len: length of @val in bytes
+ *
+ * This function combines the functionality of change_recv/confirm_recv, with
+ * the following differences (reset codes are the same):
+ * - cleanup after receiving the Confirm;
+ * - values are directly activated after successful parsing;
+ * - deliberately restricted to NN features.
+ * The restriction to NN features is essential since SP features can have non-
+ * predictable outcomes (depending on the remote configuration), and are inter-
+ * dependent (CCIDs for instance cause further dependencies).
+ */
+static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry;
+ u8 type = dccp_feat_type(feat);
+ dccp_feat_val fval;
+
+ dccp_feat_print_opt(opt, feat, val, len, mandatory);
+
+ /* Ignore non-mandatory unknown and non-NN features */
+ if (type == FEAT_UNKNOWN) {
+ if (local && !mandatory)
+ return 0;
+ goto fast_path_unknown;
+ } else if (type != FEAT_NN) {
+ return 0;
+ }
+
+ /*
+ * We don't accept empty Confirms, since in fast-path feature
+ * negotiation the values are enabled immediately after sending
+ * the Change option.
+ * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
+ */
+ if (len == 0 || len > sizeof(fval.nn))
+ goto fast_path_unknown;
+
+ if (opt == DCCPO_CHANGE_L) {
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto fast_path_unknown;
+
+ if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
+ dccp_feat_activate(sk, feat, local, &fval))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ /* set the `Ack Pending' flag to piggyback a Confirm */
+ inet_csk_schedule_ack(sk);
+
+ } else if (opt == DCCPO_CONFIRM_R) {
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL || entry->state != FEAT_CHANGING)
+ return 0;
+
+ fval.nn = dccp_decode_value_var(val, len);
+ /*
+ * Just ignore a value that doesn't match our current value.
+ * If the option changes twice within two RTTs, then at least
+ * one CONFIRM will be received for the old value after a
+ * new CHANGE was sent.
+ */
+ if (fval.nn != entry->val.nn)
+ return 0;
+
+ /* Only activate after receiving the Confirm option (6.6.1). */
+ dccp_feat_activate(sk, feat, local, &fval);
+
+ /* It has been confirmed - so remove the entry */
+ dccp_feat_list_pop(entry);
+
+ } else {
+ DCCP_WARN("Received illegal option %u\n", opt);
+ goto fast_path_failed;
+ }
+ return 0;
+
+fast_path_unknown:
+ if (!mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
+
+fast_path_failed:
+ return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
* dccp_feat_parse_options - Process Feature-Negotiation Options
* @sk: for general use and used by the client during connection setup
* @dreq: used by the server during connection setup
@@ -1204,6 +1388,7 @@ confirmation_failed:
* @feat: one of %dccp_feature_numbers
* @val: value contents of @opt
* @len: length of @val in bytes
+ *
* Returns 0 on success, a Reset code for ending the connection otherwise.
*/
int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
@@ -1230,6 +1415,14 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
val, len, server);
}
+ break;
+ /*
+ * Support for exchanging NN options on an established connection.
+ */
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
+ val, len);
}
return 0; /* ignore FN options in all other states */
}
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f96721619de..0e75cebb218 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -107,14 +107,13 @@ extern unsigned long sysctl_dccp_sequence_window;
extern int sysctl_dccp_rx_ccid;
extern int sysctl_dccp_tx_ccid;
-extern int dccp_feat_init(struct sock *sk);
-extern void dccp_feat_initialise_sysctls(void);
-extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
- u8 const *list, u8 len);
-extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
-extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
- u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
-extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
+int dccp_feat_init(struct sock *sk);
+void dccp_feat_initialise_sysctls(void);
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len);
+int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
+ u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
+int dccp_feat_clone_list(struct list_head const *, struct list_head *);
/*
* Encoding variable-length options and their maximum length.
@@ -128,10 +127,11 @@ extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
*/
#define DCCP_OPTVAL_MAXLEN 6
-extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
-extern u64 dccp_decode_value_var(const u8 *bf, const u8 len);
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
+u64 dccp_decode_value_var(const u8 *bf, const u8 len);
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
-extern int dccp_insert_option_mandatory(struct sk_buff *skb);
-extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
- u8 *val, u8 len, bool repeat_first);
+int dccp_insert_option_mandatory(struct sk_buff *skb);
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
+ bool repeat_first);
#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 7648f316310..3c8ec7d4a34 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -12,6 +12,7 @@
#include <linux/dccp.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/sock.h>
@@ -27,7 +28,7 @@ static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
static void dccp_fin(struct sock *sk, struct sk_buff *skb)
@@ -123,9 +124,9 @@ static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
return queued;
}
-static u8 dccp_reset_code_convert(const u8 code)
+static u16 dccp_reset_code_convert(const u8 code)
{
- const u8 error_code[] = {
+ const u16 error_code[] = {
[DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
[DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
[DCCP_RESET_CODE_ABORTED] = ECONNRESET,
@@ -147,7 +148,7 @@ static u8 dccp_reset_code_convert(const u8 code)
static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
{
- u8 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
+ u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
sk->sk_err = err;
@@ -159,13 +160,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
}
-static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
{
- struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
- if (dp->dccps_hc_rx_ackvec != NULL)
- dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ if (av == NULL)
+ return;
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ dccp_ackvec_input(av, skb);
}
static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -238,7 +241,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
dccp_update_gsr(sk, seqno);
if (dh->dccph_type != DCCP_PKT_SYNC &&
- (ackno != DCCP_PKT_WITHOUT_ACK_SEQ))
+ ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+ after48(ackno, dp->dccps_gar))
dp->dccps_gar = ackno;
} else {
unsigned long now = jiffies;
@@ -256,9 +260,9 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
*/
if (time_before(now, (dp->dccps_rate_last +
sysctl_dccp_sync_ratelimit)))
- return 0;
+ return -1;
- DCCP_WARN("DCCP: Step 6 failed for %s packet, "
+ DCCP_WARN("Step 6 failed for %s packet, "
"(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
"(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
"sending SYNC...\n", dccp_packet_name(dh->dccph_type),
@@ -281,7 +285,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
}
static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+ const struct dccp_hdr *dh, const unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -362,24 +366,15 @@ discard:
}
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+ const struct dccp_hdr *dh, const unsigned int len)
{
- struct dccp_sock *dp = dccp_sk(sk);
-
if (dccp_check_seqno(sk, skb))
goto discard;
if (dccp_parse_options(sk, NULL, skb))
return 1;
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
+ dccp_handle_ackvec_processing(sk, skb);
dccp_deliver_input_to_ccids(sk, skb);
return __dccp_rcv_established(sk, skb, dh, len);
@@ -393,7 +388,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_established);
static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
- const unsigned len)
+ const unsigned int len)
{
/*
* Step 4: Prepare sequence numbers in REQUEST
@@ -414,7 +409,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
dp->dccps_awl, dp->dccps_awh)) {
dccp_pr_debug("invalid ackno: S.AWL=%llu, "
- "P.ackno=%llu, S.AWH=%llu \n",
+ "P.ackno=%llu, S.AWH=%llu\n",
(unsigned long long)dp->dccps_awl,
(unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
(unsigned long long)dp->dccps_awh);
@@ -429,7 +424,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
if (dccp_parse_options(sk, NULL, skb))
return 1;
- /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
dp->dccps_options_received.dccpor_timestamp_echo));
@@ -440,20 +435,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
kfree_skb(sk->sk_send_head);
sk->sk_send_head = NULL;
- dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_update_gsr(sk, dp->dccps_isr);
/*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- *
- * AWL was adjusted in dccp_v4_connect -acme
+ * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+ * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+ * is done as part of activating the feature values below, since
+ * these settings depend on the local/remote Sequence Window
+ * features, which were undefined or not confirmed until now.
*/
- dccp_set_seqno(&dp->dccps_swl,
- max48(dp->dccps_swl, dp->dccps_isr));
+ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -532,8 +521,10 @@ unable_to_proceed:
static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
- const unsigned len)
+ const unsigned int len)
{
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
int queued = 0;
switch (dh->dccph_type) {
@@ -558,7 +549,14 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
if (sk->sk_state == DCCP_PARTOPEN)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+ if (likely(sample)) {
+ long delta = dccp_timestamp() - sample;
+
+ dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
+ }
+
+ dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_set_state(sk, DCCP_OPEN);
if (dh->dccph_type == DCCP_PKT_DATAACK ||
@@ -574,7 +572,7 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
}
int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len)
+ struct dccp_hdr *dh, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -616,30 +614,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Caller (dccp_v4_do_rcv) will send Reset */
dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
return 1;
+ } else if (sk->sk_state == DCCP_CLOSED) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
}
- if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- /*
- * Step 8: Process options and mark acknowledgeable
- */
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
+ /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
+ if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
+ goto discard;
- dccp_deliver_input_to_ccids(sk, skb);
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_RESPONSE) ||
+ (dp->dccps_role == DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_REQUEST) ||
+ (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+ goto discard;
}
+ /* Step 8: Process options */
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
/*
* Step 9: Process Reset
* If P.type == Reset,
@@ -647,41 +651,21 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* S.state := TIMEWAIT
* Set TIMEWAIT timer
* Drop packet and return
- */
+ */
if (dh->dccph_type == DCCP_PKT_RESET) {
dccp_rcv_reset(sk, skb);
return 0;
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_RESPONSE) ||
- (dp->dccps_role == DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_REQUEST) ||
- (sk->sk_state == DCCP_RESPOND &&
- dh->dccph_type == DCCP_PKT_DATA)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+ } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
if (dccp_rcv_closereq(sk, skb))
return 0;
goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+ } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
if (dccp_rcv_close(sk, skb))
return 0;
goto discard;
}
switch (sk->sk_state) {
- case DCCP_CLOSED:
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
-
case DCCP_REQUESTING:
queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
if (queued >= 0)
@@ -690,8 +674,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
__kfree_skb(skb);
return 0;
- case DCCP_RESPOND:
case DCCP_PARTOPEN:
+ /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
+ dccp_handle_ackvec_processing(sk, skb);
+ dccp_deliver_input_to_ccids(sk, skb);
+ /* fall through */
+ case DCCP_RESPOND:
queued = dccp_rcv_respond_partopen_state_process(sk, skb,
dh, len);
break;
@@ -722,6 +710,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
/**
* dccp_sample_rtt - Validate and finalise computation of RTT sample
* @delta: number of microseconds between packet and acknowledgment
+ *
* The routine is kept generic to work in different contexts. It should be
* called immediately when the ACK used for the RTT sample arrives.
*/
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 4071eaf2b36..6ca645c4b48 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -12,6 +12,7 @@
#include <linux/dccp.h>
#include <linux/icmp.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/random.h>
@@ -25,6 +26,7 @@
#include <net/timewait_sock.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include "ackvec.h"
#include "ccid.h"
@@ -39,13 +41,15 @@
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct rtable *rt;
+ __be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
- int tmp;
+ struct flowi4 *fl4;
+ struct rtable *rt;
int err;
+ struct ip_options_rcu *inet_opt;
dp->dccps_role = DCCP_ROLE_CLIENT;
@@ -56,37 +60,43 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
- if (inet->opt != NULL && inet->opt->srr) {
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt != NULL && inet_opt->opt.srr) {
if (daddr == 0)
return -EINVAL;
- nexthop = inet->opt->faddr;
+ nexthop = inet_opt->opt.faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_DCCP,
- inet->inet_sport, usin->sin_port, sk, 1);
- if (tmp < 0)
- return tmp;
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
- if (inet->opt == NULL || !inet->opt->srr)
- daddr = rt->rt_dst;
+ if (inet_opt == NULL || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
if (inet->inet_saddr == 0)
- inet->inet_saddr = rt->rt_src;
+ inet->inet_saddr = fl4->saddr;
inet->inet_rcv_saddr = inet->inet_saddr;
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet->opt != NULL)
- inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
/*
* Socket identity is still unknown (sport may be zero).
* However we set state to DCCP_REQUESTING and not releasing socket
@@ -98,13 +108,15 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err != 0)
goto failure;
- err = ip_route_newports(&rt, IPPROTO_DCCP, inet->inet_sport,
- inet->inet_dport, sk);
- if (err != 0)
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto failure;
-
+ }
/* OK, now commit destination to socket. */
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
@@ -149,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
if (sk->sk_state == DCCP_LISTEN)
return;
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
return;
- dst->ops->update_pmtu(dst, mtu);
-
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
@@ -169,6 +174,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
dccp_sync_mss(sk, mtu);
@@ -183,6 +189,14 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
} /* else let the usual retransmit timer handle it */
}
+static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
/*
* This routine is called by the ICMP module when it gets some sort of error
* condition. If err < 0 then the socket should be closed and the error
@@ -247,6 +261,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
}
switch (type) {
+ case ICMP_REDIRECT:
+ dccp_do_redirect(skb, sk);
+ goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
@@ -288,7 +305,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
*/
WARN_ON(req->sk);
- if (seq != dccp_rsk(req)->dreq_iss) {
+ if (!between48(seq, dccp_rsk(req)->dreq_iss,
+ dccp_rsk(req)->dreq_gss)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -348,7 +366,7 @@ static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
}
-void dccp_v4_send_check(struct sock *sk, int unused, struct sk_buff *skb)
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
const struct inet_sock *inet = inet_sk(sk);
struct dccp_hdr *dh = dccp_hdr(skb);
@@ -386,39 +404,45 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto exit;
-
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto exit;
-
- sk_setup_caps(newsk, dst);
+ goto exit_nonewsk;
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->inet_daddr = ireq->rmt_addr;
- newinet->inet_rcv_saddr = ireq->loc_addr;
- newinet->inet_saddr = ireq->loc_addr;
- newinet->opt = ireq->opt;
+ newinet->inet_daddr = ireq->ir_rmt_addr;
+ newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ newinet->inet_opt = ireq->opt;
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->inet_id = jiffies;
+ if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
+ goto put_and_exit;
+
+ sk_setup_caps(newsk, dst);
+
dccp_sync_mss(newsk, dst_mtu(dst));
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
__inet_hash_nolisten(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
exit_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
- dst_release(dst);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto exit;
}
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
@@ -456,35 +480,35 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
struct rtable *rt;
- struct flowi fl = { .oif = skb_rtable(skb)->rt_iif,
- .nl_u = { .ip4_u =
- { .daddr = ip_hdr(skb)->saddr,
- .saddr = ip_hdr(skb)->daddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = dccp_hdr(skb)->dccph_dport,
- .dport = dccp_hdr(skb)->dccph_sport }
- }
- };
-
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_flow(net, &rt, &fl, sk, 0)) {
+ const struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .flowi4_oif = inet_iif(skb),
+ .daddr = iph->saddr,
+ .saddr = iph->daddr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = sk->sk_protocol,
+ .fl4_sport = dccp_hdr(skb)->dccph_dport,
+ .fl4_dport = dccp_hdr(skb)->dccph_sport,
+ };
+
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
- return &rt->u.dst;
+ return &rt->dst;
}
-static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
- struct request_values *rv_unused)
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
{
int err = -1;
struct sk_buff *skb;
struct dst_entry *dst;
+ struct flowi4 fl4;
- dst = inet_csk_route_req(sk, req);
+ dst = inet_csk_route_req(sk, &fl4, req);
if (dst == NULL)
goto out;
@@ -493,10 +517,10 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
const struct inet_request_sock *ireq = inet_rsk(req);
struct dccp_hdr *dh = dccp_hdr(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
- ireq->rmt_addr);
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
+ dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
}
@@ -554,6 +578,11 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
+void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+{
+}
+EXPORT_SYMBOL(dccp_syn_ack_timeout);
+
static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct dccp_request_sock),
@@ -561,6 +590,7 @@ static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
.send_ack = dccp_reqsk_send_ack,
.destructor = dccp_v4_reqsk_destructor,
.send_reset = dccp_v4_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
};
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -612,22 +642,23 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
ireq = inet_rsk(req);
- ireq->loc_addr = ip_hdr(skb)->daddr;
- ireq->rmt_addr = ip_hdr(skb)->saddr;
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
/*
* Step 3: Process LISTEN state
*
* Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
*
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
+ * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
*/
dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
dreq->dreq_iss = dccp_v4_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v4_send_response(sk, req, NULL))
+ if (dccp_v4_send_response(sk, req))
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
@@ -958,6 +989,7 @@ static const struct net_protocol dccp_v4_protocol = {
.err_handler = dccp_v4_err,
.no_policy = 1,
.netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
};
static const struct proto_ops inet_dccp_ops = {
@@ -992,7 +1024,6 @@ static struct inet_protosw dccp_v4_protosw = {
.protocol = IPPROTO_DCCP,
.prot = &dccp_v4_prot,
.ops = &inet_dccp_ops,
- .no_check = 0,
.flags = INET_PROTOSW_ICSK,
};
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index af3394df63b..4db3c2a1679 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/xfrm.h>
#include <net/addrconf.h>
@@ -28,6 +29,7 @@
#include <net/transp_v6.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include "dccp.h"
#include "ipv6.h"
@@ -53,29 +55,22 @@ static void dccp_v6_hash(struct sock *sk)
/* add pseudo-header to DCCP checksum stored in skb->csum */
static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
- struct in6_addr *saddr,
- struct in6_addr *daddr)
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
{
return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
}
-static inline void dccp_v6_send_check(struct sock *sk, int unused_value,
- struct sk_buff *skb)
+static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_hdr *dh = dccp_hdr(skb);
dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &np->daddr);
+ dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
}
-static inline __u32 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport )
-{
- return secure_tcpv6_sequence_number(saddr, daddr, sport, dport);
-}
-
-static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
+static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
{
return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32,
@@ -87,7 +82,7 @@ static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
- struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
struct dccp_sock *dp;
struct ipv6_pinfo *np;
@@ -135,51 +130,31 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
np = inet6_sk(sk);
+ if (type == NDISC_REDIRECT) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ goto out;
+ }
+
if (type == ICMPV6_PKT_TOOBIG) {
struct dst_entry *dst = NULL;
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+
if (sock_owned_by_user(sk))
goto out;
if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
goto out;
- /* icmp should have updated the destination cache entry */
- dst = __sk_dst_check(sk, np->dst_cookie);
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
-
- /* BUGGG_FUTURE: Again, it is not clear how
- to handle rthdr case. Ignore this complexity
- for now.
- */
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->inet_dport;
- fl.fl_ip_sport = inet->inet_sport;
- security_sk_classify_flow(sk, &fl);
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- err = xfrm_lookup(net, &dst, &fl, sk, 0);
- if (err < 0) {
- sk->sk_err_soft = -err;
- goto out;
- }
- } else
- dst_hold(dst);
+ dst = inet6_csk_update_pmtu(sk, ntohl(info));
+ if (!dst)
+ goto out;
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
dccp_sync_mss(sk, dst_mtu(dst));
- } /* else let the usual retransmit timer handle it */
- dst_release(dst);
goto out;
}
@@ -204,7 +179,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
*/
WARN_ON(req->sk != NULL);
- if (seq != dccp_rsk(req)->dreq_iss) {
+ if (!between48(seq, dccp_rsk(req)->dreq_iss,
+ dccp_rsk(req)->dreq_gss)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -241,64 +217,49 @@ out:
}
-static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
- struct request_values *rv_unused)
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
- struct ipv6_txoptions *opt = NULL;
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
int err = -1;
struct dst_entry *dst;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = ireq6->iif;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_rsk(req)->loc_port;
- security_req_classify_flow(req, &fl);
-
- opt = np->opt;
-
- if (opt != NULL && opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowlabel = 0;
+ fl6.flowi6_oif = ireq->ir_iif;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0);
- if (err < 0)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
goto done;
+ }
skb = dccp_make_response(sk, dst, req);
if (skb != NULL) {
struct dccp_hdr *dh = dccp_hdr(skb);
dh->dccph_checksum = dccp_v6_csum_finish(skb,
- &ireq6->loc_addr,
- &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt, 0);
+ &ireq->ir_v6_loc_addr,
+ &ireq->ir_v6_rmt_addr);
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
err = net_xmit_eval(err);
}
done:
- if (opt != NULL && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
dst_release(dst);
return err;
}
@@ -306,15 +267,14 @@ done:
static void dccp_v6_reqsk_destructor(struct request_sock *req)
{
dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
- if (inet6_rsk(req)->pktopts != NULL)
- kfree_skb(inet6_rsk(req)->pktopts);
+ kfree_skb(inet_rsk(req)->pktopts);
}
static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
{
- struct ipv6hdr *rxip6h;
+ const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
- struct flowi fl;
+ struct flowi6 fl6;
struct net *net = dev_net(skb_dst(rxskb)->dev);
struct sock *ctl_sk = net->dccp.v6_ctl_sk;
struct dst_entry *dst;
@@ -333,25 +293,24 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
&rxip6h->daddr);
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &rxip6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &rxip6h->daddr);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = rxip6h->saddr;
+ fl6.saddr = rxip6h->daddr;
- fl.proto = IPPROTO_DCCP;
- fl.oif = inet6_iif(rxskb);
- fl.fl_ip_dport = dccp_hdr(skb)->dccph_dport;
- fl.fl_ip_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, &fl);
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.flowi6_oif = inet6_iif(rxskb);
+ fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
+ fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
+ security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
- if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) {
- if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) {
- skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl, NULL, 0);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- return;
- }
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(skb, dst);
+ ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ return;
}
kfree_skb(skb);
@@ -364,6 +323,7 @@ static struct request_sock_ops dccp6_request_sock_ops = {
.send_ack = dccp_reqsk_send_ack,
.destructor = dccp_v6_reqsk_destructor,
.send_reset = dccp_v6_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
};
static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
@@ -401,7 +361,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct request_sock *req;
struct dccp_request_sock *dreq;
- struct inet6_request_sock *ireq6;
+ struct inet_request_sock *ireq;
struct ipv6_pinfo *np = inet6_sk(sk);
const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -440,36 +400,37 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
- ireq6 = inet6_rsk(req);
- ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
- ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
+ ireq = inet_rsk(req);
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
if (ipv6_opt_accepted(sk, skb) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
atomic_inc(&skb->users);
- ireq6->pktopts = skb;
+ ireq->pktopts = skb;
}
- ireq6->iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = sk->sk_bound_dev_if;
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- ireq6->iif = inet6_iif(skb);
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = inet6_iif(skb);
/*
* Step 3: Process LISTEN state
*
* Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
*
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
+ * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
*/
dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
dreq->dreq_iss = dccp_v6_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v6_send_response(sk, req, NULL))
+ if (dccp_v6_send_response(sk, req))
goto drop_and_free;
inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
@@ -487,13 +448,11 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
struct request_sock *req,
struct dst_entry *dst)
{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct inet_sock *newinet;
- struct dccp_sock *newdp;
struct dccp6_sock *newdp6;
struct sock *newsk;
- struct ipv6_txoptions *opt;
if (skb->protocol == htons(ETH_P_IP)) {
/*
@@ -504,18 +463,17 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
return NULL;
newdp6 = (struct dccp6_sock *)newsk;
- newdp = dccp_sk(newsk);
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
+ ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr);
ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
- ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+ newsk->sk_v6_rcv_saddr = newnp->saddr;
inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
@@ -539,44 +497,32 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
return newsk;
}
- opt = np->opt;
if (sk_acceptq_is_full(sk))
goto out_overflow;
if (dst == NULL) {
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- if (opt != NULL && opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_rsk(req)->loc_port;
- security_sk_classify_flow(sk, &fl);
-
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0)
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst))
goto out;
}
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto out;
+ goto out_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -590,31 +536,30 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newdp6 = (struct dccp6_sock *)newsk;
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
- newdp = dccp_sk(newsk);
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
- ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
- ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
- newsk->sk_bound_dev_if = ireq6->iif;
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newnp->saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
/* Now IPv6 options...
First: no IPv4 options.
*/
- newinet->opt = NULL;
+ newinet->inet_opt = NULL;
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
/* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (ireq6->pktopts != NULL) {
- newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
- kfree_skb(ireq6->pktopts);
- ireq6->pktopts = NULL;
+ if (ireq->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
if (newnp->pktoptions)
skb_set_owner_r(newnp->pktoptions, newsk);
}
@@ -628,11 +573,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
* Yes, keeping reference count would be much more clever, but we make
* one more one thing there: reattach optmem to newsk.
*/
- if (opt != NULL) {
- newnp->opt = ipv6_dup_options(newsk, opt);
- if (opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- }
+ if (np->opt != NULL)
+ newnp->opt = ipv6_dup_options(newsk, np->opt);
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newnp->opt != NULL)
@@ -644,18 +586,21 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto out;
+ }
__inet6_hash(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
out_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+ dst_release(dst);
out:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
- if (opt != NULL && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- dst_release(dst);
return NULL;
}
@@ -885,8 +830,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p = NULL, final;
- struct flowi fl;
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
int err;
@@ -899,17 +844,16 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+ fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
- ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
fl6_sock_release(flowlabel);
}
}
@@ -942,8 +886,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
return -EINVAL;
}
- ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
+ sk->sk_v6_daddr = usin->sin6_addr;
+ np->flow_label = fl6.flowlabel;
/*
* DCCP over IPv4
@@ -972,52 +916,37 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
goto failure;
}
ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
- ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &np->rcv_saddr);
+ ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr);
return err;
}
- if (!ipv6_addr_any(&np->rcv_saddr))
- saddr = &np->rcv_saddr;
-
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->inet_sport;
- security_sk_classify_flow(sk, &fl);
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ saddr = &sk->sk_v6_rcv_saddr;
- if (np->opt != NULL && np->opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto failure;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto failure;
}
if (saddr == NULL) {
- saddr = &fl.fl6_src;
- ipv6_addr_copy(&np->rcv_saddr, saddr);
+ saddr = &fl6.saddr;
+ sk->sk_v6_rcv_saddr = *saddr;
}
/* set the source address */
- ipv6_addr_copy(&np->saddr, saddr);
+ np->saddr = *saddr;
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
__ip6_dst_store(sk, dst, NULL, NULL);
@@ -1035,7 +964,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
goto late_failure;
dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
- np->daddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
inet->inet_dport);
err = dccp_connect(sk);
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
index 6eef81fdbe5..af259e15e7f 100644
--- a/net/dccp/ipv6.h
+++ b/net/dccp/ipv6.h
@@ -25,12 +25,10 @@ struct dccp6_sock {
struct dccp6_request_sock {
struct dccp_request_sock dccp;
- struct inet6_request_sock inet6;
};
struct dccp6_timewait_sock {
struct inet_timewait_sock inet;
- struct inet6_timewait_sock tw6;
};
#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 0d508c359fa..c69eb9c4fbb 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -11,6 +11,7 @@
*/
#include <linux/dccp.h>
+#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/timer.h>
@@ -52,15 +53,12 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
const struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -99,7 +97,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
* (* Generate a new socket and switch to that socket *)
* Set S := new socket for this port pair
*/
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
if (newsk != NULL) {
struct dccp_request_sock *dreq = dccp_rsk(req);
@@ -120,30 +118,20 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
*
* Choose S.ISS (initial seqno) or set from Init Cookies
* Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Set S.ISR, S.GSR from packet (or Init Cookies)
+ *
+ * Setting AWL/AWH and SWL/SWH happens as part of the feature
+ * activation below, as these windows all depend on the local
+ * and remote Sequence Window feature values (7.5.2).
*/
- newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
- dccp_update_gss(newsk, dreq->dreq_iss);
-
+ newdp->dccps_iss = dreq->dreq_iss;
+ newdp->dccps_gss = dreq->dreq_gss;
+ newdp->dccps_gar = newdp->dccps_iss;
newdp->dccps_isr = dreq->dreq_isr;
- dccp_update_gsr(newsk, dreq->dreq_isr);
-
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_set_seqno(&newdp->dccps_swl,
- max48(newdp->dccps_swl, newdp->dccps_isr));
- dccp_set_seqno(&newdp->dccps_awl,
- max48(newdp->dccps_awl, newdp->dccps_iss));
+ newdp->dccps_gsr = dreq->dreq_gsr;
/*
- * Activate features after initialising the sequence numbers,
- * since CCID initialisation may depend on GSS, ISR, ISS etc.
+ * Activate features: initialise CCIDs, sequence windows etc.
*/
if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
/* It is still raw copy of parent, so invalidate
@@ -175,16 +163,15 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
/* Check for retransmitted REQUEST */
if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
- if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) {
+ if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
dccp_pr_debug("Retransmitted REQUEST\n");
- dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
/*
* Send another RESPONSE packet
* To protect against Request floods, increment retrans
* counter (backoff, monitored by dccp_response_timer).
*/
- req->retrans++;
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ inet_rtx_syn_ack(sk, req);
}
/* Network Duplicate, discard packet */
return NULL;
@@ -197,12 +184,14 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
goto drop;
/* Invalid ACK */
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dreq->dreq_iss) {
+ if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dreq->dreq_iss, dreq->dreq_gss)) {
dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
- "dreq_iss=%llu\n",
+ "dreq_iss=%llu, dreq_gss=%llu\n",
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long) dreq->dreq_iss);
+ (unsigned long long) dreq->dreq_iss,
+ (unsigned long long) dreq->dreq_gss);
goto drop;
}
@@ -248,7 +237,7 @@ int dccp_child_process(struct sock *parent, struct sock *child,
/* Wakeup parent, send SIGIO */
if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
@@ -277,10 +266,10 @@ int dccp_reqsk_init(struct request_sock *req,
{
struct dccp_request_sock *dreq = dccp_rsk(req);
- inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
- inet_rsk(req)->loc_port = dccp_hdr(skb)->dccph_dport;
- inet_rsk(req)->acked = 0;
- dreq->dreq_timestamp_echo = 0;
+ inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
+ inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
+ inet_rsk(req)->acked = 0;
+ dreq->dreq_timestamp_echo = 0;
/* inherit feature negotiation options from listening socket */
return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 1b08cae9c65..9bce31886bd 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
struct dccp_sock *dp = dccp_sk(sk);
const struct dccp_hdr *dh = dccp_hdr(skb);
const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
- u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
unsigned char *opt_ptr = options;
const unsigned char *opt_end = (unsigned char *)dh +
@@ -96,18 +95,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
}
/*
- * CCID-Specific Options (from RFC 4340, sec. 10.3):
- *
- * Option numbers 128 through 191 are for options sent from the
- * HC-Sender to the HC-Receiver; option numbers 192 through 255
- * are for options sent from the HC-Receiver to the HC-Sender.
- *
* CCID-specific options are ignored during connection setup, as
* negotiation may still be in progress (see RFC 4340, 10.3).
* The same applies to Ack Vectors, as these depend on the CCID.
- *
*/
- if (dreq != NULL && (opt >= 128 ||
+ if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
goto ignore_option;
@@ -131,19 +123,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
break;
+ if (len == 0)
+ goto out_invalid_option;
rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
*value, value + 1, len - 1);
if (rc)
goto out_featneg_failed;
break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
- break;
- if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
- goto out_invalid_option;
- break;
case DCCPO_TIMESTAMP:
if (len != 4)
goto out_invalid_option;
@@ -170,6 +156,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
dccp_role(sk), ntohl(opt_val),
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ /* schedule an Ack in case this sender is quiescent */
+ inet_csk_schedule_ack(sk);
break;
case DCCPO_TIMESTAMP_ECHO:
if (len != 4 && len != 6 && len != 8)
@@ -226,23 +214,25 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
dccp_role(sk), elapsed_time);
break;
- case 128 ... 191: {
- const u16 idx = value - options;
-
+ case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
- case 192 ... 255: {
- const u16 idx = value - options;
-
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
+ break;
+ /*
+ * Ack vectors are processed by the TX CCID if it is
+ * interested. The RX CCID need not parse Ack Vectors,
+ * since it is only interested in clearing old state.
+ * Fall through.
+ */
+ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
default:
DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -296,12 +286,11 @@ static inline u8 dccp_ndp_len(const u64 ndp)
{
if (likely(ndp <= 0xFF))
return 1;
- return likely(ndp <= USHORT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
+ return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
}
-int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- const unsigned char option,
- const void *value, const unsigned char len)
+int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
+ const void *value, const unsigned char len)
{
unsigned char *to;
@@ -354,49 +343,15 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time)
return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
}
-int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb,
- u32 elapsed_time)
-{
- const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- const int len = 2 + elapsed_time_len;
- unsigned char *to;
-
- if (elapsed_time_len == 0)
- return 0;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- *to++ = DCCPO_ELAPSED_TIME;
- *to++ = len;
-
- if (elapsed_time_len == 2) {
- const __be16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else {
- const __be32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
- }
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
-
-int dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
{
__be32 now = htonl(dccp_timestamp());
/* yes this will overflow but that is the point as we want a
* 10 usec 32 bit timer which mean it wraps every 11.9 hours */
- return dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+ return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
}
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
struct dccp_request_sock *dreq,
struct sk_buff *skb)
@@ -441,6 +396,83 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
return 0;
}
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const u16 buflen = dccp_ackvec_buflen(av);
+ /* Figure out how many options do we need to represent the ackvec */
+ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+ u16 len = buflen + 2 * nr_opts;
+ u8 i, nonce = 0;
+ const unsigned char *tail, *from;
+ unsigned char *to;
+
+ if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+ dccp_packet_name(dcb->dccpd_type));
+ return -1;
+ }
+ /*
+ * Since Ack Vectors are variable-length, we can not always predict
+ * their size. To catch exception cases where the space is running out
+ * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+ */
+ if (len > DCCPAV_MIN_OPTLEN &&
+ len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+ DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+ "MPS=%u ==> reduce payload size?\n", len, skb->len,
+ dcb->dccpd_opt_len, dp->dccps_mss_cache);
+ dp->dccps_sync_scheduled = 1;
+ return 0;
+ }
+ dcb->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ len = buflen;
+ from = av->av_buf + av->av_buf_head;
+ tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+
+ for (i = 0; i < nr_opts; ++i) {
+ int copylen = len;
+
+ if (len > DCCP_SINGLE_OPT_MAXLEN)
+ copylen = DCCP_SINGLE_OPT_MAXLEN;
+
+ /*
+ * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+ * its type; ack_nonce is the sum of all individual buf_nonce's.
+ */
+ nonce ^= av->av_buf_nonce[i];
+
+ *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+ *to++ = copylen + 2;
+
+ /* Check if buf_head wraps */
+ if (from + copylen > tail) {
+ const u16 tailsize = tail - from;
+
+ memcpy(to, from, tailsize);
+ to += tailsize;
+ len -= tailsize;
+ copylen -= tailsize;
+ from = av->av_buf;
+ }
+
+ memcpy(to, from, copylen);
+ from += copylen;
+ to += copylen;
+ len -= copylen;
+ }
+ /*
+ * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+ */
+ if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
+ return -ENOBUFS;
+ return 0;
+}
+
/**
* dccp_insert_option_mandatory - Mandatory option (5.8.2)
* Note that since we are using skb_push, this function needs to be called
@@ -463,6 +495,7 @@ int dccp_insert_option_mandatory(struct sk_buff *skb)
* @val: NN value or SP array (preferred element first) to copy
* @len: true length of @val in bytes (excluding first element repetition)
* @repeat_first: whether to copy the first element of @val twice
+ *
* The last argument is used to construct Confirm options, where the preferred
* value and the preference list appear separately (RFC 4340, 6.3.1). Preference
* lists are kept such that the preferred entry is always first, so we only need
@@ -480,7 +513,7 @@ int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
}
if (unlikely(val == NULL || len == 0))
- len = repeat_first = 0;
+ len = repeat_first = false;
tot_len = 3 + repeat_first + len;
if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
@@ -531,13 +564,12 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
/*
* Obtain RTT sample from Request/Response exchange.
- * This is currently used in CCID 3 initialisation.
+ * This is currently used for TFRC initialisation.
*/
- if (dccp_insert_option_timestamp(sk, skb))
+ if (dccp_insert_option_timestamp(skb))
return -1;
- } else if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
+ } else if (dccp_ackvec_pending(sk) &&
dccp_insert_option_ackvec(sk, skb)) {
return -1;
}
@@ -564,6 +596,10 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
if (dccp_feat_insert_opts(NULL, dreq, skb))
return -1;
+ /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
if (dreq->dreq_timestamp_echo != 0 &&
dccp_insert_option_timestamp_echo(NULL, dreq, skb))
return -1;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index d6bb753bf6a..0248e8a3460 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -13,6 +13,7 @@
#include <linux/dccp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/inet_sock.h>
#include <net/sock.h>
@@ -26,11 +27,13 @@ static inline void dccp_event_ack_sent(struct sock *sk)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
-static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
+static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
skb_set_owner_w(skb, sk);
WARN_ON(sk->sk_send_head);
sk->sk_send_head = skb;
+ return skb_clone(sk->sk_send_head, gfp_any());
}
/*
@@ -42,7 +45,7 @@ static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (likely(skb != NULL)) {
- const struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -128,21 +131,21 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
break;
}
- icsk->icsk_af_ops->send_check(sk, 0, skb);
+ icsk->icsk_af_ops->send_check(sk, skb);
if (set_ack)
dccp_event_ack_sent(sk);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
return net_xmit_eval(err);
}
return -ENOBUFS;
}
/**
- * dccp_determine_ccmps - Find out about CCID-specfic packet-size limits
+ * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
* We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
* since the RX CCID is restricted to feedback packets (Acks), which are small
* in comparison with the data traffic. A value of 0 means "no current CCMPS".
@@ -194,120 +197,173 @@ EXPORT_SYMBOL_GPL(dccp_sync_mss);
void dccp_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
- if (sk_has_sleeper(sk))
- wake_up_interruptible(sk->sk_sleep);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
- read_unlock(&sk->sk_callback_lock);
+ rcu_read_unlock();
}
/**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * dccp_wait_for_ccid - Await CCID send permission
* @sk: socket to wait for
- * @skb: current skb to pass on for waiting
- * @delay: sleep timeout in milliseconds (> 0)
- * This function is called by default when the socket is closed, and
- * when a non-zero linger time is set on the socket. For consistency
+ * @delay: timeout in jiffies
+ *
+ * This is used by CCIDs which need to delay the send time in process context.
*/
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
{
- struct dccp_sock *dp = dccp_sk(sk);
DEFINE_WAIT(wait);
- unsigned long jiffdelay;
- int rc;
+ long remaining;
- do {
- dccp_pr_debug("delayed send by %d msec\n", delay);
- jiffdelay = msecs_to_jiffies(delay);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending++;
+ release_sock(sk);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ remaining = schedule_timeout(delay);
- sk->sk_write_pending++;
- release_sock(sk);
- schedule_timeout(jiffdelay);
- lock_sock(sk);
- sk->sk_write_pending--;
+ lock_sock(sk);
+ sk->sk_write_pending--;
+ finish_wait(sk_sleep(sk), &wait);
- if (sk->sk_err)
- goto do_error;
- if (signal_pending(current))
- goto do_interrupted;
+ if (signal_pending(current) || sk->sk_err)
+ return -1;
+ return remaining;
+}
- rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- } while ((delay = rc) > 0);
-out:
- finish_wait(sk->sk_sleep, &wait);
- return rc;
-
-do_error:
- rc = -EPIPE;
- goto out;
-do_interrupted:
- rc = -EINTR;
- goto out;
+/**
+ * dccp_xmit_packet - Send data packet under control of CCID
+ * Transmits next-queued payload and informs CCID to account for the packet.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+ int err, len;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb = dccp_qpolicy_pop(sk);
+
+ if (unlikely(skb == NULL))
+ return;
+ len = skb->len;
+
+ if (sk->sk_state == DCCP_PARTOPEN) {
+ const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+ /*
+ * See 8.1.5 - Handshake Completion.
+ *
+ * For robustness we resend Confirm options until the client has
+ * entered OPEN. During the initial feature negotiation, the MPS
+ * is smaller than usual, reduced by the Change/Confirm options.
+ */
+ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+ DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+ dccp_send_ack(sk);
+ dccp_feat_list_purge(&dp->dccps_featneg);
+ }
+
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else if (dccp_ack_pending(sk)) {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
+ }
+
+ err = dccp_transmit_skb(sk, skb);
+ if (err)
+ dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+ /*
+ * Register this one as sent even if an error occurred. To the remote
+ * end a local packet drop is indistinguishable from network loss, i.e.
+ * any local drop will eventually be reported via receiver feedback.
+ */
+ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+ /*
+ * If the CCID needs to transfer additional header options out-of-band
+ * (e.g. Ack Vectors or feature-negotiation options), it activates this
+ * flag to schedule a Sync. The Sync will automatically incorporate all
+ * currently pending header options, thus clearing the backlog.
+ */
+ if (dp->dccps_sync_scheduled)
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
}
-void dccp_write_xmit(struct sock *sk, int block)
+/**
+ * dccp_flush_write_queue - Drain queue at end of connection
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
{
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
+ long delay, rc;
- while ((skb = skb_peek(&sk->sk_write_queue))) {
- int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- if (err > 0) {
- if (!block) {
- sk_reset_timer(sk, &dp->dccps_xmit_timer,
- msecs_to_jiffies(err)+jiffies);
- break;
- } else
- err = dccp_wait_for_ccid(sk, skb, err);
- if (err && err != -EINTR)
- DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
- }
+ while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+ rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- skb_dequeue(&sk->sk_write_queue);
- if (err == 0) {
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int len = skb->len;
-
- if (sk->sk_state == DCCP_PARTOPEN) {
- const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
- /*
- * See 8.1.5 - Handshake Completion.
- *
- * For robustness we resend Confirm options until the client has
- * entered OPEN. During the initial feature negotiation, the MPS
- * is smaller than usual, reduced by the Change/Confirm options.
- */
- if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
- DCCP_WARN("Payload too large (%d) for featneg.\n", len);
- dccp_send_ack(sk);
- dccp_feat_list_purge(&dp->dccps_featneg);
- }
-
- inet_csk_schedule_ack(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- } else if (dccp_ack_pending(sk))
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- else
- dcb->dccpd_type = DCCP_PKT_DATA;
-
- err = dccp_transmit_skb(sk, skb);
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
- if (err)
- DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
- err);
- } else {
- dccp_pr_debug("packet discarded due to err=%d\n", err);
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ /*
+ * If the CCID determines when to send, the next sending
+ * time is unknown or the CCID may not even send again
+ * (e.g. remote host crashes or lost Ack packets).
+ */
+ DCCP_WARN("CCID did not manage to send all packets\n");
+ return;
+ case CCID_PACKET_DELAY:
+ delay = msecs_to_jiffies(rc);
+ if (delay > *time_budget)
+ return;
+ rc = dccp_wait_for_ccid(sk, delay);
+ if (rc < 0)
+ return;
+ *time_budget -= (delay - rc);
+ /* check again if we can send now */
+ break;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ skb_dequeue(&sk->sk_write_queue);
kfree_skb(skb);
+ dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+ }
+ }
+}
+
+void dccp_write_xmit(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = dccp_qpolicy_top(sk))) {
+ int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ return;
+ case CCID_PACKET_DELAY:
+ sk_reset_timer(sk, &dp->dccps_xmit_timer,
+ jiffies + msecs_to_jiffies(rc));
+ return;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ dccp_qpolicy_drop(sk, skb);
+ dccp_pr_debug("packet discarded due to err=%d\n", rc);
}
}
}
@@ -353,10 +409,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
skb_dst_set(skb, dst_clone(dst));
dreq = dccp_rsk(req);
- if (inet_rsk(req)->acked) /* increase ISS upon retransmission */
- dccp_inc_seqno(&dreq->dreq_iss);
+ if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
+ dccp_inc_seqno(&dreq->dreq_gss);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
- DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
+ DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
/* Resolve feature dependencies resulting from choice of CCID */
if (dccp_feat_server_ccid_dependencies(dreq))
@@ -368,14 +424,14 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
/* Build and checksum header */
dh = dccp_zeroed_hdr(skb, dccp_header_size);
- dh->dccph_sport = inet_rsk(req)->loc_port;
- dh->dccph_dport = inet_rsk(req)->rmt_port;
+ dh->dccph_sport = htons(inet_rsk(req)->ir_num);
+ dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
dh->dccph_doff = (dccp_header_size +
DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
dh->dccph_type = DCCP_PKT_RESPONSE;
dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dreq->dreq_iss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr);
+ dccp_hdr_set_seq(dh, dreq->dreq_gss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
dccp_csum_outgoing(skb);
@@ -471,8 +527,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void dccp_connect_init(struct sock *sk)
+int dccp_connect(struct sock *sk)
{
+ struct sk_buff *skb;
struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -482,22 +539,12 @@ static inline void dccp_connect_init(struct sock *sk)
dccp_sync_mss(sk, dst_mtu(dst));
- /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
- dp->dccps_gar = dp->dccps_iss;
-
- icsk->icsk_retransmits = 0;
-}
-
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct inet_connection_sock *icsk = inet_csk(sk);
-
/* do not connect if feature negotiation setup fails */
if (dccp_feat_finalise_settings(dccp_sk(sk)))
return -EPROTO;
- dccp_connect_init(sk);
+ /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+ dp->dccps_gar = dp->dccps_iss;
skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
if (unlikely(skb == NULL))
@@ -508,11 +555,11 @@ int dccp_connect(struct sock *sk)
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+ dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
/* Timer for repeating the REQUEST until an answer. */
+ icsk->icsk_retransmits = 0;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, DCCP_RTO_MAX);
return 0;
@@ -599,6 +646,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
+ /*
+ * Clear the flag in case the Sync was scheduled for out-of-band data,
+ * such as carrying a long Ack Vector.
+ */
+ dccp_sk(sk)->dccps_sync_scheduled = 0;
+
dccp_transmit_skb(sk, skb);
}
@@ -627,9 +680,7 @@ void dccp_send_close(struct sock *sk, const int active)
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
if (active) {
- dccp_write_xmit(sk, 1);
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, prio));
+ skb = dccp_skb_entail(sk, skb);
/*
* Retransmission timer for active-close: RFC 4340, 8.3 requires
* to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
@@ -642,6 +693,6 @@ void dccp_send_close(struct sock *sk, const int active)
*/
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
- } else
- dccp_transmit_skb(sk, skb);
+ }
+ dccp_transmit_skb(sk, skb);
}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index f5b3464f124..595ddf0459d 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/kfifo.h>
#include <linux/vmalloc.h>
+#include <linux/gfp.h>
#include <net/net_namespace.h>
#include "dccp.h"
@@ -148,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
.owner = THIS_MODULE,
.open = dccpprobe_open,
.read = dccpprobe_read,
+ .llseek = noop_llseek,
};
static __init int dccpprobe_init(void)
@@ -158,18 +160,23 @@ static __init int dccpprobe_init(void)
spin_lock_init(&dccpw.lock);
if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
return ret;
- if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops))
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
goto err0;
- try_then_request_module((ret = register_jprobe(&dccp_send_probe)) == 0,
- "dccp");
+ ret = register_jprobe(&dccp_send_probe);
+ if (ret) {
+ ret = request_module("dccp");
+ if (!ret)
+ ret = register_jprobe(&dccp_send_probe);
+ }
+
if (ret)
goto err1;
pr_info("DCCP watch registered (port=%d)\n", port);
return 0;
err1:
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
err0:
kfifo_free(&dccpw.fifo);
return ret;
@@ -179,7 +186,7 @@ module_init(dccpprobe_init);
static __exit void dccpprobe_exit(void)
{
kfifo_free(&dccpw.fifo);
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
unregister_jprobe(&dccp_send_probe);
}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index aa4cef374fd..de2c1e71930 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -20,6 +20,7 @@
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <net/checksum.h>
#include <net/inet_sock.h>
@@ -49,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
/* the maximum queue length for tx in packets. 0 is no limit */
int sysctl_dccp_tx_qlen __read_mostly = 5;
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state)
+{
+ static const char *const dccp_state_names[] = {
+ [DCCP_OPEN] = "OPEN",
+ [DCCP_REQUESTING] = "REQUESTING",
+ [DCCP_PARTOPEN] = "PARTOPEN",
+ [DCCP_LISTEN] = "LISTEN",
+ [DCCP_RESPOND] = "RESPOND",
+ [DCCP_CLOSING] = "CLOSING",
+ [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
+ [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
+ [DCCP_TIME_WAIT] = "TIME_WAIT",
+ [DCCP_CLOSED] = "CLOSED",
+ };
+
+ if (state >= DCCP_MAX_STATES)
+ return "INVALID STATE!";
+ else
+ return dccp_state_names[state];
+}
+#endif
+
void dccp_set_state(struct sock *sk, const int state)
{
const int oldstate = sk->sk_state;
@@ -145,30 +170,6 @@ const char *dccp_packet_name(const int type)
EXPORT_SYMBOL_GPL(dccp_packet_name);
-const char *dccp_state_name(const int state)
-{
- static const char *const dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
- [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
-
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
-}
-
-EXPORT_SYMBOL_GPL(dccp_state_name);
-
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -183,7 +184,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
dp->dccps_rate_last = jiffies;
dp->dccps_role = DCCP_ROLE_UNDEFINED;
dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
- dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
+ dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
dccp_init_xmit_timers(sk);
@@ -311,7 +312,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
unsigned int mask;
struct sock *sk = sock->sk;
- sock_poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -335,7 +336,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
mask |= POLLIN | POLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
set_bit(SOCK_ASYNC_NOSPACE,
@@ -346,7 +347,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
* wspace test but before the flags are set,
* IO signal will be lost.
*/
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ if (sk_stream_is_writeable(sk))
mask |= POLLOUT | POLLWRNORM;
}
}
@@ -472,14 +473,9 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type,
if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
return -EINVAL;
- val = kmalloc(optlen, GFP_KERNEL);
- if (val == NULL)
- return -ENOMEM;
-
- if (copy_from_user(val, optval, optlen)) {
- kfree(val);
- return -EFAULT;
- }
+ val = memdup_user(optval, optlen);
+ if (IS_ERR(val))
+ return PTR_ERR(val);
lock_sock(sk);
if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
@@ -536,6 +532,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
case DCCP_SOCKOPT_RECV_CSCOV:
err = dccp_setsockopt_cscov(sk, val, true);
break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ if (sk->sk_state != DCCP_CLOSED)
+ err = -EISCONN;
+ else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+ err = -EINVAL;
+ else
+ dp->dccps_qpolicy = val;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ if (val < 0)
+ err = -EINVAL;
+ else
+ dp->dccps_tx_qlen = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -643,6 +653,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
case DCCP_SOCKOPT_RECV_CSCOV:
val = dp->dccps_pcrlen;
break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ val = dp->dccps_qpolicy;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ val = dp->dccps_tx_qlen;
+ break;
case 128 ... 191:
return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
len, (u32 __user *)optval, optlen);
@@ -685,6 +701,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
#endif
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+ /*
+ * Assign an (opaque) qpolicy priority value to skb->priority.
+ *
+ * We are overloading this skb field for use with the qpolicy subystem.
+ * The skb->priority is normally used for the SO_PRIORITY option, which
+ * is initialised from sk_priority. Since the assignment of sk_priority
+ * to skb->priority happens later (on layer 3), we overload this field
+ * for use with queueing priorities as long as the skb is on layer 4.
+ * The default priority value (if nothing is set) is 0.
+ */
+ skb->priority = 0;
+
+ for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_DCCP)
+ continue;
+
+ if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+ !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+ return -EINVAL;
+
+ switch (cmsg->cmsg_type) {
+ case DCCP_SCM_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+ return -EINVAL;
+ skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -700,8 +757,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
lock_sock(sk);
- if (sysctl_dccp_tx_qlen &&
- (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
+ if (dccp_qpolicy_full(sk)) {
rc = -EAGAIN;
goto out_release;
}
@@ -729,8 +785,18 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (rc != 0)
goto out_discard;
- skb_queue_tail(&sk->sk_write_queue, skb);
- dccp_write_xmit(sk,0);
+ rc = dccp_msghdr_parse(msg, skb);
+ if (rc != 0)
+ goto out_discard;
+
+ dccp_qpolicy_push(sk, skb);
+ /*
+ * The xmit_timer is set if the TX CCID is rate-based and will expire
+ * when congestion control permits to release further packets into the
+ * network. Window-based CCIDs do not use this timer.
+ */
+ if (!timer_pending(&dp->dccps_xmit_timer))
+ dccp_write_xmit(sk);
out_release:
release_sock(sk);
return rc ? : len;
@@ -782,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
default:
dccp_pr_debug("packet_type=%s\n",
dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
}
verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
@@ -839,7 +905,7 @@ verify_sock_status:
len = skb->len;
found_fin_ok:
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
break;
} while (1);
out:
@@ -948,16 +1014,29 @@ void dccp_close(struct sock *sk, long timeout)
if (data_was_unread) {
/* Unread data was tossed, send an appropriate Reset Code */
- DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
+ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
dccp_set_state(sk, DCCP_CLOSED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
} else if (sk->sk_state != DCCP_CLOSED) {
+ /*
+ * Normal connection termination. May need to wait if there are
+ * still packets in the TX queue that are delayed by the CCID.
+ */
+ dccp_flush_write_queue(sk, &timeout);
dccp_terminate_connection(sk);
}
+ /*
+ * Flush write queue. This may be necessary in several cases:
+ * - we have been closed by the peer but still have application data;
+ * - abortive termination (unread data or zero linger time),
+ * - normal termination but queue could not be flushed within time limit
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
sk_stream_wait_close(sk, timeout);
adjudge_to_death:
@@ -1005,13 +1084,15 @@ EXPORT_SYMBOL_GPL(dccp_shutdown);
static inline int dccp_mib_init(void)
{
- return snmp_mib_init((void __percpu **)dccp_statistics,
- sizeof(struct dccp_mib));
+ dccp_statistics = alloc_percpu(struct dccp_mib);
+ if (!dccp_statistics)
+ return -ENOMEM;
+ return 0;
}
static inline void dccp_mib_exit(void)
{
- snmp_mib_free((void __percpu **)dccp_statistics);
+ free_percpu(dccp_statistics);
}
static int thash_entries;
@@ -1019,7 +1100,7 @@ module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
#ifdef CONFIG_IP_DCCP_DEBUG
-int dccp_debug;
+bool dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
@@ -1078,10 +1159,8 @@ static int __init dccp_init(void)
goto out_free_bind_bucket_cachep;
}
- for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
+ for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
- INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
- }
if (inet_ehash_locks_alloc(&dccp_hashinfo))
goto out_free_dccp_ehash;
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 00000000000..63c30bfa470
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ * net/dccp/qpolicy.c
+ *
+ * Policy-based packet dequeueing interface for DCCP.
+ *
+ * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ * Simple Dequeueing Policy:
+ * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+ skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_tx_qlen &&
+ sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+ return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ * Priority-based Dequeueing Policy:
+ * If tx_qlen is different from 0 and the queue has reached its upper bound
+ * of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *best = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (best == NULL || skb->priority > best->priority)
+ best = skb;
+ return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *worst = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (worst == NULL || skb->priority < worst->priority)
+ worst = skb;
+ return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+ if (qpolicy_simple_full(sk))
+ dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+ return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top: peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+ void (*push) (struct sock *sk, struct sk_buff *skb);
+ bool (*full) (struct sock *sk);
+ struct sk_buff* (*top) (struct sock *sk);
+ __be32 params;
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+ [DCCPQ_POLICY_SIMPLE] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_simple_full,
+ .top = qpolicy_simple_top,
+ .params = 0,
+ },
+ [DCCPQ_POLICY_PRIO] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_prio_full,
+ .top = qpolicy_prio_best_skb,
+ .params = DCCP_SCM_PRIORITY,
+ },
+};
+
+/*
+ * Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+ qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb != NULL) {
+ skb_unlink(skb, &sk->sk_write_queue);
+ kfree_skb(skb);
+ }
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+ struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+ if (skb != NULL) {
+ /* Clear any skb fields that we used internally */
+ skb->priority = 0;
+ skb_unlink(skb, &sk->sk_write_queue);
+ }
+ return skb;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+ /* check if exactly one bit is set */
+ if (!param || (param & (param - 1)))
+ return false;
+ return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 563943822e5..53731e45403 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -20,8 +20,10 @@
/* Boundary values */
static int zero = 0,
+ one = 1,
u8_max = 0xFF;
-static unsigned long seqw_min = 32;
+static unsigned long seqw_min = DCCPF_SEQ_WMIN,
+ seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
static struct ctl_table dccp_default_table[] = {
{
@@ -31,6 +33,7 @@ static struct ctl_table dccp_default_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
+ .extra2 = &seqw_max,
},
{
.procname = "rx_ccid",
@@ -56,7 +59,7 @@ static struct ctl_table dccp_default_table[] = {
.maxlen = sizeof(sysctl_dccp_request_retries),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = &one,
.extra2 = &u8_max,
},
{
@@ -96,18 +99,11 @@ static struct ctl_table dccp_default_table[] = {
{ }
};
-static struct ctl_path dccp_path[] = {
- { .procname = "net", },
- { .procname = "dccp", },
- { .procname = "default", },
- { }
-};
-
static struct ctl_table_header *dccp_table_header;
int __init dccp_sysctl_init(void)
{
- dccp_table_header = register_sysctl_paths(dccp_path,
+ dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
dccp_default_table);
return dccp_table_header != NULL ? 0 : -ENOMEM;
@@ -116,7 +112,7 @@ int __init dccp_sysctl_init(void)
void dccp_sysctl_exit(void)
{
if (dccp_table_header != NULL) {
- unregister_sysctl_table(dccp_table_header);
+ unregister_net_sysctl_table(dccp_table_header);
dccp_table_header = NULL;
}
}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index bbfeb5eae46..1cd46a345cb 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -12,6 +12,7 @@
#include <linux/dccp.h>
#include <linux/skbuff.h>
+#include <linux/export.h>
#include "dccp.h"
@@ -38,7 +39,7 @@ static int dccp_write_timeout(struct sock *sk)
if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
if (icsk->icsk_retransmits != 0)
- dst_negative_advice(&sk->sk_dst_cache, sk);
+ dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ?
: sysctl_dccp_request_retries;
} else {
@@ -63,7 +64,7 @@ static int dccp_write_timeout(struct sock *sk)
Golden words :-).
*/
- dst_negative_advice(&sk->sk_dst_cache, sk);
+ dst_negative_advice(sk);
}
retry_until = sysctl_dccp_retries2;
@@ -237,32 +238,35 @@ out:
sock_put(sk);
}
-/* Transmit-delay timer: used by the CCIDs to delay actual send time */
-static void dccp_write_xmit_timer(unsigned long data)
+/**
+ * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
+ * See the comments above %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
{
struct sock *sk = (struct sock *)data;
- struct dccp_sock *dp = dccp_sk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
+ sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
else
- dccp_write_xmit(sk, 0);
+ dccp_write_xmit(sk);
bh_unlock_sock(sk);
- sock_put(sk);
}
-static void dccp_init_write_xmit_timer(struct sock *sk)
+static void dccp_write_xmit_timer(unsigned long data)
{
- struct dccp_sock *dp = dccp_sk(sk);
-
- setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
- (unsigned long)sk);
+ dccp_write_xmitlet(data);
+ sock_put((struct sock *)data);
}
void dccp_init_xmit_timers(struct sock *sk)
{
- dccp_init_write_xmit_timer(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+ setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+ (unsigned long)sk);
inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
&dccp_keepalive_timer);
}
@@ -276,7 +280,7 @@ static ktime_t dccp_timestamp_seed;
*/
u32 dccp_timestamp(void)
{
- s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
+ u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
do_div(delta, 10);
return delta;