aboutsummaryrefslogtreecommitdiff
path: root/net/dccp
diff options
context:
space:
mode:
Diffstat (limited to 'net/dccp')
-rw-r--r--net/dccp/Kconfig12
-rw-r--r--net/dccp/Makefile17
-rw-r--r--net/dccp/ackvec.c637
-rw-r--r--net/dccp/ackvec.h212
-rw-r--r--net/dccp/ccid.c236
-rw-r--r--net/dccp/ccid.h140
-rw-r--r--net/dccp/ccids/Kconfig113
-rw-r--r--net/dccp/ccids/Makefile9
-rw-r--r--net/dccp/ccids/ccid2.c807
-rw-r--r--net/dccp/ccids/ccid2.h113
-rw-r--r--net/dccp/ccids/ccid3.c661
-rw-r--r--net/dccp/ccids/ccid3.h161
-rw-r--r--net/dccp/ccids/lib/Makefile3
-rw-r--r--net/dccp/ccids/lib/loss_interval.c29
-rw-r--r--net/dccp/ccids/lib/loss_interval.h10
-rw-r--r--net/dccp/ccids/lib/packet_history.c158
-rw-r--r--net/dccp/ccids/lib/packet_history.h74
-rw-r--r--net/dccp/ccids/lib/tfrc.c30
-rw-r--r--net/dccp/ccids/lib/tfrc.h43
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c50
-rw-r--r--net/dccp/dccp.h302
-rw-r--r--net/dccp/diag.c31
-rw-r--r--net/dccp/feat.c1830
-rw-r--r--net/dccp/feat.h145
-rw-r--r--net/dccp/input.c209
-rw-r--r--net/dccp/ipv4.c328
-rw-r--r--net/dccp/ipv6.c492
-rw-r--r--net/dccp/ipv6.h2
-rw-r--r--net/dccp/minisocks.c128
-rw-r--r--net/dccp/options.c420
-rw-r--r--net/dccp/output.c379
-rw-r--r--net/dccp/probe.c95
-rw-r--r--net/dccp/proto.c476
-rw-r--r--net/dccp/qpolicy.c137
-rw-r--r--net/dccp/sysctl.c78
-rw-r--r--net/dccp/timer.c70
36 files changed, 4813 insertions, 3824 deletions
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 7aa2a7acc7e..8c0ef71bed2 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -1,7 +1,6 @@
menuconfig IP_DCCP
- tristate "The DCCP Protocol (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
- select IP_DCCP_CCID2
+ tristate "The DCCP Protocol"
+ depends on INET
---help---
Datagram Congestion Control Protocol (RFC 4340)
@@ -25,9 +24,6 @@ config INET_DCCP_DIAG
def_tristate y if (IP_DCCP = y && INET_DIAG = y)
def_tristate m
-config IP_DCCP_ACKVEC
- bool
-
source "net/dccp/ccids/Kconfig"
menu "DCCP Kernel Hacking"
@@ -53,7 +49,9 @@ config NET_DCCPPROBE
what was just said, you don't need it: say N.
Documentation on how to use DCCP connection probing can be found
- at http://linux-net.osdl.org/index.php/DccpProbe
+ at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
To compile this code as a module, choose M here: the
module will be called dccp_probe.
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index f4f8793aaff..5c8362b037e 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,6 +1,17 @@
obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+ qpolicy.o
+#
+# CCID algorithms to be used by dccp.ko
+#
+# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
+dccp-y += ccids/ccid2.o ackvec.o
+dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
+dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
+ ccids/lib/tfrc_equation.o \
+ ccids/lib/packet_history.o \
+ ccids/lib/loss_interval.o
dccp_ipv4-y := ipv4.o
@@ -8,8 +19,6 @@ dccp_ipv4-y := ipv4.o
obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
dccp_ipv6-y := ipv6.o
-dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
-
obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
@@ -17,5 +26,3 @@ dccp-$(CONFIG_SYSCTL) += sysctl.o
dccp_diag-y := diag.o
dccp_probe-y := probe.o
-
-obj-y += ccids/
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 6de4bd195d2..ba07824af4c 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,470 +1,376 @@
/*
* net/dccp/ackvec.c
*
- * An implementation of the DCCP protocol
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License;
*/
-
-#include "ackvec.h"
#include "dccp.h"
-
-#include <linux/dccp.h>
-#include <linux/init.h>
-#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/skbuff.h>
#include <linux/slab.h>
-
-#include <net/sock.h>
+#include <linux/export.h>
static struct kmem_cache *dccp_ackvec_slab;
static struct kmem_cache *dccp_ackvec_record_slab;
-static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
{
- struct dccp_ackvec_record *avr =
- kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+ struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
- if (avr != NULL)
- INIT_LIST_HEAD(&avr->avr_node);
-
- return avr;
+ if (av != NULL) {
+ av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
+ INIT_LIST_HEAD(&av->av_records);
+ }
+ return av;
}
-static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
{
- if (unlikely(avr == NULL))
- return;
- /* Check if deleting a linked record */
- WARN_ON(!list_empty(&avr->avr_node));
- kmem_cache_free(dccp_ackvec_record_slab, avr);
+ struct dccp_ackvec_record *cur, *next;
+
+ list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+ kmem_cache_free(dccp_ackvec_record_slab, cur);
+ INIT_LIST_HEAD(&av->av_records);
}
-static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
+void dccp_ackvec_free(struct dccp_ackvec *av)
{
- /*
- * AVRs are sorted by seqno. Since we are sending them in order, we
- * just add the AVR at the head of the list.
- * -sorbo.
- */
- if (!list_empty(&av->av_records)) {
- const struct dccp_ackvec_record *head =
- list_entry(av->av_records.next,
- struct dccp_ackvec_record,
- avr_node);
- BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
+ if (likely(av != NULL)) {
+ dccp_ackvec_purge_records(av);
+ kmem_cache_free(dccp_ackvec_slab, av);
}
-
- list_add(&avr->avr_node, &av->av_records);
}
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_ackvec_update_records - Record information about sent Ack Vectors
+ * @av: Ack Vector records to update
+ * @seqno: Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
- /* Figure out how many options do we need to represent the ackvec */
- const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
- u16 len = av->av_vec_len + 2 * nr_opts, i;
- u32 elapsed_time;
- const unsigned char *tail, *from;
- unsigned char *to;
struct dccp_ackvec_record *avr;
- suseconds_t delta;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- delta = ktime_us_delta(ktime_get_real(), av->av_time);
- elapsed_time = delta / 10;
-
- if (elapsed_time != 0 &&
- dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
- return -1;
- avr = dccp_ackvec_record_new();
+ avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
if (avr == NULL)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- len = av->av_vec_len;
- from = av->av_buf + av->av_buf_head;
- tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
-
- for (i = 0; i < nr_opts; ++i) {
- int copylen = len;
-
- if (len > DCCP_MAX_ACKVEC_OPT_LEN)
- copylen = DCCP_MAX_ACKVEC_OPT_LEN;
-
- *to++ = DCCPO_ACK_VECTOR_0;
- *to++ = copylen + 2;
-
- /* Check if buf_head wraps */
- if (from + copylen > tail) {
- const u16 tailsize = tail - from;
-
- memcpy(to, from, tailsize);
- to += tailsize;
- len -= tailsize;
- copylen -= tailsize;
- from = av->av_buf;
- }
-
- memcpy(to, from, copylen);
- from += copylen;
- to += copylen;
- len -= copylen;
- }
+ return -ENOBUFS;
+ avr->avr_ack_seqno = seqno;
+ avr->avr_ack_ptr = av->av_buf_head;
+ avr->avr_ack_ackno = av->av_buf_ackno;
+ avr->avr_ack_nonce = nonce_sum;
+ avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
/*
- * From RFC 4340, A.2:
- *
- * For each acknowledgement it sends, the HC-Receiver will add an
- * acknowledgement record. ack_seqno will equal the HC-Receiver
- * sequence number it used for the ack packet; ack_ptr will equal
- * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
- * equal buf_nonce.
+ * When the buffer overflows, we keep no more than one record. This is
+ * the simplest way of disambiguating sender-Acks dating from before the
+ * overflow from sender-Acks which refer to after the overflow; a simple
+ * solution is preferable here since we are handling an exception.
*/
- avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- avr->avr_ack_ptr = av->av_buf_head;
- avr->avr_ack_ackno = av->av_buf_ackno;
- avr->avr_ack_nonce = av->av_buf_nonce;
- avr->avr_sent_len = av->av_vec_len;
-
- dccp_ackvec_insert_avr(av, avr);
+ if (av->av_overflow)
+ dccp_ackvec_purge_records(av);
+ /*
+ * Since GSS is incremented for each packet, the list is automatically
+ * arranged in descending order of @ack_seqno.
+ */
+ list_add(&avr->avr_node, &av->av_records);
- dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu\n",
- dccp_role(sk), avr->avr_sent_len,
+ dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
(unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno);
+ (unsigned long long)avr->avr_ack_ackno,
+ avr->avr_ack_runlen);
return 0;
}
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+ const u64 ackno)
{
- struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
-
- if (av != NULL) {
- av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1;
- av->av_buf_ackno = UINT48_MAX + 1;
- av->av_buf_nonce = 0;
- av->av_time = ktime_set(0, 0);
- av->av_vec_len = 0;
- INIT_LIST_HEAD(&av->av_records);
+ struct dccp_ackvec_record *avr;
+ /*
+ * Exploit that records are inserted in descending order of sequence
+ * number, start with the oldest record first. If @ackno is `before'
+ * the earliest ack_ackno, the packet is too old to be considered.
+ */
+ list_for_each_entry_reverse(avr, av_list, avr_node) {
+ if (avr->avr_ack_seqno == ackno)
+ return avr;
+ if (before48(ackno, avr->avr_ack_seqno))
+ break;
}
-
- return av;
+ return NULL;
}
-void dccp_ackvec_free(struct dccp_ackvec *av)
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
{
- if (unlikely(av == NULL))
- return;
-
- if (!list_empty(&av->av_records)) {
- struct dccp_ackvec_record *avr, *next;
-
- list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
- list_del_init(&avr->avr_node);
- dccp_ackvec_record_delete(avr);
- }
- }
-
- kmem_cache_free(dccp_ackvec_slab, av);
+ return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
}
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
- const u32 index)
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
{
- return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
+ return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
}
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
- const u32 index)
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
{
- return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
+ if (unlikely(av->av_overflow))
+ return DCCPAV_MAX_ACKVEC_LEN;
+ return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
}
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
+/**
+ * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
+ * @av: non-empty buffer to update
+ * @distance: negative or zero distance of @seqno from buf_ackno downward
+ * @seqno: the (old) sequence number whose record is to be updated
+ * @state: state in which packet carrying @seqno was received
*/
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
- const unsigned int packets,
- const unsigned char state)
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+ u64 seqno, enum dccp_ackvec_states state)
{
- unsigned int gap;
- long new_head;
+ u16 ptr = av->av_buf_head;
- if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
- return -ENOBUFS;
+ BUG_ON(distance > 0);
+ if (unlikely(dccp_ackvec_is_empty(av)))
+ return;
- gap = packets - 1;
- new_head = av->av_buf_head - packets;
+ do {
+ u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
- if (new_head < 0) {
- if (gap > 0) {
- memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
- gap + new_head + 1);
- gap = -new_head;
+ if (distance + runlen >= 0) {
+ /*
+ * Only update the state if packet has not been received
+ * yet. This is OK as per the second table in RFC 4340,
+ * 11.4.1; i.e. here we are using the following table:
+ * RECEIVED
+ * 0 1 3
+ * S +---+---+---+
+ * T 0 | 0 | 0 | 0 |
+ * O +---+---+---+
+ * R 1 | 1 | 1 | 1 |
+ * E +---+---+---+
+ * D 3 | 0 | 1 | 3 |
+ * +---+---+---+
+ * The "Not Received" state was set by reserve_seats().
+ */
+ if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+ av->av_buf[ptr] = state;
+ else
+ dccp_pr_debug("Not changing %llu state to %u\n",
+ (unsigned long long)seqno, state);
+ break;
}
- new_head += DCCP_MAX_ACKVEC_LEN;
- }
- av->av_buf_head = new_head;
+ distance += runlen + 1;
+ ptr = __ackvec_idx_add(ptr, 1);
- if (gap > 0)
- memset(av->av_buf + av->av_buf_head + 1,
- DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+ } while (ptr != av->av_buf_tail);
+}
- av->av_buf[av->av_buf_head] = state;
- av->av_vec_len += packets;
- return 0;
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+ u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+ len = DCCPAV_MAX_ACKVEC_LEN - start;
+
+ /* check for buffer wrap-around */
+ if (num > len) {
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+ start = 0;
+ num -= len;
+ }
+ if (num)
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
}
-/*
- * Implements the RFC 4340, Appendix A
+/**
+ * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
+ * @av: container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno: sequence number of the first packet in @num_packets
+ * @state: state in which packet carrying @seqno was received
*/
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+ u64 seqno, enum dccp_ackvec_states state)
{
- /*
- * Check at the right places if the buffer is full, if it is, tell the
- * caller to start dropping packets till the HC-Sender acks our ACK
- * vectors, when we will free up space in av_buf.
- *
- * We may well decide to do buffer compression, etc, but for now lets
- * just drop.
- *
- * From Appendix A.1.1 (`New Packets'):
- *
- * Of course, the circular buffer may overflow, either when the
- * HC-Sender is sending data at a very high rate, when the
- * HC-Receiver's acknowledgements are not reaching the HC-Sender,
- * or when the HC-Sender is forgetting to acknowledge those acks
- * (so the HC-Receiver is unable to clean up old state). In this
- * case, the HC-Receiver should either compress the buffer (by
- * increasing run lengths when possible), transfer its state to
- * a larger buffer, or, as a last resort, drop all received
- * packets, without processing them whatsoever, until its buffer
- * shrinks again.
- */
+ u32 num_cells = num_packets;
- /* See if this is the first ackno being inserted */
- if (av->av_vec_len == 0) {
- av->av_buf[av->av_buf_head] = state;
- av->av_vec_len = 1;
- } else if (after48(ackno, av->av_buf_ackno)) {
- const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
+ if (num_packets > DCCPAV_BURST_THRESH) {
+ u32 lost_packets = num_packets - 1;
+ DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
/*
- * Look if the state of this packet is the same as the
- * previous ackno and if so if we can bump the head len.
+ * We received 1 packet and have a loss of size "num_packets-1"
+ * which we squeeze into num_cells-1 rather than reserving an
+ * entire byte for each lost packet.
+ * The reason is that the vector grows in O(burst_length); when
+ * it grows too large there will no room left for the payload.
+ * This is a trade-off: if a few packets out of the burst show
+ * up later, their state will not be changed; it is simply too
+ * costly to reshuffle/reallocate/copy the buffer each time.
+ * Should such problems persist, we will need to switch to a
+ * different underlying data structure.
*/
- if (delta == 1 &&
- dccp_ackvec_state(av, av->av_buf_head) == state &&
- dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
- av->av_buf[av->av_buf_head]++;
- else if (dccp_ackvec_set_buf_head_state(av, delta, state))
- return -ENOBUFS;
- } else {
- /*
- * A.1.2. Old Packets
- *
- * When a packet with Sequence Number S <= buf_ackno
- * arrives, the HC-Receiver will scan the table for
- * the byte corresponding to S. (Indexing structures
- * could reduce the complexity of this scan.)
- */
- u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
- u32 index = av->av_buf_head;
+ for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+ u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
- while (1) {
- const u8 len = dccp_ackvec_len(av, index);
- const u8 state = dccp_ackvec_state(av, index);
- /*
- * valid packets not yet in av_buf have a reserved
- * entry, with a len equal to 0.
- */
- if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
- len == 0 && delta == 0) { /* Found our
- reserved seat! */
- dccp_pr_debug("Found %llu reserved seat!\n",
- (unsigned long long)ackno);
- av->av_buf[index] = state;
- goto out;
- }
- /* len == 0 means one packet */
- if (delta < len + 1)
- goto out_duplicate;
-
- delta -= len + 1;
- if (++index == DCCP_MAX_ACKVEC_LEN)
- index = 0;
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+ av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+ lost_packets -= len;
}
}
- av->av_buf_ackno = ackno;
- av->av_time = ktime_get_real();
-out:
- return 0;
-
-out_duplicate:
- /* Duplicate packet */
- dccp_pr_debug("Received a dup or already considered lost "
- "packet: %llu\n", (unsigned long long)ackno);
- return -EILSEQ;
-}
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
-{
- dccp_pr_debug_cat("ACK vector len=%d, ackno=%llu |", len,
- (unsigned long long)ackno);
+ if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+ DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+ av->av_overflow = true;
+ }
- while (len--) {
- const u8 state = (*vector & DCCP_ACKVEC_STATE_MASK) >> 6;
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+ if (av->av_overflow)
+ av->av_buf_tail = av->av_buf_head;
- dccp_pr_debug_cat("%d,%d|", state, rl);
- ++vector;
- }
+ av->av_buf[av->av_buf_head] = state;
+ av->av_buf_ackno = seqno;
- dccp_pr_debug_cat("\n");
+ if (num_packets > 1)
+ dccp_ackvec_reserve_seats(av, num_packets - 1);
}
-void dccp_ackvec_print(const struct dccp_ackvec *av)
+/**
+ * dccp_ackvec_input - Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
{
- dccp_ackvector_print(av->av_buf_ackno,
- av->av_buf + av->av_buf_head,
- av->av_vec_len);
-}
-#endif
+ u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ enum dccp_ackvec_states state = DCCPAV_RECEIVED;
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
-{
- struct dccp_ackvec_record *next;
+ if (dccp_ackvec_is_empty(av)) {
+ dccp_ackvec_add_new(av, 1, seqno, state);
+ av->av_tail_ackno = seqno;
- /* sort out vector length */
- if (av->av_buf_head <= avr->avr_ack_ptr)
- av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
- else
- av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
- av->av_buf_head + avr->avr_ack_ptr;
+ } else {
+ s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+ u8 *current_head = av->av_buf + av->av_buf_head;
- /* free records */
- list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
- list_del_init(&avr->avr_node);
- dccp_ackvec_record_delete(avr);
- }
-}
+ if (num_packets == 1 &&
+ dccp_ackvec_state(current_head) == state &&
+ dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
- const u64 ackno)
-{
- struct dccp_ackvec_record *avr;
+ *current_head += 1;
+ av->av_buf_ackno = seqno;
- /*
- * If we traverse backwards, it should be faster when we have large
- * windows. We will be receiving ACKs for stuff we sent a while back
- * -sorbo.
- */
- list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
- if (ackno == avr->avr_ack_seqno) {
- dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu, ACKED!\n",
- dccp_role(sk), 1,
- (unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- } else if (avr->avr_ack_seqno > ackno)
- break; /* old news */
+ } else if (num_packets > 0) {
+ dccp_ackvec_add_new(av, num_packets, seqno, state);
+ } else {
+ dccp_ackvec_update_old(av, num_packets, seqno, state);
+ }
}
}
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
- struct sock *sk, u64 *ackno,
- const unsigned char len,
- const unsigned char *vector)
+/**
+ * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
{
- unsigned char i;
- struct dccp_ackvec_record *avr;
+ struct dccp_ackvec_record *avr, *next;
+ u8 runlen_now, eff_runlen;
+ s64 delta;
- /* Check if we actually sent an ACK vector */
- if (list_empty(&av->av_records))
+ avr = dccp_ackvec_lookup(&av->av_records, ackno);
+ if (avr == NULL)
return;
+ /*
+ * Deal with outdated acknowledgments: this arises when e.g. there are
+ * several old records and the acks from the peer come in slowly. In
+ * that case we may still have records that pre-date tail_ackno.
+ */
+ delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+ if (delta < 0)
+ goto free_records;
+ /*
+ * Deal with overlapping Ack Vectors: don't subtract more than the
+ * number of packets between tail_ackno and ack_ackno.
+ */
+ eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
- i = len;
+ runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
/*
- * XXX
- * I think it might be more efficient to work backwards. See comment on
- * rcv_ackno. -sorbo.
+ * The run length of Ack Vector cells does not decrease over time. If
+ * the run length is the same as at the time the Ack Vector was sent, we
+ * free the ack_ptr cell. That cell can however not be freed if the run
+ * length has increased: in this case we need to move the tail pointer
+ * backwards (towards higher indices), to its next-oldest neighbour.
*/
- avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
- while (i--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl;
+ if (runlen_now > eff_runlen) {
- dccp_set_seqno(&ackno_end_rl, *ackno - rl);
+ av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+ av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+ /* This move may not have cleared the overflow flag. */
+ if (av->av_overflow)
+ av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+ } else {
+ av->av_buf_tail = avr->avr_ack_ptr;
/*
- * If our AVR sequence number is greater than the ack, go
- * forward in the AVR list until it is not so.
+ * We have made sure that avr points to a valid cell within the
+ * buffer. This cell is either older than head, or equals head
+ * (empty buffer): in both cases we no longer have any overflow.
*/
- list_for_each_entry_from(avr, &av->av_records, avr_node) {
- if (!after48(avr->avr_ack_seqno, *ackno))
- goto found;
- }
- /* End of the av_records list, not found, exit */
- break;
-found:
- if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
- const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
- dccp_pr_debug("%s ACK vector 0, len=%d, "
- "ack_seqno=%llu, ack_ackno=%llu, "
- "ACKED!\n",
- dccp_role(sk), len,
- (unsigned long long)
- avr->avr_ack_seqno,
- (unsigned long long)
- avr->avr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- }
- /*
- * If it wasn't received, continue scanning... we might
- * find another one.
- */
- }
+ av->av_overflow = 0;
+ }
+
+ /*
+ * The peer has acknowledged up to and including ack_ackno. Hence the
+ * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+ */
+ av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
- dccp_set_seqno(ackno, ackno_end_rl - 1);
- ++vector;
+free_records:
+ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+ list_del(&avr->avr_node);
+ kmem_cache_free(dccp_ackvec_record_slab, avr);
}
}
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt, const u8 *value, const u8 len)
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
{
- if (len > DCCP_MAX_ACKVEC_OPT_LEN)
- return -1;
+ struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+ if (new == NULL)
+ return -ENOBUFS;
+ new->vec = vec;
+ new->len = len;
+ new->nonce = nonce;
- /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
- dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
- ackno, len, value);
+ list_add_tail(&new->node, head);
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+ struct dccp_ackvec_parsed *cur, *next;
+
+ list_for_each_entry_safe(cur, next, parsed_chunks, node)
+ kfree(cur);
+ INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
int __init dccp_ackvec_init(void)
{
@@ -474,10 +380,9 @@ int __init dccp_ackvec_init(void)
if (dccp_ackvec_slab == NULL)
goto out_err;
- dccp_ackvec_record_slab =
- kmem_cache_create("dccp_ackvec_record",
- sizeof(struct dccp_ackvec_record),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+ sizeof(struct dccp_ackvec_record),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (dccp_ackvec_record_slab == NULL)
goto out_destroy_slab;
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index bcb64fb4ace..3284bfa988c 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,156 +3,136 @@
/*
* net/dccp/ackvec.h
*
- * An implementation of the DCCP protocol
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#include <linux/dccp.h>
#include <linux/compiler.h>
-#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/types.h>
-/* Read about the ECN nonce to see why it is 253 */
-#define DCCP_MAX_ACKVEC_OPT_LEN 253
-/* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
+ */
+#define DCCPAV_NUM_ACKVECS 2
+#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
-#define DCCP_ACKVEC_STATE_RECEIVED 0
-#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN 16
-#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
-/** struct dccp_ackvec - ack vector
- *
- * This data structure is the one defined in RFC 4340, Appendix A.
- *
- * @av_buf_head - circular buffer head
- * @av_buf_tail - circular buffer tail
- * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
- * buffer (i.e. %av_buf_head)
- * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- * by the buffer with State 0
- *
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
+enum dccp_ackvec_states {
+ DCCPAV_RECEIVED = 0x00,
+ DCCPAV_ECN_MARKED = 0x40,
+ DCCPAV_RESERVED = 0x80,
+ DCCPAV_NOT_RECEIVED = 0xC0
+};
+#define DCCPAV_MAX_RUNLEN 0x3F
+
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+ return *cell & DCCPAV_MAX_RUNLEN;
+}
+
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+ return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/**
+ * struct dccp_ackvec - Ack Vector main data structure
*
- * @av_records - list of dccp_ackvec_record
- * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
*
- * @av_time - the time in usecs
- * @av_buf - circular buffer of acknowledgeable packets
+ * @av_buf: circular buffer storage area
+ * @av_buf_head: head index; begin of live portion in @av_buf
+ * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
+ * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
+ * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
*/
struct dccp_ackvec {
- u64 av_buf_ackno;
- struct list_head av_records;
- ktime_t av_time;
+ u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
u16 av_buf_head;
- u16 av_vec_len;
- u8 av_buf_nonce;
- u8 av_ack_nonce;
- u8 av_buf[DCCP_MAX_ACKVEC_LEN];
+ u16 av_buf_tail;
+ u64 av_buf_ackno:48;
+ u64 av_tail_ackno:48;
+ bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
+ u8 av_overflow:1;
+ struct list_head av_records;
};
-/** struct dccp_ackvec_record - ack vector record
+/**
+ * struct dccp_ackvec_record - Records information about sent Ack Vectors
*
- * ACK vector record as defined in Appendix A of spec.
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
*
- * The list is sorted by avr_ack_seqno
+ * @avr_node: the list node in @av_records
+ * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr: pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
*
- * @avr_node - node in av_records
- * @avr_ack_seqno - sequence number of the packet this record was sent on
- * @avr_ack_ackno - sequence number being acknowledged
- * @avr_ack_ptr - pointer into av_buf where this record starts
- * @avr_ack_nonce - av_ack_nonce at the time this record was sent
- * @avr_sent_len - lenght of the record in av_buf
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
*/
struct dccp_ackvec_record {
struct list_head avr_node;
- u64 avr_ack_seqno;
- u64 avr_ack_ackno;
+ u64 avr_ack_seqno:48;
+ u64 avr_ack_ackno:48;
u16 avr_ack_ptr;
- u16 avr_sent_len;
- u8 avr_ack_nonce;
+ u8 avr_ack_runlen;
+ u8 avr_ack_nonce:1;
};
-struct sock;
-struct sk_buff;
-
-#ifdef CONFIG_IP_DCCP_ACKVEC
-extern int dccp_ackvec_init(void);
-extern void dccp_ackvec_exit(void);
-
-extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
-extern void dccp_ackvec_free(struct dccp_ackvec *av);
-
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state);
-
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt,
- const u8 *value, const u8 len);
-
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
-
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
-{
- return av->av_vec_len;
-}
-#else /* CONFIG_IP_DCCP_ACKVEC */
-static inline int dccp_ackvec_init(void)
-{
- return 0;
-}
-
-static inline void dccp_ackvec_exit(void)
-{
-}
+int dccp_ackvec_init(void);
+void dccp_ackvec_exit(void);
-static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
- return NULL;
-}
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
+void dccp_ackvec_free(struct dccp_ackvec *av);
-static inline void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-}
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
-static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
{
- return -1;
+ return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
}
-static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno)
-{
-}
-
-static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- const u64 *ackno, const u8 opt,
- const u8 *value, const u8 len)
-{
- return -1;
-}
-
-static inline int dccp_insert_option_ackvec(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return -1;
-}
+/**
+ * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
+ * @vec: start of vector (offset into skb)
+ * @len: length of @vec
+ * @nonce: whether @vec had an ECN nonce of 0 or 1
+ * @node: FIFO - arranged in descending order of ack_ackno
+ *
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+ u8 *vec,
+ len,
+ nonce:1;
+ struct list_head node;
+};
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
-{
- return 0;
-}
-#endif /* CONFIG_IP_DCCP_ACKVEC */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 4809753d12a..597557254dd 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -11,87 +11,101 @@
* published by the Free Software Foundation.
*/
+#include <linux/slab.h>
+
#include "ccid.h"
+#include "ccids/lib/tfrc.h"
-static struct ccid_operations *ccids[CCID_MAX];
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-static atomic_t ccids_lockct = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(ccids_lock);
+static struct ccid_operations *ccids[] = {
+ &ccid2_ops,
+#ifdef CONFIG_IP_DCCP_CCID3
+ &ccid3_ops,
+#endif
+};
-/*
- * The strategy is: modifications ccids vector are short, do not sleep and
- * veeery rare, but read access should be free of any exclusive locks.
- */
-static void ccids_write_lock(void)
+static struct ccid_operations *ccid_by_number(const u8 id)
{
- spin_lock(&ccids_lock);
- while (atomic_read(&ccids_lockct) != 0) {
- spin_unlock(&ccids_lock);
- yield();
- spin_lock(&ccids_lock);
- }
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ccids); i++)
+ if (ccids[i]->ccid_id == id)
+ return ccids[i];
+ return NULL;
}
-static inline void ccids_write_unlock(void)
+/* check that up to @array_len members in @ccid_array are supported */
+bool ccid_support_check(u8 const *ccid_array, u8 array_len)
{
- spin_unlock(&ccids_lock);
+ while (array_len > 0)
+ if (ccid_by_number(ccid_array[--array_len]) == NULL)
+ return false;
+ return true;
}
-static inline void ccids_read_lock(void)
+/**
+ * ccid_get_builtin_ccids - Populate a list of built-in CCIDs
+ * @ccid_array: pointer to copy into
+ * @array_len: value to return length into
+ *
+ * This function allocates memory - caller must see that it is freed after use.
+ */
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
{
- atomic_inc(&ccids_lockct);
- smp_mb__after_atomic_inc();
- spin_unlock_wait(&ccids_lock);
+ *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
+ if (*ccid_array == NULL)
+ return -ENOBUFS;
+
+ for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
+ (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
+ return 0;
}
-static inline void ccids_read_unlock(void)
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
{
- atomic_dec(&ccids_lockct);
-}
+ u8 *ccid_array, array_len;
+ int err = 0;
-#else
-#define ccids_write_lock() do { } while(0)
-#define ccids_write_unlock() do { } while(0)
-#define ccids_read_lock() do { } while(0)
-#define ccids_read_unlock() do { } while(0)
-#endif
+ if (ccid_get_builtin_ccids(&ccid_array, &array_len))
+ return -ENOBUFS;
+
+ if (put_user(array_len, optlen))
+ err = -EFAULT;
+ else if (len > 0 && copy_to_user(optval, ccid_array,
+ len > array_len ? array_len : len))
+ err = -EFAULT;
-static struct kmem_cache *ccid_kmem_cache_create(int obj_size, const char *fmt,...)
+ kfree(ccid_array);
+ return err;
+}
+
+static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
{
struct kmem_cache *slab;
- char slab_name_fmt[32], *slab_name;
va_list args;
va_start(args, fmt);
- vsnprintf(slab_name_fmt, sizeof(slab_name_fmt), fmt, args);
+ vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
va_end(args);
- slab_name = kstrdup(slab_name_fmt, GFP_KERNEL);
- if (slab_name == NULL)
- return NULL;
- slab = kmem_cache_create(slab_name, sizeof(struct ccid) + obj_size, 0,
+ slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
- if (slab == NULL)
- kfree(slab_name);
return slab;
}
static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
{
- if (slab != NULL) {
- const char *name = kmem_cache_name(slab);
-
+ if (slab != NULL)
kmem_cache_destroy(slab);
- kfree(name);
- }
}
-int ccid_register(struct ccid_operations *ccid_ops)
+static int ccid_activate(struct ccid_operations *ccid_ops)
{
int err = -ENOBUFS;
ccid_ops->ccid_hc_rx_slab =
ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
+ ccid_ops->ccid_hc_rx_slab_name,
"ccid%u_hc_rx_sock",
ccid_ops->ccid_id);
if (ccid_ops->ccid_hc_rx_slab == NULL)
@@ -99,84 +113,46 @@ int ccid_register(struct ccid_operations *ccid_ops)
ccid_ops->ccid_hc_tx_slab =
ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
+ ccid_ops->ccid_hc_tx_slab_name,
"ccid%u_hc_tx_sock",
ccid_ops->ccid_id);
if (ccid_ops->ccid_hc_tx_slab == NULL)
goto out_free_rx_slab;
- ccids_write_lock();
- err = -EEXIST;
- if (ccids[ccid_ops->ccid_id] == NULL) {
- ccids[ccid_ops->ccid_id] = ccid_ops;
- err = 0;
- }
- ccids_write_unlock();
- if (err != 0)
- goto out_free_tx_slab;
-
- pr_info("CCID: Registered CCID %d (%s)\n",
+ pr_info("DCCP: Activated CCID %d (%s)\n",
ccid_ops->ccid_id, ccid_ops->ccid_name);
+ err = 0;
out:
return err;
-out_free_tx_slab:
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
- ccid_ops->ccid_hc_tx_slab = NULL;
- goto out;
out_free_rx_slab:
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
ccid_ops->ccid_hc_rx_slab = NULL;
goto out;
}
-EXPORT_SYMBOL_GPL(ccid_register);
-
-int ccid_unregister(struct ccid_operations *ccid_ops)
+static void ccid_deactivate(struct ccid_operations *ccid_ops)
{
- ccids_write_lock();
- ccids[ccid_ops->ccid_id] = NULL;
- ccids_write_unlock();
-
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
ccid_ops->ccid_hc_tx_slab = NULL;
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
ccid_ops->ccid_hc_rx_slab = NULL;
- pr_info("CCID: Unregistered CCID %d (%s)\n",
+ pr_info("DCCP: Deactivated CCID %d (%s)\n",
ccid_ops->ccid_id, ccid_ops->ccid_name);
- return 0;
}
-EXPORT_SYMBOL_GPL(ccid_unregister);
-
-struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
{
- struct ccid_operations *ccid_ops;
+ struct ccid_operations *ccid_ops = ccid_by_number(id);
struct ccid *ccid = NULL;
- ccids_read_lock();
-#ifdef CONFIG_KMOD
- if (ccids[id] == NULL) {
- /* We only try to load if in process context */
- ccids_read_unlock();
- if (gfp & GFP_ATOMIC)
- goto out;
- request_module("net-dccp-ccid-%d", id);
- ccids_read_lock();
- }
-#endif
- ccid_ops = ccids[id];
if (ccid_ops == NULL)
- goto out_unlock;
-
- if (!try_module_get(ccid_ops->ccid_owner))
- goto out_unlock;
-
- ccids_read_unlock();
+ goto out;
ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, gfp);
+ ccid_ops->ccid_hc_tx_slab, gfp_any());
if (ccid == NULL)
- goto out_module_put;
+ goto out;
ccid->ccid_ops = ccid_ops;
if (rx) {
memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
@@ -191,67 +167,57 @@ struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
}
out:
return ccid;
-out_unlock:
- ccids_read_unlock();
- goto out;
out_free_ccid:
kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
ccid_ops->ccid_hc_tx_slab, ccid);
ccid = NULL;
-out_module_put:
- module_put(ccid_ops->ccid_owner);
goto out;
}
-EXPORT_SYMBOL_GPL(ccid_new);
-
-struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
{
- return ccid_new(id, sk, 1, gfp);
+ if (ccid != NULL) {
+ if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
+ ccid->ccid_ops->ccid_hc_rx_exit(sk);
+ kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
+ }
}
-EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
-
-struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp)
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
{
- return ccid_new(id, sk, 0, gfp);
+ if (ccid != NULL) {
+ if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
+ ccid->ccid_ops->ccid_hc_tx_exit(sk);
+ kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
+ }
}
-EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
-
-static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
+int __init ccid_initialize_builtins(void)
{
- struct ccid_operations *ccid_ops;
+ int i, err = tfrc_lib_init();
- if (ccid == NULL)
- return;
+ if (err)
+ return err;
- ccid_ops = ccid->ccid_ops;
- if (rx) {
- if (ccid_ops->ccid_hc_rx_exit != NULL)
- ccid_ops->ccid_hc_rx_exit(sk);
- kmem_cache_free(ccid_ops->ccid_hc_rx_slab, ccid);
- } else {
- if (ccid_ops->ccid_hc_tx_exit != NULL)
- ccid_ops->ccid_hc_tx_exit(sk);
- kmem_cache_free(ccid_ops->ccid_hc_tx_slab, ccid);
+ for (i = 0; i < ARRAY_SIZE(ccids); i++) {
+ err = ccid_activate(ccids[i]);
+ if (err)
+ goto unwind_registrations;
}
- ccids_read_lock();
- if (ccids[ccid_ops->ccid_id] != NULL)
- module_put(ccid_ops->ccid_owner);
- ccids_read_unlock();
-}
+ return 0;
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
-{
- ccid_delete(ccid, sk, 1);
+unwind_registrations:
+ while(--i >= 0)
+ ccid_deactivate(ccids[i]);
+ tfrc_lib_exit();
+ return err;
}
-EXPORT_SYMBOL_GPL(ccid_hc_rx_delete);
-
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
+void ccid_cleanup_builtins(void)
{
- ccid_delete(ccid, sk, 0);
-}
+ int i;
-EXPORT_SYMBOL_GPL(ccid_hc_tx_delete);
+ for (i = 0; i < ARRAY_SIZE(ccids); i++)
+ ccid_deactivate(ccids[i]);
+ tfrc_lib_exit();
+}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index fdeae7b5731..6eb837a47b5 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -19,7 +19,9 @@
#include <linux/list.h>
#include <linux/module.h>
-#define CCID_MAX 255
+/* maximum value for a CCID (RFC 4340, 19.5) */
+#define CCID_MAX 255
+#define CCID_SLAB_NAME_LENGTH 32
struct tcp_info;
@@ -29,7 +31,6 @@ struct tcp_info;
* @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
* @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
* @ccid_name: alphabetical identifier string for @ccid_id
- * @ccid_owner: module which implements/owns this CCID
* @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
* @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
*
@@ -48,9 +49,10 @@ struct ccid_operations {
unsigned char ccid_id;
__u32 ccid_ccmps;
const char *ccid_name;
- struct module *ccid_owner;
struct kmem_cache *ccid_hc_rx_slab,
*ccid_hc_tx_slab;
+ char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
+ char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
__u32 ccid_hc_rx_obj_size,
ccid_hc_tx_obj_size;
/* Interface Routines */
@@ -60,22 +62,18 @@ struct ccid_operations {
void (*ccid_hc_tx_exit)(struct sock *sk);
void (*ccid_hc_rx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_rx_insert_options)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_tx_send_packet)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_sent)(struct sock *sk,
- int more, unsigned int len);
+ unsigned int len);
void (*ccid_hc_rx_get_info)(struct sock *sk,
struct tcp_info *info);
void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -90,8 +88,13 @@ struct ccid_operations {
int __user *optlen);
};
-extern int ccid_register(struct ccid_operations *ccid_ops);
-extern int ccid_unregister(struct ccid_operations *ccid_ops);
+extern struct ccid_operations ccid2_ops;
+#ifdef CONFIG_IP_DCCP_CCID3
+extern struct ccid_operations ccid3_ops;
+#endif
+
+int ccid_initialize_builtins(void);
+void ccid_cleanup_builtins(void);
struct ccid {
struct ccid_operations *ccid_ops;
@@ -103,31 +106,76 @@ static inline void *ccid_priv(const struct ccid *ccid)
return (void *)ccid->ccid_priv;
}
-extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
- gfp_t gfp);
+bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *, int __user *);
+
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
+
+static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
+{
+ struct ccid *ccid = dp->dccps_hc_rx_ccid;
-extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
- gfp_t gfp);
-extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
- gfp_t gfp);
+ if (ccid == NULL || ccid->ccid_ops == NULL)
+ return -1;
+ return ccid->ccid_ops->ccid_id;
+}
+
+static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
+{
+ struct ccid *ccid = dp->dccps_hc_tx_ccid;
-extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
-extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+ if (ccid == NULL || ccid->ccid_ops == NULL)
+ return -1;
+ return ccid->ccid_ops->ccid_id;
+}
+
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+ CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
+ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
+ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
+ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
+ CCID_PACKET_ERR = 0xF0000, /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+ if (return_code < 0)
+ return CCID_PACKET_ERR;
+ if (return_code == 0)
+ return CCID_PACKET_SEND_AT_ONCE;
+ if (return_code <= CCID_PACKET_DELAY_MAX)
+ return CCID_PACKET_DELAY;
+ return return_code;
+}
static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
- int rc = 0;
if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
- return rc;
+ return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+ return CCID_PACKET_SEND_AT_ONCE;
}
static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- int more, unsigned int len)
+ unsigned int len)
{
if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
+ ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
}
static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -144,27 +192,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
}
+/**
+ * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
- value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
}
+/**
+ * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
}
static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
@@ -194,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
- if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
@@ -205,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
- if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 12275943eab..8ba3fc9d6d1 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,119 +1,54 @@
-menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
- depends on EXPERIMENTAL
-
-config IP_DCCP_CCID2
- tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
- def_tristate IP_DCCP
- select IP_DCCP_ACKVEC
- ---help---
- CCID 2, TCP-like Congestion Control, denotes Additive Increase,
- Multiplicative Decrease (AIMD) congestion control with behavior
- modelled directly on TCP, including congestion window, slow start,
- timeouts, and so forth [RFC 2581]. CCID 2 achieves maximum
- bandwidth over the long term, consistent with the use of end-to-end
- congestion control, but halves its congestion window in response to
- each congestion event. This leads to the abrupt rate changes
- typical of TCP. Applications should use CCID 2 if they prefer
- maximum bandwidth utilization to steadiness of rate. This is often
- the case for applications that are not playing their data directly
- to the user. For example, a hypothetical application that
- transferred files over DCCP, using application-level retransmissions
- for lost packets, would prefer CCID 2 to CCID 3. On-line games may
- also prefer CCID 2. See RFC 4341 for further details.
-
- CCID2 is the default CCID used by DCCP.
+menu "DCCP CCIDs Configuration"
config IP_DCCP_CCID2_DEBUG
- bool "CCID2 debugging messages"
- depends on IP_DCCP_CCID2
- ---help---
- Enable CCID2-specific debugging messages.
+ bool "CCID-2 debugging messages"
+ ---help---
+ Enable CCID-2 specific debugging messages.
- When compiling CCID2 as a module, this debugging output can
- additionally be toggled by setting the ccid2_debug module
- parameter to 0 or 1.
+ The debugging output can additionally be toggled by setting the
+ ccid2_debug parameter to 0 or 1.
- If in doubt, say N.
+ If in doubt, say N.
config IP_DCCP_CCID3
- tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
- def_tristate IP_DCCP
- select IP_DCCP_TFRC_LIB
+ bool "CCID-3 (TCP-Friendly)"
+ def_bool y if (IP_DCCP = y || IP_DCCP = m)
---help---
- CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+ CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
rate-controlled congestion control mechanism. TFRC is designed to
be reasonably fair when competing for bandwidth with TCP-like flows,
where a flow is "reasonably fair" if its sending rate is generally
within a factor of two of the sending rate of a TCP flow under the
same conditions. However, TFRC has a much lower variation of
- throughput over time compared with TCP, which makes CCID 3 more
- suitable than CCID 2 for applications such streaming media where a
+ throughput over time compared with TCP, which makes CCID-3 more
+ suitable than CCID-2 for applications such streaming media where a
relatively smooth sending rate is of importance.
- CCID 3 is further described in RFC 4342,
+ CCID-3 is further described in RFC 4342,
http://www.ietf.org/rfc/rfc4342.txt
The TFRC congestion control algorithms were initially described in
- RFC 3448.
+ RFC 5348.
This text was extracted from RFC 4340 (sec. 10.2),
http://www.ietf.org/rfc/rfc4340.txt
-
- To compile this CCID as a module, choose M here: the module will be
- called dccp_ccid3.
- If in doubt, say M.
+ If in doubt, say N.
config IP_DCCP_CCID3_DEBUG
- bool "CCID3 debugging messages"
- depends on IP_DCCP_CCID3
- ---help---
- Enable CCID3-specific debugging messages.
-
- When compiling CCID3 as a module, this debugging output can
- additionally be toggled by setting the ccid3_debug module
- parameter to 0 or 1.
-
- If in doubt, say N.
-
-config IP_DCCP_CCID3_RTO
- int "Use higher bound for nofeedback timer"
- default 100
- depends on IP_DCCP_CCID3 && EXPERIMENTAL
- ---help---
- Use higher lower bound for nofeedback timer expiration.
-
- The TFRC nofeedback timer normally expires after the maximum of 4
- RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
- with a small RTT this can mean a high processing load and reduced
- performance, since then the nofeedback timer is triggered very
- frequently.
-
- This option enables to set a higher lower bound for the nofeedback
- value. Values in units of milliseconds can be set here.
-
- A value of 0 disables this feature by enforcing the value specified
- in RFC 3448. The following values have been suggested as bounds for
- experimental use:
- * 16-20ms to match the typical multimedia inter-frame interval
- * 100ms as a reasonable compromise [default]
- * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
+ bool "CCID-3 debugging messages"
+ depends on IP_DCCP_CCID3
+ ---help---
+ Enable CCID-3 specific debugging messages.
- The default of 100ms is a compromise between a large value for
- efficient DCCP implementations, and a small value to avoid disrupting
- the network in times of congestion.
+ The debugging output can additionally be toggled by setting the
+ ccid3_debug parameter to 0 or 1.
- The purpose of the nofeedback timer is to slow DCCP down when there
- is serious network congestion: experimenting with larger values should
- therefore not be performed on WANs.
+ If in doubt, say N.
config IP_DCCP_TFRC_LIB
- tristate
- default n
+ def_bool y if IP_DCCP_CCID3
config IP_DCCP_TFRC_DEBUG
- bool
- depends on IP_DCCP_TFRC_LIB
- default y if IP_DCCP_CCID3_DEBUG
-
+ def_bool y if IP_DCCP_CCID3_DEBUG
endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
deleted file mode 100644
index 438f20bccff..00000000000
--- a/net/dccp/ccids/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
-
-dccp_ccid3-y := ccid3.o
-
-obj-$(CONFIG_IP_DCCP_CCID2) += dccp_ccid2.o
-
-dccp_ccid2-y := ccid2.o
-
-obj-y += lib/
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index b5b52ebb269..f053198e730 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -1,6 +1,4 @@
/*
- * net/dccp/ccids/ccid2.c
- *
* Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* Changes to meet Linux coding standards, and DCCP infrastructure fixes.
@@ -25,70 +23,26 @@
/*
* This implementation should follow RFC 4341
*/
-
-#include "../ccid.h"
-#include "../dccp.h"
+#include <linux/slab.h>
+#include "../feat.h"
#include "ccid2.h"
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-static int ccid2_debug;
+static bool ccid2_debug;
#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
-{
- int len = 0;
- int pipe = 0;
- struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
-
- /* there is data in the chain */
- if (seqp != hctx->ccid2hctx_seqt) {
- seqp = seqp->ccid2s_prev;
- len++;
- if (!seqp->ccid2s_acked)
- pipe++;
-
- while (seqp != hctx->ccid2hctx_seqt) {
- struct ccid2_seq *prev = seqp->ccid2s_prev;
-
- len++;
- if (!prev->ccid2s_acked)
- pipe++;
-
- /* packets are sent sequentially */
- BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
- prev->ccid2s_seq ) >= 0);
- BUG_ON(time_before(seqp->ccid2s_sent,
- prev->ccid2s_sent));
-
- seqp = prev;
- }
- }
-
- BUG_ON(pipe != hctx->ccid2hctx_pipe);
- ccid2_pr_debug("len of chain=%d\n", len);
-
- do {
- seqp = seqp->ccid2s_prev;
- len++;
- } while (seqp != hctx->ccid2hctx_seqh);
-
- ccid2_pr_debug("total len=%d\n", len);
- BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
-}
#else
#define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hctx)
#endif
-static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
+static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
{
struct ccid2_seq *seqp;
int i;
/* check if we have space to preserve the pointer to the buffer */
- if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
- sizeof(struct ccid2_seq*)))
+ if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
+ sizeof(struct ccid2_seq *)))
return -ENOMEM;
/* allocate buffer and initialize linked list */
@@ -104,38 +58,34 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
/* This is the first allocation. Initiate the head and tail. */
- if (hctx->ccid2hctx_seqbufc == 0)
- hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
+ if (hc->tx_seqbufc == 0)
+ hc->tx_seqh = hc->tx_seqt = seqp;
else {
/* link the existing list with the one we just created */
- hctx->ccid2hctx_seqh->ccid2s_next = seqp;
- seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
+ hc->tx_seqh->ccid2s_next = seqp;
+ seqp->ccid2s_prev = hc->tx_seqh;
- hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
- seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt;
+ hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+ seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
}
/* store the original pointer to the buffer so we can free it */
- hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
- hctx->ccid2hctx_seqbufc++;
+ hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
+ hc->tx_seqbufc++;
return 0;
}
static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
- if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
- return 0;
-
- return 1; /* XXX CCID should dequeue when ready instead of polling */
+ if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+ return CCID_PACKET_WILL_DEQUEUE_LATER;
+ return CCID_PACKET_SEND_AT_ONCE;
}
static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
{
- struct dccp_sock *dp = dccp_sk(sk);
- u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
+ u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
/*
* Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
@@ -147,109 +97,186 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
val = max_ratio;
}
- if (val > 0xFFFF) /* RFC 4340, 11.3 */
- val = 0xFFFF;
+ dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
+ min_t(u32, val, DCCPF_ACK_RATIO_MAX));
+}
- if (val == dp->dccps_l_ack_ratio)
- return;
+static void ccid2_check_l_ack_ratio(struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- ccid2_pr_debug("changing local ack ratio to %u\n", val);
- dp->dccps_l_ack_ratio = val;
+ /*
+ * After a loss, idle period, application limited period, or RTO we
+ * need to check that the ack ratio is still less than the congestion
+ * window. Otherwise, we will send an entire congestion window of
+ * packets and got no response because we haven't sent ack ratio
+ * packets yet.
+ * If the ack ratio does need to be reduced, we reduce it to half of
+ * the congestion window (or 1 if that's zero) instead of to the
+ * congestion window. This prevents problems if one ack is lost.
+ */
+ if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
+ ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
}
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
+static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
{
- ccid2_pr_debug("change SRTT to %ld\n", val);
- hctx->ccid2hctx_srtt = val;
+ dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
+ clamp_val(val, DCCPF_SEQ_WMIN,
+ DCCPF_SEQ_WMAX));
}
-static void ccid2_start_rto_timer(struct sock *sk);
-
static void ccid2_hc_tx_rto_expire(unsigned long data)
{
struct sock *sk = (struct sock *)data;
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
- long s;
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
- sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
- jiffies + HZ / 5);
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
goto out;
}
ccid2_pr_debug("RTO_EXPIRE\n");
- ccid2_hc_tx_check_sanity(hctx);
-
/* back-off timer */
- hctx->ccid2hctx_rto <<= 1;
-
- s = hctx->ccid2hctx_rto / HZ;
- if (s > 60)
- hctx->ccid2hctx_rto = 60 * HZ;
-
- ccid2_start_rto_timer(sk);
+ hc->tx_rto <<= 1;
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
/* adjust pipe, cwnd etc */
- hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
- if (hctx->ccid2hctx_ssthresh < 2)
- hctx->ccid2hctx_ssthresh = 2;
- hctx->ccid2hctx_cwnd = 1;
- hctx->ccid2hctx_pipe = 0;
+ hc->tx_ssthresh = hc->tx_cwnd / 2;
+ if (hc->tx_ssthresh < 2)
+ hc->tx_ssthresh = 2;
+ hc->tx_cwnd = 1;
+ hc->tx_pipe = 0;
/* clear state about stuff we sent */
- hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
- hctx->ccid2hctx_packets_acked = 0;
+ hc->tx_seqt = hc->tx_seqh;
+ hc->tx_packets_acked = 0;
/* clear ack ratio state. */
- hctx->ccid2hctx_rpseq = 0;
- hctx->ccid2hctx_rpdupack = -1;
+ hc->tx_rpseq = 0;
+ hc->tx_rpdupack = -1;
ccid2_change_l_ack_ratio(sk, 1);
- ccid2_hc_tx_check_sanity(hctx);
+
+ /* if we were blocked before, we may now send cwnd=1 packet */
+ if (sender_was_blocked)
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ /* restart backed-off timer */
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
out:
bh_unlock_sock(sk);
sock_put(sk);
}
-static void ccid2_start_rto_timer(struct sock *sk)
+/*
+ * Congestion window validation (RFC 2861).
+ */
+static bool ccid2_do_cwv = true;
+module_param(ccid2_do_cwv, bool, 0644);
+MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
+
+/**
+ * ccid2_update_used_window - Track how much of cwnd is actually used
+ * This is done in addition to CWV. The sender needs to have an idea of how many
+ * packets may be in flight, to set the local Sequence Window value accordingly
+ * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
+ * maximum-used window. We use an EWMA low-pass filter to filter out noise.
+ */
+static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4;
+}
- ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
+/* This borrows the code of tcp_cwnd_application_limited() */
+static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ /* don't reduce cwnd below the initial window (IW) */
+ u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache),
+ win_used = max(hc->tx_cwnd_used, init_win);
+
+ if (win_used < hc->tx_cwnd) {
+ hc->tx_ssthresh = max(hc->tx_ssthresh,
+ (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2));
+ hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1;
+ }
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
- BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
- sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
- jiffies + hctx->ccid2hctx_rto);
+ ccid2_check_l_ack_ratio(sk);
}
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+/* This borrows the code of tcp_cwnd_restart() */
+static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ u32 cwnd = hc->tx_cwnd, restart_cwnd,
+ iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
+
+ hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
+
+ /* don't reduce cwnd below the initial window (IW) */
+ restart_cwnd = min(cwnd, iwnd);
+ cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto;
+ hc->tx_cwnd = max(cwnd, restart_cwnd);
+
+ hc->tx_cwnd_stamp = now;
+ hc->tx_cwnd_used = 0;
+
+ ccid2_check_l_ack_ratio(sk);
+}
+
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const u32 now = ccid2_time_stamp;
struct ccid2_seq *next;
- hctx->ccid2hctx_pipe++;
+ /* slow-start after idle periods (RFC 2581, RFC 2861) */
+ if (ccid2_do_cwv && !hc->tx_pipe &&
+ (s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
+ ccid2_cwnd_restart(sk, now);
- hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss;
- hctx->ccid2hctx_seqh->ccid2s_acked = 0;
- hctx->ccid2hctx_seqh->ccid2s_sent = jiffies;
+ hc->tx_lsndtime = now;
+ hc->tx_pipe += 1;
- next = hctx->ccid2hctx_seqh->ccid2s_next;
+ /* see whether cwnd was fully used (RFC 2861), update expected window */
+ if (ccid2_cwnd_network_limited(hc)) {
+ ccid2_update_used_window(hc, hc->tx_cwnd);
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
+ } else {
+ if (hc->tx_pipe > hc->tx_cwnd_used)
+ hc->tx_cwnd_used = hc->tx_pipe;
+
+ ccid2_update_used_window(hc, hc->tx_cwnd_used);
+
+ if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
+ ccid2_cwnd_application_limited(sk, now);
+ }
+
+ hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
+ hc->tx_seqh->ccid2s_acked = 0;
+ hc->tx_seqh->ccid2s_sent = now;
+
+ next = hc->tx_seqh->ccid2s_next;
/* check if we need to alloc more space */
- if (next == hctx->ccid2hctx_seqt) {
- if (ccid2_hc_tx_alloc_seq(hctx)) {
+ if (next == hc->tx_seqt) {
+ if (ccid2_hc_tx_alloc_seq(hc)) {
DCCP_CRIT("packet history - out of memory!");
/* FIXME: find a more graceful way to bail out */
return;
}
- next = hctx->ccid2hctx_seqh->ccid2s_next;
- BUG_ON(next == hctx->ccid2hctx_seqt);
+ next = hc->tx_seqh->ccid2s_next;
+ BUG_ON(next == hc->tx_seqt);
}
- hctx->ccid2hctx_seqh = next;
+ hc->tx_seqh = next;
- ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
- hctx->ccid2hctx_pipe);
+ ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
/*
* FIXME: The code below is broken and the variables have been removed
@@ -272,12 +299,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
*/
#if 0
/* Ack Ratio. Need to maintain a concept of how many windows we sent */
- hctx->ccid2hctx_arsent++;
+ hc->tx_arsent++;
/* We had an ack loss in this window... */
- if (hctx->ccid2hctx_ackloss) {
- if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
- hctx->ccid2hctx_arsent = 0;
- hctx->ccid2hctx_ackloss = 0;
+ if (hc->tx_ackloss) {
+ if (hc->tx_arsent >= hc->tx_cwnd) {
+ hc->tx_arsent = 0;
+ hc->tx_ackloss = 0;
}
} else {
/* No acks lost up to now... */
@@ -287,234 +314,203 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
dp->dccps_l_ack_ratio;
- denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
+ denom = hc->tx_cwnd * hc->tx_cwnd / denom;
- if (hctx->ccid2hctx_arsent >= denom) {
+ if (hc->tx_arsent >= denom) {
ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
- hctx->ccid2hctx_arsent = 0;
+ hc->tx_arsent = 0;
}
} else {
/* we can't increase ack ratio further [1] */
- hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
+ hc->tx_arsent = 0; /* or maybe set it to cwnd*/
}
}
#endif
- /* setup RTO timer */
- if (!timer_pending(&hctx->ccid2hctx_rtotimer))
- ccid2_start_rto_timer(sk);
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
do {
- struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
+ struct ccid2_seq *seqp = hc->tx_seqt;
- while (seqp != hctx->ccid2hctx_seqh) {
- ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
+ while (seqp != hc->tx_seqh) {
+ ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
(unsigned long long)seqp->ccid2s_seq,
seqp->ccid2s_acked, seqp->ccid2s_sent);
seqp = seqp->ccid2s_next;
}
} while (0);
ccid2_pr_debug("=========\n");
- ccid2_hc_tx_check_sanity(hctx);
#endif
}
-/* XXX Lame code duplication!
- * returns -1 if none was found.
- * else returns the next offset to use in the function call.
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ * which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
*/
-static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
- unsigned char **vec, unsigned char *veclen)
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- unsigned char opt, len;
- unsigned char *value;
-
- BUG_ON(offset < 0);
- options += offset;
- opt_ptr = options;
- if (opt_ptr >= opt_end)
- return -1;
-
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_invalid_option;
-
- len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ long m = mrtt ? : 1;
+
+ if (hc->tx_srtt == 0) {
+ /* First measurement m */
+ hc->tx_srtt = m << 3;
+ hc->tx_mdev = m << 1;
+
+ hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+ hc->tx_rttvar = hc->tx_mdev_max;
+
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ } else {
+ /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+ m -= (hc->tx_srtt >> 3);
+ hc->tx_srtt += m;
+
+ /* Similarly, update scaled mdev with regard to |m| */
+ if (m < 0) {
+ m = -m;
+ m -= (hc->tx_mdev >> 2);
/*
- * Remove the type and len fields, leaving
- * just the value size
+ * This neutralises RTO increase when RTT < SRTT - mdev
+ * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+ * in Linux TCP", USENIX 2002, pp. 49-62).
*/
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (hc->tx_mdev >> 2);
+ }
+ hc->tx_mdev += m;
- if (opt_ptr > opt_end)
- goto out_invalid_option;
+ if (hc->tx_mdev > hc->tx_mdev_max) {
+ hc->tx_mdev_max = hc->tx_mdev;
+ if (hc->tx_mdev_max > hc->tx_rttvar)
+ hc->tx_rttvar = hc->tx_mdev_max;
}
- switch (opt) {
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- *vec = value;
- *veclen = len;
- return offset + (opt_ptr - options);
+ /*
+ * Decay RTTVAR at most once per flight, exploiting that
+ * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
+ * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
+ * GAR is a useful bound for FlightSize = pipe.
+ * AWL is probably too low here, as it over-estimates pipe.
+ */
+ if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+ if (hc->tx_mdev_max < hc->tx_rttvar)
+ hc->tx_rttvar -= (hc->tx_rttvar -
+ hc->tx_mdev_max) >> 2;
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ hc->tx_mdev_max = tcp_rto_min(sk);
}
}
- return -1;
-
-out_invalid_option:
- DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
- return -1;
-}
-
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ /*
+ * Set RTO from SRTT and RTTVAR
+ * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+ * This agrees with RFC 4341, 5:
+ * "Because DCCP does not retransmit data, DCCP does not require
+ * TCP's recommended minimum timeout of one second".
+ */
+ hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
- sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
- ccid2_pr_debug("deleted RTO timer\n");
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
}
-static inline void ccid2_new_ack(struct sock *sk,
- struct ccid2_seq *seqp,
- unsigned int *maxincr)
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+ unsigned int *maxincr)
{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
- if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
- if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
- hctx->ccid2hctx_cwnd += 1;
- *maxincr -= 1;
- hctx->ccid2hctx_packets_acked = 0;
- }
- } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
- hctx->ccid2hctx_cwnd += 1;
- hctx->ccid2hctx_packets_acked = 0;
- }
-
- /* update RTO */
- if (hctx->ccid2hctx_srtt == -1 ||
- time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
- unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
- int s;
-
- /* first measurement */
- if (hctx->ccid2hctx_srtt == -1) {
- ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
- r, jiffies,
- (unsigned long long)seqp->ccid2s_seq);
- ccid2_change_srtt(hctx, r);
- hctx->ccid2hctx_rttvar = r >> 1;
- } else {
- /* RTTVAR */
- long tmp = hctx->ccid2hctx_srtt - r;
- long srtt;
-
- if (tmp < 0)
- tmp *= -1;
-
- tmp >>= 2;
- hctx->ccid2hctx_rttvar *= 3;
- hctx->ccid2hctx_rttvar >>= 2;
- hctx->ccid2hctx_rttvar += tmp;
-
- /* SRTT */
- srtt = hctx->ccid2hctx_srtt;
- srtt *= 7;
- srtt >>= 3;
- tmp = r >> 3;
- srtt += tmp;
- ccid2_change_srtt(hctx, srtt);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
+
+ if (hc->tx_cwnd < dp->dccps_l_seq_win &&
+ r_seq_used < dp->dccps_r_seq_win) {
+ if (hc->tx_cwnd < hc->tx_ssthresh) {
+ if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
+ hc->tx_cwnd += 1;
+ *maxincr -= 1;
+ hc->tx_packets_acked = 0;
+ }
+ } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
+ hc->tx_cwnd += 1;
+ hc->tx_packets_acked = 0;
}
- s = hctx->ccid2hctx_rttvar << 2;
- /* clock granularity is 1 when based on jiffies */
- if (!s)
- s = 1;
- hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
-
- /* must be at least a second */
- s = hctx->ccid2hctx_rto / HZ;
- /* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
- if (s < 1)
- hctx->ccid2hctx_rto = HZ;
-#endif
- /* max 60 seconds */
- if (s > 60)
- hctx->ccid2hctx_rto = HZ * 60;
-
- hctx->ccid2hctx_lastrtt = jiffies;
-
- ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
- hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
- hctx->ccid2hctx_rto, HZ, r);
}
- /* we got a new ack, so re-start RTO timer */
- ccid2_hc_tx_kill_rto_timer(sk);
- ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ /*
+ * Adjust the local sequence window and the ack ratio to allow about
+ * 5 times the number of packets in the network (RFC 4340 7.5.2)
+ */
+ if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
+ else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
- if (hctx->ccid2hctx_pipe == 0)
- DCCP_BUG("pipe == 0");
- else
- hctx->ccid2hctx_pipe--;
+ if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
+ else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
- if (hctx->ccid2hctx_pipe == 0)
- ccid2_hc_tx_kill_rto_timer(sk);
+ /*
+ * FIXME: RTT is sampled several times per acknowledgment (for each
+ * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+ * This causes the RTT to be over-estimated, since the older entries
+ * in the Ack Vector have earlier sending times.
+ * The cleanest solution is to not use the ccid2s_sent field at all
+ * and instead use DCCP timestamps: requires changes in other places.
+ */
+ ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
}
static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
+ if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
return;
}
- hctx->ccid2hctx_last_cong = jiffies;
+ hc->tx_last_cong = ccid2_time_stamp;
+
+ hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
+ hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
- hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U;
- hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U);
+ ccid2_check_l_ack_ratio(sk);
+}
+
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
- if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
- ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
+ switch (option) {
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+ option - DCCPO_ACK_VECTOR_0);
+ }
+ return 0;
}
static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+ struct dccp_ackvec_parsed *avp;
u64 ackno, seqno;
struct ccid2_seq *seqp;
- unsigned char *vector;
- unsigned char veclen;
- int offset = 0;
int done = 0;
unsigned int maxincr = 0;
- ccid2_hc_tx_check_sanity(hctx);
/* check reverse path congestion */
seqno = DCCP_SKB_CB(skb)->dccpd_seq;
@@ -523,49 +519,52 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* -sorbo.
*/
/* need to bootstrap */
- if (hctx->ccid2hctx_rpdupack == -1) {
- hctx->ccid2hctx_rpdupack = 0;
- hctx->ccid2hctx_rpseq = seqno;
+ if (hc->tx_rpdupack == -1) {
+ hc->tx_rpdupack = 0;
+ hc->tx_rpseq = seqno;
} else {
/* check if packet is consecutive */
- if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1)
- hctx->ccid2hctx_rpseq = seqno;
+ if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
+ hc->tx_rpseq = seqno;
/* it's a later packet */
- else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
- hctx->ccid2hctx_rpdupack++;
+ else if (after48(seqno, hc->tx_rpseq)) {
+ hc->tx_rpdupack++;
/* check if we got enough dupacks */
- if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
- hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
- hctx->ccid2hctx_rpseq = 0;
-
+ if (hc->tx_rpdupack >= NUMDUPACK) {
+ hc->tx_rpdupack = -1; /* XXX lame */
+ hc->tx_rpseq = 0;
+#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
+ /*
+ * FIXME: Ack Congestion Control is broken; in
+ * the current state instabilities occurred with
+ * Ack Ratios greater than 1; causing hang-ups
+ * and long RTO timeouts. This needs to be fixed
+ * before opening up dynamic changes. -- gerrit
+ */
ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
+#endif
}
}
}
/* check forward path congestion */
- /* still didn't send out new data packets */
- if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
+ if (dccp_packet_without_ack(skb))
return;
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_ACK:
- case DCCP_PKT_DATAACK:
- break;
- default:
- return;
- }
+ /* still didn't send out new data packets */
+ if (hc->tx_seqh == hc->tx_seqt)
+ goto done;
ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
- if (after48(ackno, hctx->ccid2hctx_high_ack))
- hctx->ccid2hctx_high_ack = ackno;
+ if (after48(ackno, hc->tx_high_ack))
+ hc->tx_high_ack = ackno;
- seqp = hctx->ccid2hctx_seqt;
+ seqp = hc->tx_seqt;
while (before48(seqp->ccid2s_seq, ackno)) {
seqp = seqp->ccid2s_next;
- if (seqp == hctx->ccid2hctx_seqh) {
- seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+ if (seqp == hc->tx_seqh) {
+ seqp = hc->tx_seqh->ccid2s_prev;
break;
}
}
@@ -575,26 +574,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* packets per acknowledgement. Rounding up avoids that cwnd is not
* advanced when Ack Ratio is 1 and gives a slight edge otherwise.
*/
- if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
+ if (hc->tx_cwnd < hc->tx_ssthresh)
maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
/* go through all ack vectors */
- while ((offset = ccid2_ackvector(sk, skb, offset,
- &vector, &veclen)) != -1) {
+ list_for_each_entry(avp, &hc->tx_av_chunks, node) {
/* go through this ack vector */
- while (veclen--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl = SUB48(ackno, rl);
+ for (; avp->len--; avp->vec++) {
+ u64 ackno_end_rl = SUB48(ackno,
+ dccp_ackvec_runlen(avp->vec));
- ccid2_pr_debug("ackvec start:%llu end:%llu\n",
+ ccid2_pr_debug("ackvec %llu |%u,%u|\n",
(unsigned long long)ackno,
- (unsigned long long)ackno_end_rl);
+ dccp_ackvec_state(avp->vec) >> 6,
+ dccp_ackvec_runlen(avp->vec));
/* if the seqno we are analyzing is larger than the
* current ackno, then move towards the tail of our
* seqnos.
*/
while (after48(seqp->ccid2s_seq, ackno)) {
- if (seqp == hctx->ccid2hctx_seqt) {
+ if (seqp == hc->tx_seqt) {
done = 1;
break;
}
@@ -607,26 +606,24 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* run length
*/
while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
- const u8 state = *vector &
- DCCP_ACKVEC_STATE_MASK;
+ const u8 state = dccp_ackvec_state(avp->vec);
/* new packet received or marked */
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+ if (state != DCCPAV_NOT_RECEIVED &&
!seqp->ccid2s_acked) {
- if (state ==
- DCCP_ACKVEC_STATE_ECN_MARKED) {
+ if (state == DCCPAV_ECN_MARKED)
ccid2_congestion_event(sk,
seqp);
- } else
+ else
ccid2_new_ack(sk, seqp,
&maxincr);
seqp->ccid2s_acked = 1;
ccid2_pr_debug("Got ack for %llu\n",
(unsigned long long)seqp->ccid2s_seq);
- ccid2_hc_tx_dec_pipe(sk);
+ hc->tx_pipe--;
}
- if (seqp == hctx->ccid2hctx_seqt) {
+ if (seqp == hc->tx_seqt) {
done = 1;
break;
}
@@ -636,7 +633,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
break;
ackno = SUB48(ackno_end_rl, 1);
- vector++;
}
if (done)
break;
@@ -645,11 +641,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
/* The state about what is acked should be correct now
* Check for NUMDUPACK
*/
- seqp = hctx->ccid2hctx_seqt;
- while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
+ seqp = hc->tx_seqt;
+ while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
seqp = seqp->ccid2s_next;
- if (seqp == hctx->ccid2hctx_seqh) {
- seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+ if (seqp == hc->tx_seqh) {
+ seqp = hc->tx_seqh->ccid2s_prev;
break;
}
}
@@ -660,7 +656,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
if (done == NUMDUPACK)
break;
}
- if (seqp == hctx->ccid2hctx_seqt)
+ if (seqp == hc->tx_seqt)
break;
seqp = seqp->ccid2s_prev;
}
@@ -681,125 +677,108 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* one ack vector.
*/
ccid2_congestion_event(sk, seqp);
- ccid2_hc_tx_dec_pipe(sk);
+ hc->tx_pipe--;
}
- if (seqp == hctx->ccid2hctx_seqt)
+ if (seqp == hc->tx_seqt)
break;
seqp = seqp->ccid2s_prev;
}
- hctx->ccid2hctx_seqt = last_acked;
+ hc->tx_seqt = last_acked;
}
/* trim acked packets in tail */
- while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
- if (!hctx->ccid2hctx_seqt->ccid2s_acked)
+ while (hc->tx_seqt != hc->tx_seqh) {
+ if (!hc->tx_seqt->ccid2s_acked)
break;
- hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
+ hc->tx_seqt = hc->tx_seqt->ccid2s_next;
}
- ccid2_hc_tx_check_sanity(hctx);
+ /* restart RTO timer if not all outstanding data has been acked */
+ if (hc->tx_pipe == 0)
+ sk_stop_timer(sk, &hc->tx_rtotimer);
+ else
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+done:
+ /* check if incoming Acks allow pending packets to be sent */
+ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
- struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid);
+ struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
struct dccp_sock *dp = dccp_sk(sk);
u32 max_ratio;
/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
- hctx->ccid2hctx_ssthresh = ~0U;
+ hc->tx_ssthresh = ~0U;
- /*
- * RFC 4341, 5: "The cwnd parameter is initialized to at most four
- * packets for new connections, following the rules from [RFC3390]".
- * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
- */
- hctx->ccid2hctx_cwnd = min(4U, max(2U, 4380U / dp->dccps_mss_cache));
+ /* Use larger initial windows (RFC 4341, section 5). */
+ hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+ hc->tx_expected_wnd = hc->tx_cwnd;
/* Make sure that Ack Ratio is enabled and within bounds. */
- max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
+ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
dp->dccps_l_ack_ratio = max_ratio;
/* XXX init ~ to window size... */
- if (ccid2_hc_tx_alloc_seq(hctx))
+ if (ccid2_hc_tx_alloc_seq(hc))
return -ENOMEM;
- hctx->ccid2hctx_rto = 3 * HZ;
- ccid2_change_srtt(hctx, -1);
- hctx->ccid2hctx_rttvar = -1;
- hctx->ccid2hctx_rpdupack = -1;
- hctx->ccid2hctx_last_cong = jiffies;
- setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
+ hc->tx_rto = DCCP_TIMEOUT_INIT;
+ hc->tx_rpdupack = -1;
+ hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
+ hc->tx_cwnd_used = 0;
+ setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
(unsigned long)sk);
-
- ccid2_hc_tx_check_sanity(hctx);
+ INIT_LIST_HEAD(&hc->tx_av_chunks);
return 0;
}
static void ccid2_hc_tx_exit(struct sock *sk)
{
- struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
int i;
- ccid2_hc_tx_kill_rto_timer(sk);
+ sk_stop_timer(sk, &hc->tx_rtotimer);
- for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
- kfree(hctx->ccid2hctx_seqbuf[i]);
- hctx->ccid2hctx_seqbufc = 0;
+ for (i = 0; i < hc->tx_seqbufc; i++)
+ kfree(hc->tx_seqbuf[i]);
+ hc->tx_seqbufc = 0;
}
static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_rx_sock *hcrx = ccid2_hc_rx_sk(sk);
-
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_DATA:
- case DCCP_PKT_DATAACK:
- hcrx->ccid2hcrx_data++;
- if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
- dccp_send_ack(sk);
- hcrx->ccid2hcrx_data = 0;
- }
- break;
+ struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
+
+ if (!dccp_data_packet(skb))
+ return;
+
+ if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
+ dccp_send_ack(sk);
+ hc->rx_num_data_pkts = 0;
}
}
-static struct ccid_operations ccid2 = {
- .ccid_id = DCCPC_CCID2,
- .ccid_name = "TCP-like",
- .ccid_owner = THIS_MODULE,
- .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
- .ccid_hc_tx_init = ccid2_hc_tx_init,
- .ccid_hc_tx_exit = ccid2_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
- .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
- .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
+struct ccid_operations ccid2_ops = {
+ .ccid_id = DCCPC_CCID2,
+ .ccid_name = "TCP-like",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
+ .ccid_hc_tx_init = ccid2_hc_tx_init,
+ .ccid_hc_tx_exit = ccid2_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
+ .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+ .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
+ .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
};
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-module_param(ccid2_debug, bool, 0444);
-MODULE_PARM_DESC(ccid2_debug, "Enable debug messages");
+module_param(ccid2_debug, bool, 0644);
+MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
#endif
-
-static __init int ccid2_module_init(void)
-{
- return ccid_register(&ccid2);
-}
-module_init(ccid2_module_init);
-
-static __exit void ccid2_module_exit(void)
-{
- ccid_unregister(&ccid2);
-}
-module_exit(ccid2_module_exit);
-
-MODULE_AUTHOR("Andrea Bittau <a.bittau@cs.ucl.ac.uk>");
-MODULE_DESCRIPTION("DCCP TCP-Like (CCID2) CCID");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("net-dccp-ccid-2");
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 2c94ca02901..18c97543e52 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -1,6 +1,4 @@
/*
- * net/dccp/ccids/ccid2.h
- *
* Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* This program is free software; you can redistribute it and/or modify
@@ -20,18 +18,23 @@
#ifndef _DCCP_CCID2_H_
#define _DCCP_CCID2_H_
-#include <linux/dccp.h>
#include <linux/timer.h>
#include <linux/types.h>
#include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible.
+ */
+#define ccid2_time_stamp tcp_time_stamp
+
/* NUMDUPACK parameter from RFC 4341, p. 6 */
#define NUMDUPACK 3
-struct sock;
-
struct ccid2_seq {
u64 ccid2s_seq;
- unsigned long ccid2s_sent;
+ u32 ccid2s_sent;
int ccid2s_acked;
struct ccid2_seq *ccid2s_prev;
struct ccid2_seq *ccid2s_next;
@@ -40,36 +43,82 @@ struct ccid2_seq {
#define CCID2_SEQBUF_LEN 1024
#define CCID2_SEQBUF_MAX 128
-/** struct ccid2_hc_tx_sock - CCID2 TX half connection
- *
- * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
- * @ccid2hctx_lastrtt -time RTT was last measured
- * @ccid2hctx_rpseq - last consecutive seqno
- * @ccid2hctx_rpdupack - dupacks since rpseq
-*/
+/*
+ * Multiple of congestion window to keep the sequence window at
+ * (RFC 4340 7.5.2)
+ */
+#define CCID2_WIN_CHANGE_FACTOR 5
+
+/**
+ * struct ccid2_hc_tx_sock - CCID2 TX half connection
+ * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
+ * @tx_srtt: smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev: smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max: maximum of @mdev during one flight
+ * @tx_rttvar: moving average/maximum of @mdev_max
+ * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq: to decay RTTVAR at most once per flight
+ * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
+ * @tx_expected_wnd: moving average of @tx_cwnd_used
+ * @tx_cwnd_stamp: to track idle periods in CWV
+ * @tx_lsndtime: last time (in jiffies) a data packet was sent
+ * @tx_rpseq: last consecutive seqno
+ * @tx_rpdupack: dupacks since rpseq
+ * @tx_av_chunks: list of Ack Vectors received on current skb
+ */
struct ccid2_hc_tx_sock {
- u32 ccid2hctx_cwnd;
- u32 ccid2hctx_ssthresh;
- u32 ccid2hctx_pipe;
- u32 ccid2hctx_packets_acked;
- struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
- int ccid2hctx_seqbufc;
- struct ccid2_seq *ccid2hctx_seqh;
- struct ccid2_seq *ccid2hctx_seqt;
- long ccid2hctx_rto;
- long ccid2hctx_srtt;
- long ccid2hctx_rttvar;
- unsigned long ccid2hctx_lastrtt;
- struct timer_list ccid2hctx_rtotimer;
- u64 ccid2hctx_rpseq;
- int ccid2hctx_rpdupack;
- unsigned long ccid2hctx_last_cong;
- u64 ccid2hctx_high_ack;
+ u32 tx_cwnd;
+ u32 tx_ssthresh;
+ u32 tx_pipe;
+ u32 tx_packets_acked;
+ struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX];
+ int tx_seqbufc;
+ struct ccid2_seq *tx_seqh;
+ struct ccid2_seq *tx_seqt;
+
+ /* RTT measurement: variables/principles are the same as in TCP */
+ u32 tx_srtt,
+ tx_mdev,
+ tx_mdev_max,
+ tx_rttvar,
+ tx_rto;
+ u64 tx_rtt_seq:48;
+ struct timer_list tx_rtotimer;
+
+ /* Congestion Window validation (optional, RFC 2861) */
+ u32 tx_cwnd_used,
+ tx_expected_wnd,
+ tx_cwnd_stamp,
+ tx_lsndtime;
+
+ u64 tx_rpseq;
+ int tx_rpdupack;
+ u32 tx_last_cong;
+ u64 tx_high_ack;
+ struct list_head tx_av_chunks;
};
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
+{
+ return hc->tx_pipe >= hc->tx_cwnd;
+}
+
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ * This is based on the numbers specified in RFC 5681, 3.1.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+ return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
+}
+
+/**
+ * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
+ * @rx_num_data_pkts: number of data packets received since last feedback
+ */
struct ccid2_hc_rx_sock {
- int ccid2hcrx_data;
+ u32 rx_num_data_pkts;
};
static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index e76f460af0e..119c04317d4 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -1,6 +1,4 @@
/*
- * net/dccp/ccids/ccid3.c
- *
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
@@ -40,7 +38,7 @@
#include <asm/unaligned.h>
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static int ccid3_debug;
+static bool ccid3_debug;
#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)
@@ -52,11 +50,10 @@ static int ccid3_debug;
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
- static char *ccid3_state_names[] = {
+ static const char *const ccid3_state_names[] = {
[TFRC_SSTATE_NO_SENT] = "NO_SENT",
[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
[TFRC_SSTATE_FBACK] = "FBACK",
- [TFRC_SSTATE_TERM] = "TERM",
};
return ccid3_state_names[state];
@@ -66,14 +63,14 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
static void ccid3_hc_tx_set_state(struct sock *sk,
enum ccid3_hc_tx_states state)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ enum ccid3_hc_tx_states oldstate = hc->tx_state;
ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
ccid3_tx_state_name(state));
WARN_ON(state == oldstate);
- hctx->ccid3hctx_state = state;
+ hc->tx_state = state;
}
/*
@@ -87,42 +84,36 @@ static void ccid3_hc_tx_set_state(struct sock *sk,
*/
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- const __u32 w_init = min_t(__u32, 4 * hctx->ccid3hctx_s,
- max_t(__u32, 2 * hctx->ccid3hctx_s, 4380));
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s);
- return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
+ return scaled_div(w_init << 6, hc->tx_rtt);
}
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
*/
-static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
+static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
{
- /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
- hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
- hctx->ccid3hctx_x);
-
- /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
- hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
- TFRC_OPSYS_HALF_TIME_GRAN);
-
- ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
- hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
- hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
+ hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
+ DCCP_BUG_ON(hc->tx_t_ipi == 0);
+ ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+ hc->tx_s, (unsigned int)(hc->tx_x >> 6));
}
-static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
+static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
{
- u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
+ u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count);
- return delta / hctx->ccid3hctx_rtt;
+ return delta / hc->tx_rtt;
}
/**
* ccid3_hc_tx_update_x - Update allowed sending rate X
* @stamp: most recent time if available - can be left NULL.
+ *
* This function tracks draft rfc3448bis, check there for latest details.
*
* Note: X and X_recv are both stored in units of 64 * bytes/second, to support
@@ -132,9 +123,9 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
*/
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- __u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
- const __u64 old_x = hctx->ccid3hctx_x;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ __u64 min_rate = 2 * hc->tx_x_recv;
+ const __u64 old_x = hc->tx_x;
ktime_t now = stamp ? *stamp : ktime_get_real();
/*
@@ -143,80 +134,71 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
* a sender is idle if it has not sent anything over a 2-RTT-period.
* For consistency with X and X_recv, min_rate is also scaled by 2^6.
*/
- if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
+ if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
min_rate = rfc3390_initial_rate(sk);
- min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
+ min_rate = max(min_rate, 2 * hc->tx_x_recv);
}
- if (hctx->ccid3hctx_p > 0) {
+ if (hc->tx_p > 0) {
- hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
- min_rate);
- hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
- (((__u64)hctx->ccid3hctx_s) << 6) /
- TFRC_T_MBI);
+ hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
+ hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
- } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
- - (s64)hctx->ccid3hctx_rtt >= 0) {
+ } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
- hctx->ccid3hctx_x =
- max(min(2 * hctx->ccid3hctx_x, min_rate),
- scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
- hctx->ccid3hctx_rtt));
- hctx->ccid3hctx_t_ld = now;
+ hc->tx_x = min(2 * hc->tx_x, min_rate);
+ hc->tx_x = max(hc->tx_x,
+ scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
+ hc->tx_t_ld = now;
}
- if (hctx->ccid3hctx_x != old_x) {
+ if (hc->tx_x != old_x) {
ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
- "X_recv=%u\n", (unsigned)(old_x >> 6),
- (unsigned)(hctx->ccid3hctx_x >> 6),
- hctx->ccid3hctx_x_calc,
- (unsigned)(hctx->ccid3hctx_x_recv >> 6));
+ "X_recv=%u\n", (unsigned int)(old_x >> 6),
+ (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6));
- ccid3_update_send_interval(hctx);
+ ccid3_update_send_interval(hc);
}
}
-/*
- * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1)
+/**
+ * ccid3_hc_tx_update_s - Track the mean packet size `s'
* @len: DCCP packet payload size in bytes
+ *
+ * cf. RFC 4342, 5.3 and RFC 3448, 4.1
*/
-static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
+static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
{
- const u16 old_s = hctx->ccid3hctx_s;
+ const u16 old_s = hc->tx_s;
- hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
+ hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
- if (hctx->ccid3hctx_s != old_s)
- ccid3_update_send_interval(hctx);
+ if (hc->tx_s != old_s)
+ ccid3_update_send_interval(hc);
}
/*
* Update Window Counter using the algorithm from [RFC 4342, 8.1].
- * The algorithm is not applicable if RTT < 4 microseconds.
+ * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
*/
-static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
+static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
ktime_t now)
{
- u32 quarter_rtts;
-
- if (unlikely(hctx->ccid3hctx_rtt < 4)) /* avoid divide-by-zero */
- return;
-
- quarter_rtts = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
- quarter_rtts /= hctx->ccid3hctx_rtt / 4;
+ u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count),
+ quarter_rtts = (4 * delta) / hc->tx_rtt;
if (quarter_rtts > 0) {
- hctx->ccid3hctx_t_last_win_count = now;
- hctx->ccid3hctx_last_win_count += min_t(u32, quarter_rtts, 5);
- hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */
+ hc->tx_t_last_win_count = now;
+ hc->tx_last_win_count += min(quarter_rtts, 5U);
+ hc->tx_last_win_count &= 0xF; /* mod 16 */
}
}
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
unsigned long t_nfb = USEC_PER_SEC / 5;
bh_lock_sock(sk);
@@ -226,25 +208,27 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
goto restart_timer;
}
- ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state));
+ ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
+ ccid3_tx_state_name(hc->tx_state));
- if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+ /* Ignore and do not restart after leaving the established state */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
goto out;
+ /* Reset feedback state to "no feedback received" */
+ if (hc->tx_state == TFRC_SSTATE_FBACK)
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+
/*
* Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+ * RTO is 0 if and only if no feedback has been received yet.
*/
- if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */
- hctx->ccid3hctx_p == 0) {
+ if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
/* halve send rate directly */
- hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
- (((__u64)hctx->ccid3hctx_s) << 6) /
- TFRC_T_MBI);
- ccid3_update_send_interval(hctx);
+ hc->tx_x = max(hc->tx_x / 2,
+ (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+ ccid3_update_send_interval(hc);
} else {
/*
* Modify the cached value of X_recv
@@ -256,49 +240,47 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
*
* Note that X_recv is scaled by 2^6 while X_calc is not
*/
- BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
-
- if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
- hctx->ccid3hctx_x_recv =
- max(hctx->ccid3hctx_x_recv / 2,
- (((__u64)hctx->ccid3hctx_s) << 6) /
- (2 * TFRC_T_MBI));
+ if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
+ hc->tx_x_recv =
+ max(hc->tx_x_recv / 2,
+ (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
else {
- hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
- hctx->ccid3hctx_x_recv <<= 4;
+ hc->tx_x_recv = hc->tx_x_calc;
+ hc->tx_x_recv <<= 4;
}
ccid3_hc_tx_update_x(sk, NULL);
}
ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
- (unsigned long long)hctx->ccid3hctx_x);
+ (unsigned long long)hc->tx_x);
/*
* Set new timeout for the nofeedback timer.
* See comments in packet_recv() regarding the value of t_RTO.
*/
- if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */
+ if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
t_nfb = TFRC_INITIAL_TIMEOUT;
else
- t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
+ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
restart_timer:
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer,
jiffies + usecs_to_jiffies(t_nfb));
out:
bh_unlock_sock(sk);
sock_put(sk);
}
-/*
- * returns
- * > 0: delay (in msecs) that should pass before actually sending
- * = 0: can send immediately
- * < 0: error condition; do not send packet
+/**
+ * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ *
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
*/
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
ktime_t now = ktime_get_real();
s64 delay;
@@ -310,18 +292,16 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
if (unlikely(skb->len == 0))
return -EBADMSG;
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_SENT:
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- (jiffies +
- usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
- hctx->ccid3hctx_last_win_count = 0;
- hctx->ccid3hctx_t_last_win_count = now;
+ if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
+ usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
+ hc->tx_last_win_count = 0;
+ hc->tx_t_last_win_count = now;
/* Set t_0 for initial packet */
- hctx->ccid3hctx_t_nom = now;
+ hc->tx_t_nom = now;
- hctx->ccid3hctx_s = skb->len;
+ hc->tx_s = skb->len;
/*
* Use initial RTT sample when available: recommended by erratum
@@ -330,123 +310,108 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
*/
if (dp->dccps_syn_rtt) {
ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
- hctx->ccid3hctx_rtt = dp->dccps_syn_rtt;
- hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
- hctx->ccid3hctx_t_ld = now;
+ hc->tx_rtt = dp->dccps_syn_rtt;
+ hc->tx_x = rfc3390_initial_rate(sk);
+ hc->tx_t_ld = now;
} else {
- /* Sender does not have RTT sample: X_pps = 1 pkt/sec */
- hctx->ccid3hctx_x = hctx->ccid3hctx_s;
- hctx->ccid3hctx_x <<= 6;
+ /*
+ * Sender does not have RTT sample:
+ * - set fallback RTT (RFC 4340, 3.4) since a RTT value
+ * is needed in several parts (e.g. window counter);
+ * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
+ */
+ hc->tx_rtt = DCCP_FALLBACK_RTT;
+ hc->tx_x = hc->tx_s;
+ hc->tx_x <<= 6;
}
- ccid3_update_send_interval(hctx);
+ ccid3_update_send_interval(hc);
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- break;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
- delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
+
+ } else {
+ delay = ktime_us_delta(hc->tx_t_nom, now);
ccid3_pr_debug("delay=%ld\n", (long)delay);
/*
- * Scheduling of packet transmissions [RFC 3448, 4.6]
+ * Scheduling of packet transmissions (RFC 5348, 8.3)
*
* if (t_now > t_nom - delta)
* // send the packet now
* else
* // send the packet in (t_nom - t_now) milliseconds.
*/
- if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
- return (u32)delay / 1000L;
+ if (delay >= TFRC_T_DELTA)
+ return (u32)delay / USEC_PER_MSEC;
- ccid3_hc_tx_update_win_count(hctx, now);
- break;
- case TFRC_SSTATE_TERM:
- DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
- return -EINVAL;
+ ccid3_hc_tx_update_win_count(hc, now);
}
/* prepare to send now (add options etc.) */
dp->dccps_hc_tx_insert_options = 1;
- DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
+ DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count;
/* set the nominal send time for the next following packet */
- hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
- hctx->ccid3hctx_t_ipi);
- return 0;
+ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
+ return CCID_PACKET_SEND_AT_ONCE;
}
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
- unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- ccid3_hc_tx_update_s(hctx, len);
+ ccid3_hc_tx_update_s(hc, len);
- if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
+ if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss))
DCCP_CRIT("packet history - out of memory!");
}
static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_hist_entry *acked;
ktime_t now;
unsigned long t_nfb;
- u32 pinv, r_sample;
+ u32 r_sample;
/* we are only interested in ACKs */
if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
return;
- /* ... and only in the established state */
- if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
- hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
- return;
-
- opt_recv = &hctx->ccid3hctx_options_received;
- now = ktime_get_real();
-
- /* Estimate RTT from history if ACK number is valid */
- r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
- DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
- if (r_sample == 0) {
- DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
- dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ /*
+ * Locate the acknowledged packet in the TX history.
+ *
+ * Returning "entry not found" here can for instance happen when
+ * - the host has not sent out anything (e.g. a passive server),
+ * - the Ack is outdated (packet with higher Ack number was received),
+ * - it is a bogus Ack (for a packet not sent on this connection).
+ */
+ acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+ if (acked == NULL)
return;
- }
+ /* For the sake of RTT sampling, ignore/remove all older entries */
+ tfrc_tx_hist_purge(&acked->next);
- /* Update receive rate in units of 64 * bytes/second */
- hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
- hctx->ccid3hctx_x_recv <<= 6;
+ /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+ now = ktime_get_real();
+ r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
+ hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
- /* Update loss event rate (which is scaled by 1e6) */
- pinv = opt_recv->ccid3or_loss_event_rate;
- if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
- hctx->ccid3hctx_p = 0;
- else /* can not exceed 100% */
- hctx->ccid3hctx_p = scaled_div(1, pinv);
- /*
- * Validate new RTT sample and update moving average
- */
- r_sample = dccp_sample_rtt(sk, r_sample);
- hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
/*
* Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
*/
- if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+ if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
- if (hctx->ccid3hctx_t_rto == 0) {
+ if (hc->tx_t_rto == 0) {
/*
* Initial feedback packet: Larger Initial Windows (4.2)
*/
- hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
- hctx->ccid3hctx_t_ld = now;
+ hc->tx_x = rfc3390_initial_rate(sk);
+ hc->tx_t_ld = now;
- ccid3_update_send_interval(hctx);
+ ccid3_update_send_interval(hc);
goto done_computing_x;
- } else if (hctx->ccid3hctx_p == 0) {
+ } else if (hc->tx_p == 0) {
/*
* First feedback after nofeedback timer expiry (4.3)
*/
@@ -455,25 +420,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
}
/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
- if (hctx->ccid3hctx_p > 0)
- hctx->ccid3hctx_x_calc =
- tfrc_calc_x(hctx->ccid3hctx_s,
- hctx->ccid3hctx_rtt,
- hctx->ccid3hctx_p);
+ if (hc->tx_p > 0)
+ hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
ccid3_hc_tx_update_x(sk, &now);
done_computing_x:
ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
"p=%u, X_calc=%u, X_recv=%u, X=%u\n",
- dccp_role(sk),
- sk, hctx->ccid3hctx_rtt, r_sample,
- hctx->ccid3hctx_s, hctx->ccid3hctx_p,
- hctx->ccid3hctx_x_calc,
- (unsigned)(hctx->ccid3hctx_x_recv >> 6),
- (unsigned)(hctx->ccid3hctx_x >> 6));
+ dccp_role(sk), sk, hc->tx_rtt, r_sample,
+ hc->tx_s, hc->tx_p, hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6),
+ (unsigned int)(hc->tx_x >> 6));
/* unschedule no feedback timer */
- sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+ sk_stop_timer(sk, &hc->tx_no_feedback_timer);
/*
* As we have calculated new ipi, delta, t_nom it is possible
@@ -482,143 +442,109 @@ done_computing_x:
sk->sk_write_space(sk);
/*
- * Update timeout interval for the nofeedback timer.
- * We use a configuration option to increase the lower bound.
- * This can help avoid triggering the nofeedback timer too
- * often ('spinning') on LANs with small RTTs.
+ * Update timeout interval for the nofeedback timer. In order to control
+ * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+ * tunable RTAX_RTO_MIN value as the lower bound.
*/
- hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
- (CONFIG_IP_DCCP_CCID3_RTO *
- (USEC_PER_SEC / 1000)));
+ hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+ USEC_PER_SEC/HZ * tcp_rto_min(sk));
/*
* Schedule no feedback timer to expire in
* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
*/
- t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
+ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
"expire in %lu jiffies (%luus)\n",
- dccp_role(sk),
- sk, usecs_to_jiffies(t_nfb), t_nfb);
+ dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer,
jiffies + usecs_to_jiffies(t_nfb));
}
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
- unsigned char len, u16 idx,
- unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
{
- int rc = 0;
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
__be32 opt_val;
- opt_recv = &hctx->ccid3hctx_options_received;
-
- if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
- opt_recv->ccid3or_seqno = dp->dccps_gsr;
- opt_recv->ccid3or_loss_event_rate = ~0;
- opt_recv->ccid3or_loss_intervals_idx = 0;
- opt_recv->ccid3or_loss_intervals_len = 0;
- opt_recv->ccid3or_receive_rate = 0;
- }
-
switch (option) {
+ case TFRC_OPT_RECEIVE_RATE:
case TFRC_OPT_LOSS_EVENT_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s(%p), invalid len %d "
- "for TFRC_OPT_LOSS_EVENT_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
- ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_event_rate);
+ /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+ if (packet_type == DCCP_PKT_DATA)
+ break;
+ if (unlikely(optlen != 4)) {
+ DCCP_WARN("%s(%p), invalid len %d for %u\n",
+ dccp_role(sk), sk, optlen, option);
+ return -EINVAL;
}
- break;
- case TFRC_OPT_LOSS_INTERVALS:
- opt_recv->ccid3or_loss_intervals_idx = idx;
- opt_recv->ccid3or_loss_intervals_len = len;
- ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_intervals_idx,
- opt_recv->ccid3or_loss_intervals_len);
- break;
- case TFRC_OPT_RECEIVE_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s(%p), invalid len %d "
- "for TFRC_OPT_RECEIVE_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+ opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+ if (option == TFRC_OPT_RECEIVE_RATE) {
+ /* Receive Rate is kept in units of 64 bytes/second */
+ hc->tx_x_recv = opt_val;
+ hc->tx_x_recv <<= 6;
+
ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_receive_rate);
+ dccp_role(sk), sk, opt_val);
+ } else {
+ /* Update the fixpoint Loss Event Rate fraction */
+ hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+ ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
}
- break;
}
-
- return rc;
+ return 0;
}
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
- struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
+ struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
- hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
- hctx->ccid3hctx_hist = NULL;
- setup_timer(&hctx->ccid3hctx_no_feedback_timer,
+ hc->tx_state = TFRC_SSTATE_NO_SENT;
+ hc->tx_hist = NULL;
+ setup_timer(&hc->tx_no_feedback_timer,
ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
-
return 0;
}
static void ccid3_hc_tx_exit(struct sock *sk)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
- sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
+ sk_stop_timer(sk, &hc->tx_no_feedback_timer);
+ tfrc_tx_hist_purge(&hc->tx_hist);
}
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
- struct ccid3_hc_tx_sock *hctx;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- hctx = ccid3_hc_tx_sk(sk);
- info->tcpi_rto = hctx->ccid3hctx_t_rto;
- info->tcpi_rtt = hctx->ccid3hctx_rtt;
+ info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+ info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
}
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_tx_sock *hctx;
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_info tfrc;
const void *val;
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- hctx = ccid3_hc_tx_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(hctx->ccid3hctx_tfrc))
+ if (len < sizeof(tfrc))
return -EINVAL;
- len = sizeof(hctx->ccid3hctx_tfrc);
- val = &hctx->ccid3hctx_tfrc;
+ memset(&tfrc, 0, sizeof(tfrc));
+ tfrc.tfrctx_x = hc->tx_x;
+ tfrc.tfrctx_x_recv = hc->tx_x_recv;
+ tfrc.tfrctx_x_calc = hc->tx_x_calc;
+ tfrc.tfrctx_rtt = hc->tx_rtt;
+ tfrc.tfrctx_p = hc->tx_p;
+ tfrc.tfrctx_rto = hc->tx_t_rto;
+ tfrc.tfrctx_ipi = hc->tx_t_ipi;
+ len = sizeof(tfrc);
+ val = &tfrc;
break;
default:
return -ENOPROTOOPT;
@@ -645,10 +571,9 @@ enum ccid3_fback_type {
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
- static char *ccid3_rx_state_names[] = {
+ static const char *const ccid3_rx_state_names[] = {
[TFRC_RSTATE_NO_DATA] = "NO_DATA",
[TFRC_RSTATE_DATA] = "DATA",
- [TFRC_RSTATE_TERM] = "TERM",
};
return ccid3_rx_state_names[state];
@@ -658,34 +583,29 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
static void ccid3_hc_rx_set_state(struct sock *sk,
enum ccid3_hc_rx_states state)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ enum ccid3_hc_rx_states oldstate = hc->rx_state;
ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
ccid3_rx_state_name(state));
WARN_ON(state == oldstate);
- hcrx->ccid3hcrx_state = state;
+ hc->rx_state = state;
}
static void ccid3_hc_rx_send_feedback(struct sock *sk,
const struct sk_buff *skb,
enum ccid3_fback_type fbtype)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- ktime_t now;
+ ktime_t now = ktime_get_real();
s64 delta = 0;
- if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
- return;
-
- now = ktime_get_real();
-
switch (fbtype) {
case CCID3_FBACK_INITIAL:
- hcrx->ccid3hcrx_x_recv = 0;
- hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */
+ hc->rx_x_recv = 0;
+ hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */
break;
case CCID3_FBACK_PARAM_CHANGE:
/*
@@ -698,27 +618,26 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
* the number of bytes since last feedback.
* This is a safe fallback, since X is bounded above by X_calc.
*/
- if (hcrx->ccid3hcrx_x_recv > 0)
+ if (hc->rx_x_recv > 0)
break;
/* fall through */
case CCID3_FBACK_PERIODIC:
- delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
+ delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
if (delta <= 0)
DCCP_BUG("delta (%ld) <= 0", (long)delta);
else
- hcrx->ccid3hcrx_x_recv =
- scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
+ hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
break;
default:
return;
}
ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
- hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
+ hc->rx_x_recv, hc->rx_pinv);
- hcrx->ccid3hcrx_tstamp_last_feedback = now;
- hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval;
- hcrx->ccid3hcrx_bytes_recv = 0;
+ hc->rx_tstamp_last_feedback = now;
+ hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval;
+ hc->rx_bytes_recv = 0;
dp->dccps_hc_rx_insert_options = 1;
dccp_send_ack(sk);
@@ -726,30 +645,29 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
- const struct ccid3_hc_rx_sock *hcrx;
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
__be32 x_recv, pinv;
if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
return 0;
- hcrx = ccid3_hc_rx_sk(sk);
-
if (dccp_packet_without_ack(skb))
return 0;
- x_recv = htonl(hcrx->ccid3hcrx_x_recv);
- pinv = htonl(hcrx->ccid3hcrx_pinv);
+ x_recv = htonl(hc->rx_x_recv);
+ pinv = htonl(hc->rx_pinv);
- if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
+ if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
&pinv, sizeof(pinv)) ||
- dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
+ dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
&x_recv, sizeof(x_recv)))
return -1;
return 0;
}
-/** ccid3_first_li - Implements [RFC 3448, 6.3.1]
+/**
+ * ccid3_first_li - Implements [RFC 5348, 6.3.1]
*
* Determine the length of the first loss interval via inverse lookup.
* Assume that X_recv can be computed by the throughput equation
@@ -760,26 +678,27 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
*/
static u32 ccid3_first_li(struct sock *sk)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
u32 x_recv, p, delta;
u64 fval;
- if (hcrx->ccid3hcrx_rtt == 0) {
+ if (hc->rx_rtt == 0) {
DCCP_WARN("No RTT estimate available, using fallback RTT\n");
- hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
+ hc->rx_rtt = DCCP_FALLBACK_RTT;
}
- delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
- x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
+ delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback));
+ x_recv = scaled_div32(hc->rx_bytes_recv, delta);
if (x_recv == 0) { /* would also trigger divide-by-zero */
DCCP_WARN("X_recv==0\n");
- if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
+ if (hc->rx_x_recv == 0) {
DCCP_BUG("stored value of X_recv is zero");
return ~0U;
}
+ x_recv = hc->rx_x_recv;
}
- fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
+ fval = scaled_div(hc->rx_s, hc->rx_rtt);
fval = scaled_div32(fval, x_recv);
p = tfrc_calc_x_reverse_lookup(fval);
@@ -791,19 +710,19 @@ static u32 ccid3_first_li(struct sock *sk)
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
- const u32 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
+ const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
const bool is_data_packet = dccp_data_packet(skb);
- if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
+ if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
if (is_data_packet) {
const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
do_feedback = CCID3_FBACK_INITIAL;
ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
- hcrx->ccid3hcrx_s = payload;
+ hc->rx_s = payload;
/*
- * Not necessary to update ccid3hcrx_bytes_recv here,
+ * Not necessary to update rx_bytes_recv here,
* since X_recv = 0 for the first feedback packet (cf.
* RFC 3448, 6.3) -- gerrit
*/
@@ -811,7 +730,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
goto update_records;
}
- if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
+ if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
return; /* done receiving */
if (is_data_packet) {
@@ -819,23 +738,21 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
/*
* Update moving-average of s and the sum of received payload bytes
*/
- hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
- hcrx->ccid3hcrx_bytes_recv += payload;
+ hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
+ hc->rx_bytes_recv += payload;
}
/*
- * Handle pending losses and otherwise check for new loss
+ * Perform loss detection and handle pending losses
*/
- if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist) &&
- tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist,
- &hcrx->ccid3hcrx_li_hist,
- skb, ndp, ccid3_first_li, sk) ) {
+ if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
+ skb, ndp, ccid3_first_li, sk)) {
do_feedback = CCID3_FBACK_PARAM_CHANGE;
goto done_receiving;
}
- if (tfrc_rx_hist_new_loss_indicated(&hcrx->ccid3hcrx_hist, skb, ndp))
- goto update_records;
+ if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
+ return; /* done receiving */
/*
* Handle data packets: RTT sampling and monitoring p
@@ -843,17 +760,17 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
if (unlikely(!is_data_packet))
goto update_records;
- if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
- const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
+ if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
+ const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
/*
* Empty loss history: no loss so far, hence p stays 0.
* Sample RTT values, since an RTT estimate is required for the
* computation of p when the first loss occurs; RFC 3448, 6.3.1.
*/
if (sample != 0)
- hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
+ hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
- } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
+ } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
/*
* Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
* has decreased (resp. p has increased), send feedback now.
@@ -864,11 +781,11 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
/*
* Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
*/
- if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
+ if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
do_feedback = CCID3_FBACK_PERIODIC;
update_records:
- tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
+ tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
done_receiving:
if (do_feedback)
@@ -877,57 +794,42 @@ done_receiving:
static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
{
- struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
+ struct ccid3_hc_rx_sock *hc = ccid_priv(ccid);
- hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
- tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
- return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
+ hc->rx_state = TFRC_RSTATE_NO_DATA;
+ tfrc_lh_init(&hc->rx_li_hist);
+ return tfrc_rx_hist_alloc(&hc->rx_hist);
}
static void ccid3_hc_rx_exit(struct sock *sk)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
- tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
- tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
+ tfrc_rx_hist_purge(&hc->rx_hist);
+ tfrc_lh_cleanup(&hc->rx_li_hist);
}
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct ccid3_hc_rx_sock *hcrx;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- hcrx = ccid3_hc_rx_sk(sk);
- info->tcpi_ca_state = hcrx->ccid3hcrx_state;
+ info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
+ info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
}
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_rx_sock *hcrx;
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct tfrc_rx_info rx_info;
const void *val;
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- hcrx = ccid3_hc_rx_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_CCID_RX_INFO:
if (len < sizeof(rx_info))
return -EINVAL;
- rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
- rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt;
- rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
- scaled_div(1, hcrx->ccid3hcrx_pinv);
+ rx_info.tfrcrx_x_recv = hc->rx_x_recv;
+ rx_info.tfrcrx_rtt = hc->rx_rtt;
+ rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
len = sizeof(rx_info);
val = &rx_info;
break;
@@ -941,10 +843,9 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
return 0;
}
-static struct ccid_operations ccid3 = {
+struct ccid_operations ccid3_ops = {
.ccid_id = DCCPC_CCID3,
.ccid_name = "TCP-Friendly Rate Control",
- .ccid_owner = THIS_MODULE,
.ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
.ccid_hc_tx_init = ccid3_hc_tx_init,
.ccid_hc_tx_exit = ccid3_hc_tx_exit,
@@ -964,24 +865,6 @@ static struct ccid_operations ccid3 = {
};
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-module_param(ccid3_debug, bool, 0444);
-MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
+module_param(ccid3_debug, bool, 0644);
+MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
#endif
-
-static __init int ccid3_module_init(void)
-{
- return ccid_register(&ccid3);
-}
-module_init(ccid3_module_init);
-
-static __exit void ccid3_module_exit(void)
-{
- ccid_unregister(&ccid3);
-}
-module_exit(ccid3_module_exit);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
- "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
-MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 49ca32bd7e7..1a9933c2967 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -1,6 +1,4 @@
/*
- * net/dccp/ccids/ccid3.h
- *
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
*
@@ -44,128 +42,119 @@
#include "lib/tfrc.h"
#include "../ccid.h"
-/* Two seconds as per RFC 3448 4.2 */
+/* Two seconds as per RFC 5348, 4.2 */
#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
-
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
+#endif
+
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
TFRC_OPT_LOSS_INTERVALS = 193,
TFRC_OPT_RECEIVE_RATE = 194,
};
-struct ccid3_options_received {
- u64 ccid3or_seqno:48,
- ccid3or_loss_intervals_idx:16;
- u16 ccid3or_loss_intervals_len;
- u32 ccid3or_loss_event_rate;
- u32 ccid3or_receive_rate;
-};
-
/* TFRC sender states */
enum ccid3_hc_tx_states {
TFRC_SSTATE_NO_SENT = 1,
TFRC_SSTATE_NO_FBACK,
TFRC_SSTATE_FBACK,
- TFRC_SSTATE_TERM,
};
-/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
- *
- * @ccid3hctx_x - Current sending rate in 64 * bytes per second
- * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second
- * @ccid3hctx_x_calc - Calculated rate in bytes per second
- * @ccid3hctx_rtt - Estimate of current round trip time in usecs
- * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
- * @ccid3hctx_s - Packet size in bytes
- * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs
- * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
- * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
- * @ccid3hctx_last_win_count - Last window counter sent
- * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
- * with last_win_count value sent
- * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
- * @ccid3hctx_t_ld - Time last doubled during slow start
- * @ccid3hctx_t_nom - Nominal send time of next packet
- * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
- * @ccid3hctx_hist - Packet history
- * @ccid3hctx_options_received - Parsed set of retrieved options
+/**
+ * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
+ * @tx_x: Current sending rate in 64 * bytes per second
+ * @tx_x_recv: Receive rate in 64 * bytes per second
+ * @tx_x_calc: Calculated rate in bytes per second
+ * @tx_rtt: Estimate of current round trip time in usecs
+ * @tx_p: Current loss event rate (0-1) scaled by 1000000
+ * @tx_s: Packet size in bytes
+ * @tx_t_rto: Nofeedback Timer setting in usecs
+ * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs
+ * @tx_state: Sender state, one of %ccid3_hc_tx_states
+ * @tx_last_win_count: Last window counter sent
+ * @tx_t_last_win_count: Timestamp of earliest packet
+ * with last_win_count value sent
+ * @tx_no_feedback_timer: Handle to no feedback timer
+ * @tx_t_ld: Time last doubled during slow start
+ * @tx_t_nom: Nominal send time of next packet
+ * @tx_hist: Packet history
*/
struct ccid3_hc_tx_sock {
- struct tfrc_tx_info ccid3hctx_tfrc;
-#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
-#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
-#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
-#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
-#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
-#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
-#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
- u16 ccid3hctx_s;
- enum ccid3_hc_tx_states ccid3hctx_state:8;
- u8 ccid3hctx_last_win_count;
- ktime_t ccid3hctx_t_last_win_count;
- struct timer_list ccid3hctx_no_feedback_timer;
- ktime_t ccid3hctx_t_ld;
- ktime_t ccid3hctx_t_nom;
- u32 ccid3hctx_delta;
- struct tfrc_tx_hist_entry *ccid3hctx_hist;
- struct ccid3_options_received ccid3hctx_options_received;
+ u64 tx_x;
+ u64 tx_x_recv;
+ u32 tx_x_calc;
+ u32 tx_rtt;
+ u32 tx_p;
+ u32 tx_t_rto;
+ u32 tx_t_ipi;
+ u16 tx_s;
+ enum ccid3_hc_tx_states tx_state:8;
+ u8 tx_last_win_count;
+ ktime_t tx_t_last_win_count;
+ struct timer_list tx_no_feedback_timer;
+ ktime_t tx_t_ld;
+ ktime_t tx_t_nom;
+ struct tfrc_tx_hist_entry *tx_hist;
};
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
{
- struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
- BUG_ON(hctx == NULL);
- return hctx;
+ struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
+ BUG_ON(hctx == NULL);
+ return hctx;
}
/* TFRC receiver states */
enum ccid3_hc_rx_states {
TFRC_RSTATE_NO_DATA = 1,
TFRC_RSTATE_DATA,
- TFRC_RSTATE_TERM = 127,
};
-/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- *
- * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3)
- * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard)
- * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4)
- * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1)
- * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states
- * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes
- * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
- * @ccid3hcrx_rtt - Receiver estimate of RTT
- * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
- * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
- * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
- * @ccid3hcrx_li_hist - Loss Interval database
- * @ccid3hcrx_s - Received packet size in bytes
- * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
+/**
+ * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
+ * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
+ * @rx_state: Receiver state, one of %ccid3_hc_rx_states
+ * @rx_bytes_recv: Total sum of DCCP payload bytes
+ * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
+ * @rx_rtt: Receiver estimate of RTT
+ * @rx_tstamp_last_feedback: Time at which last feedback was sent
+ * @rx_hist: Packet history (loss detection + RTT sampling)
+ * @rx_li_hist: Loss Interval database
+ * @rx_s: Received packet size in bytes
+ * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
*/
struct ccid3_hc_rx_sock {
- u8 ccid3hcrx_last_counter:4;
- enum ccid3_hc_rx_states ccid3hcrx_state:8;
- u32 ccid3hcrx_bytes_recv;
- u32 ccid3hcrx_x_recv;
- u32 ccid3hcrx_rtt;
- ktime_t ccid3hcrx_tstamp_last_feedback;
- struct tfrc_rx_hist ccid3hcrx_hist;
- struct tfrc_loss_hist ccid3hcrx_li_hist;
- u16 ccid3hcrx_s;
-#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
+ u8 rx_last_counter:4;
+ enum ccid3_hc_rx_states rx_state:8;
+ u32 rx_bytes_recv;
+ u32 rx_x_recv;
+ u32 rx_rtt;
+ ktime_t rx_tstamp_last_feedback;
+ struct tfrc_rx_hist rx_hist;
+ struct tfrc_loss_hist rx_li_hist;
+ u16 rx_s;
+#define rx_pinv rx_li_hist.i_mean
};
static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
{
- struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
- BUG_ON(hcrx == NULL);
- return hcrx;
+ struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
+ BUG_ON(hcrx == NULL);
+ return hcrx;
}
#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
deleted file mode 100644
index 68c93e3d89d..00000000000
--- a/net/dccp/ccids/lib/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
-
-dccp_tfrc_lib-y := tfrc.o tfrc_equation.o packet_history.o loss_interval.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 849e181e698..57f9fd78c4d 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -1,6 +1,4 @@
/*
- * net/dccp/ccids/lib/loss_interval.c
- *
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
@@ -21,7 +19,7 @@ static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
/* implements LIFO semantics on the array */
static inline u8 LIH_INDEX(const u8 ctr)
{
- return (LIH_SIZE - 1 - (ctr % LIH_SIZE));
+ return LIH_SIZE - 1 - (ctr % LIH_SIZE);
}
/* the `counter' index always points at the next entry to be populated */
@@ -60,14 +58,16 @@ void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
lh->ring[LIH_INDEX(lh->counter)] = NULL;
}
}
-EXPORT_SYMBOL_GPL(tfrc_lh_cleanup);
static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
{
u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
- for (i=0; i <= k; i++) {
+ if (k <= 0)
+ return;
+
+ for (i = 0; i <= k; i++) {
i_i = tfrc_lh_get_interval(lh, i);
if (i < k) {
@@ -78,7 +78,6 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
i_tot1 += i_i * tfrc_lh_weights[i-1];
}
- BUG_ON(w_tot == 0);
lh->i_mean = max(i_tot0, i_tot1) / w_tot;
}
@@ -90,14 +89,14 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
{
struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
u32 old_i_mean = lh->i_mean;
- s64 length;
+ s64 len;
if (cur == NULL) /* not initialised */
return 0;
- length = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq);
+ len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
- if (length - cur->li_length <= 0) /* duplicate or reordered */
+ if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
return 0;
if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
@@ -114,12 +113,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
return 0;
- cur->li_length = length;
+ cur->li_length = len;
tfrc_lh_calc_i_mean(lh);
- return (lh->i_mean < old_i_mean);
+ return lh->i_mean < old_i_mean;
}
-EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
@@ -129,11 +127,13 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
(cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
}
-/** tfrc_lh_interval_add - Insert new record into the Loss Interval database
+/**
+ * tfrc_lh_interval_add - Insert new record into the Loss Interval database
* @lh: Loss Interval database
* @rh: Receive history containing a fresh loss event
* @calc_first_li: Caller-dependent routine to compute length of first interval
* @sk: Used by @calc_first_li in caller-specific way (subtyping)
+ *
* Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
*/
int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
@@ -159,7 +159,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
else {
cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
new->li_length = dccp_delta_seqno(new->li_seqno,
- tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno);
+ tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
if (lh->counter > (2*LIH_SIZE))
lh->counter -= LIH_SIZE;
@@ -167,7 +167,6 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
}
return 1;
}
-EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
int __init tfrc_li_init(void)
{
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 246018a3b26..57f631a86cc 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -1,8 +1,6 @@
#ifndef _DCCP_LI_HIST_
#define _DCCP_LI_HIST_
/*
- * net/dccp/ccids/lib/loss_interval.h
- *
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
@@ -67,9 +65,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
struct tfrc_rx_hist;
-extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
- u32 (*first_li)(struct sock *), struct sock *);
-extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
-extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
+int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+ u32 (*first_li)(struct sock *), struct sock *);
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 20af1a69342..08df7a3acb3 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -1,6 +1,4 @@
/*
- * net/dccp/packet_history.c
- *
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
*
@@ -40,18 +38,6 @@
#include "packet_history.h"
#include "../../dccp.h"
-/**
- * tfrc_tx_hist_entry - Simple singly-linked TX history list
- * @next: next oldest entry (LIFO order)
- * @seqno: sequence number of this entry
- * @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
- struct tfrc_tx_hist_entry *next;
- u64 seqno;
- ktime_t stamp;
-};
-
/*
* Transmitter History Routines
*/
@@ -73,15 +59,6 @@ void tfrc_tx_packet_history_exit(void)
}
}
-static struct tfrc_tx_hist_entry *
- tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
- while (head != NULL && head->seqno != seqno)
- head = head->next;
-
- return head;
-}
-
int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
{
struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -94,7 +71,6 @@ int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
*headp = entry;
return 0;
}
-EXPORT_SYMBOL_GPL(tfrc_tx_hist_add);
void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
{
@@ -109,29 +85,9 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
*headp = NULL;
}
-EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
-
-u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
- const ktime_t now)
-{
- u32 rtt = 0;
- struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
-
- if (packet != NULL) {
- rtt = ktime_us_delta(now, packet->stamp);
- /*
- * Garbage-collect older (irrelevant) entries:
- */
- tfrc_tx_hist_purge(&packet->next);
- }
-
- return rtt;
-}
-EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
-
/*
- * Receiver History Routines
+ * Receiver History Routines
*/
static struct kmem_cache *tfrc_rx_hist_slab;
@@ -153,7 +109,7 @@ void tfrc_rx_packet_history_exit(void)
static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
const struct sk_buff *skb,
- const u32 ndp)
+ const u64 ndp)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -166,13 +122,12 @@ static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
const struct sk_buff *skb,
- const u32 ndp)
+ const u64 ndp)
{
struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_add_packet);
/* has the packet contained in skb been seen before? */
int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
@@ -189,7 +144,6 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
return 0;
}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
{
@@ -206,31 +160,39 @@ static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
*
* In the descriptions, `Si' refers to the sequence number of entry number i,
* whose NDP count is `Ni' (lower case is used for variables).
- * Note: All __after_loss functions expect that a test against duplicates has
- * been performed already: the seqno of the skb must not be less than the
- * seqno of loss_prev; and it must not equal that of any valid hist_entry.
+ * Note: All __xxx_loss functions expect that a test against duplicates has been
+ * performed already: the seqno of the skb must not be less than the seqno
+ * of loss_prev; and it must not equal that of any valid history entry.
*/
+static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
+{
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
+ h->loss_count = 1;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
+ }
+}
+
static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
{
u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
s2 = DCCP_SKB_CB(skb)->dccpd_seq;
- int n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp,
- d12 = dccp_delta_seqno(s1, s2), d2;
- if (d12 > 0) { /* S1 < S2 */
+ if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */
h->loss_count = 2;
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
return;
}
/* S0 < S2 < S1 */
- d2 = dccp_delta_seqno(s0, s2);
- if (d2 == 1 || n2 >= d2) { /* S2 is direct successor of S0 */
- int d21 = -d12;
+ if (dccp_loss_free(s0, s2, n2)) {
+ u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
- if (d21 == 1 || n1 >= d21) {
+ if (dccp_loss_free(s2, s1, n1)) {
/* hole is filled: S0, S2, and S1 are consecutive */
h->loss_count = 0;
h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -238,9 +200,9 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
/* gap between S2 and S1: just update loss_prev */
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
- } else { /* hole between S0 and S2 */
+ } else { /* gap between S0 and S2 */
/*
- * Reorder history to insert S2 between S0 and s1
+ * Reorder history to insert S2 between S0 and S1
*/
tfrc_rx_hist_swap(h, 0, 3);
h->loss_start = tfrc_rx_hist_index(h, 3);
@@ -256,22 +218,18 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
s3 = DCCP_SKB_CB(skb)->dccpd_seq;
- int n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp,
- d23 = dccp_delta_seqno(s2, s3), d13, d3, d31;
- if (d23 > 0) { /* S2 < S3 */
+ if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */
h->loss_count = 3;
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
return 1;
}
/* S3 < S2 */
- d13 = dccp_delta_seqno(s1, s3);
- if (d13 > 0) {
+ if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */
/*
- * The sequence number order is S1, S3, S2
- * Reorder history to insert entry between S1 and S2
+ * Reorder history to insert S3 between S1 and S2
*/
tfrc_rx_hist_swap(h, 2, 3);
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
@@ -280,17 +238,15 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
}
/* S0 < S3 < S1 */
- d31 = -d13;
- d3 = dccp_delta_seqno(s0, s3);
- if (d3 == 1 || n3 >= d3) { /* S3 is a successor of S0 */
+ if (dccp_loss_free(s0, s3, n3)) {
+ u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
- if (d31 == 1 || n1 >= d31) {
+ if (dccp_loss_free(s3, s1, n1)) {
/* hole between S0 and S1 filled by S3 */
- int d2 = dccp_delta_seqno(s1, s2),
- n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
+ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
- if (d2 == 1 || n2 >= d2) {
+ if (dccp_loss_free(s1, s2, n2)) {
/* entire hole filled by S0, S3, S1, S2 */
h->loss_start = tfrc_rx_hist_index(h, 2);
h->loss_count = 0;
@@ -307,8 +263,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
}
/*
- * The remaining case: S3 is not a successor of S0.
- * Sequence order is S0, S3, S1, S2; reorder to insert between S0 and S1
+ * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3
+ * Reorder history to insert S3 between S0 and S1.
*/
tfrc_rx_hist_swap(h, 0, 3);
h->loss_start = tfrc_rx_hist_index(h, 3);
@@ -318,33 +274,25 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
return 1;
}
-/* return the signed modulo-2^48 sequence number distance from entry e1 to e2 */
-static s64 tfrc_rx_hist_delta_seqno(struct tfrc_rx_hist *h, u8 e1, u8 e2)
-{
- DCCP_BUG_ON(e1 > h->loss_count || e2 > h->loss_count);
-
- return dccp_delta_seqno(tfrc_rx_hist_entry(h, e1)->tfrchrx_seqno,
- tfrc_rx_hist_entry(h, e2)->tfrchrx_seqno);
-}
-
/* recycle RX history records to continue loss detection if necessary */
static void __three_after_loss(struct tfrc_rx_hist *h)
{
/*
- * The distance between S0 and S1 is always greater than 1 and the NDP
- * count of S1 is smaller than this distance. Otherwise there would
- * have been no loss. Hence it is only necessary to see whether there
- * are further missing data packets between S1/S2 and S2/S3.
+ * At this stage we know already that there is a gap between S0 and S1
+ * (since S0 was the highest sequence number received before detecting
+ * the loss). To recycle the loss record, it is thus only necessary to
+ * check for other possible gaps between S1/S2 and between S2/S3.
*/
- int d2 = tfrc_rx_hist_delta_seqno(h, 1, 2),
- d3 = tfrc_rx_hist_delta_seqno(h, 2, 3),
- n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
+ u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+ s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
+ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
- if (d2 == 1 || n2 >= d2) { /* S2 is successor to S1 */
+ if (dccp_loss_free(s1, s2, n2)) {
- if (d3 == 1 || n3 >= d3) {
- /* S3 is successor of S2: entire hole is filled */
+ if (dccp_loss_free(s2, s3, n3)) {
+ /* no gap between S2 and S3: entire hole is filled */
h->loss_start = tfrc_rx_hist_index(h, 3);
h->loss_count = 0;
} else {
@@ -353,7 +301,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
h->loss_count = 1;
}
- } else { /* gap between S1 and S2 */
+ } else { /* gap between S1 and S2 */
h->loss_start = tfrc_rx_hist_index(h, 1);
h->loss_count = 2;
}
@@ -367,18 +315,24 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
* @ndp: The NDP count belonging to @skb
* @calc_first_li: Caller-dependent computation of first loss interval in @lh
* @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
+ *
* Chooses action according to pending loss, updates LI database when a new
* loss was detected, and does required post-processing. Returns 1 when caller
* should send feedback, 0 otherwise.
+ * Since it also takes care of reordering during loss detection and updates the
+ * records accordingly, the caller should not perform any more RX history
+ * operations when loss_count is greater than 0 after calling this function.
*/
int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
struct tfrc_loss_hist *lh,
- struct sk_buff *skb, u32 ndp,
+ struct sk_buff *skb, const u64 ndp,
u32 (*calc_first_li)(struct sock *), struct sock *sk)
{
int is_new_loss = 0;
- if (h->loss_count == 1) {
+ if (h->loss_count == 0) {
+ __do_track_loss(h, skb, ndp);
+ } else if (h->loss_count == 1) {
__one_after_loss(h, skb, ndp);
} else if (h->loss_count != 2) {
DCCP_BUG("invalid loss_count %d", h->loss_count);
@@ -391,7 +345,6 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
}
return is_new_loss;
}
-EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
{
@@ -413,7 +366,6 @@ out_free:
}
return -ENOBUFS;
}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc);
void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
{
@@ -425,7 +377,6 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
h->ring[i] = NULL;
}
}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
/**
* tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
@@ -437,7 +388,7 @@ static inline struct tfrc_rx_hist_entry *
}
/**
- * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
+ * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
@@ -496,4 +447,3 @@ keep_ref_for_next_time:
return sample;
}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index c7eeda49cb2..ee362b0b630 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
#include <linux/slab.h>
#include "tfrc.h"
-struct tfrc_tx_hist_entry;
+/**
+ * tfrc_tx_hist_entry - Simple singly-linked TX history list
+ * @next: next oldest entry (LIFO order)
+ * @seqno: sequence number of this entry
+ * @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+ struct tfrc_tx_hist_entry *next;
+ u64 seqno;
+ ktime_t stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+ tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+ while (head != NULL && head->seqno != seqno)
+ head = head->next;
+ return head;
+}
-extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
-extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
- const u64 seqno, const ktime_t now);
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
/* Subtraction a-b modulo-16, respects circular wrap-around */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
@@ -64,13 +80,12 @@ struct tfrc_rx_hist_entry {
u64 tfrchrx_seqno:48,
tfrchrx_ccval:4,
tfrchrx_type:4;
- u32 tfrchrx_ndp; /* In fact it is from 8 to 24 bits */
+ u64 tfrchrx_ndp:48;
ktime_t tfrchrx_tstamp;
};
/**
* tfrc_rx_hist - RX history structure for TFRC-based protocols
- *
* @ring: Packet history for RTT sampling and loss detection
* @loss_count: Number of entries in circular history
* @loss_start: Movable index (for loss detection)
@@ -118,46 +133,23 @@ static inline struct tfrc_rx_hist_entry *
return h->ring[h->loss_start];
}
-/* initialise loss detection and disable RTT sampling */
-static inline void tfrc_rx_hist_loss_indicated(struct tfrc_rx_hist *h)
-{
- h->loss_count = 1;
-}
-
/* indicate whether previously a packet was detected missing */
-static inline int tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
+static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
{
- return h->loss_count;
-}
-
-/* any data packets missing between last reception and skb ? */
-static inline int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h,
- const struct sk_buff *skb,
- u32 ndp)
-{
- int delta = dccp_delta_seqno(tfrc_rx_hist_last_rcv(h)->tfrchrx_seqno,
- DCCP_SKB_CB(skb)->dccpd_seq);
-
- if (delta > 1 && ndp < delta)
- tfrc_rx_hist_loss_indicated(h);
-
- return tfrc_rx_hist_loss_pending(h);
+ return h->loss_count > 0;
}
-extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
- const struct sk_buff *skb, const u32 ndp);
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
+ const u64 ndp);
-extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
struct tfrc_loss_hist;
-extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
- struct tfrc_loss_hist *lh,
- struct sk_buff *skb, u32 ndp,
- u32 (*first_li)(struct sock *sk),
- struct sock *sk);
-extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
- const struct sk_buff *skb);
-extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
-extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
+ struct sk_buff *skb, const u64 ndp,
+ u32 (*first_li)(struct sock *sk), struct sock *sk);
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
index d1dfbb8de64..62b5828acde 100644
--- a/net/dccp/ccids/lib/tfrc.c
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -1,28 +1,19 @@
/*
- * TFRC: main module holding the pieces of the TFRC library together
+ * TFRC library initialisation
*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
*/
-#include <linux/module.h>
#include <linux/moduleparam.h>
#include "tfrc.h"
#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-int tfrc_debug;
-module_param(tfrc_debug, bool, 0444);
-MODULE_PARM_DESC(tfrc_debug, "Enable debug messages");
+bool tfrc_debug;
+module_param(tfrc_debug, bool, 0644);
+MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
#endif
-extern int tfrc_tx_packet_history_init(void);
-extern void tfrc_tx_packet_history_exit(void);
-extern int tfrc_rx_packet_history_init(void);
-extern void tfrc_rx_packet_history_exit(void);
-
-extern int tfrc_li_init(void);
-extern void tfrc_li_exit(void);
-
-static int __init tfrc_module_init(void)
+int __init tfrc_lib_init(void)
{
int rc = tfrc_li_init();
@@ -46,18 +37,9 @@ out:
return rc;
}
-static void __exit tfrc_module_exit(void)
+void tfrc_lib_exit(void)
{
tfrc_rx_packet_history_exit();
tfrc_tx_packet_history_exit();
tfrc_li_exit();
}
-
-module_init(tfrc_module_init);
-module_exit(tfrc_module_exit);
-
-MODULE_AUTHOR("Gerrit Renker <gerrit@erg.abdn.ac.uk>, "
- "Ian McDonald <ian.mcdonald@jandi.co.nz>, "
- "Arnaldo Carvalho de Melo <acme@redhat.com>");
-MODULE_DESCRIPTION("DCCP TFRC library");
-MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 1fb1187bbf1..40ee7d62b65 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -1,8 +1,6 @@
#ifndef _TFRC_H_
#define _TFRC_H_
/*
- * net/dccp/ccids/lib/tfrc.h
- *
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
@@ -15,35 +13,34 @@
* (at your option) any later version.
*/
#include <linux/types.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
#include "../../dccp.h"
-/* internal includes that this module exports: */
+
+/* internal includes that this library exports: */
#include "loss_interval.h"
#include "packet_history.h"
#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-extern int tfrc_debug;
+extern bool tfrc_debug;
#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
#else
#define tfrc_pr_debug(format, a...)
#endif
/* integer-arithmetic divisions of type (a * 1000000)/b */
-static inline u64 scaled_div(u64 a, u32 b)
+static inline u64 scaled_div(u64 a, u64 b)
{
- BUG_ON(b==0);
- a *= 1000000;
- do_div(a, b);
- return a;
+ BUG_ON(b == 0);
+ return div64_u64(a * 1000000, b);
}
-static inline u32 scaled_div32(u64 a, u32 b)
+static inline u32 scaled_div32(u64 a, u64 b)
{
u64 result = scaled_div(a, b);
if (result > UINT_MAX) {
- DCCP_CRIT("Overflow: a(%llu)/b(%u) > ~0U",
- (unsigned long long)a, b);
+ DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
+ (unsigned long long)a, (unsigned long long)b);
return UINT_MAX;
}
return result;
@@ -58,7 +55,23 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}
-extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
+
+int tfrc_tx_packet_history_init(void);
+void tfrc_tx_packet_history_exit(void);
+int tfrc_rx_packet_history_init(void);
+void tfrc_rx_packet_history_exit(void);
+int tfrc_li_init(void);
+void tfrc_li_exit(void);
+
+#ifdef CONFIG_IP_DCCP_TFRC_LIB
+int tfrc_lib_init(void);
+void tfrc_lib_exit(void);
+#else
+#define tfrc_lib_init() (0)
+#define tfrc_lib_exit()
+#endif
#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index e4e64b76c10..88ef98285be 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -1,6 +1,4 @@
/*
- * net/dccp/ccids/lib/tfrc_equation.c
- *
* Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
@@ -79,10 +77,10 @@
}
With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
- lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
- lookup[M][0] = g(1000000) = 1000000 * f(100%)
- lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
- lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
+ lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
+ lookup[M][0] = g(1000000) = 1000000 * f(100%)
+ lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
+ lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
In summary, the two columns represent f(p) for the following ranges:
* The first column is for 0.002 <= p <= 1.0
@@ -610,11 +608,11 @@ static inline u32 tfrc_binsearch(u32 fval, u8 small)
/**
* tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
+ * @s: packet size in bytes
+ * @R: RTT scaled by 1000000 (i.e., microseconds)
+ * @p: loss ratio estimate scaled by 1000000
*
- * @s: packet size in bytes
- * @R: RTT scaled by 1000000 (i.e., microseconds)
- * @p: loss ratio estimate scaled by 1000000
- * Returns X_calc in bytes per second (not scaled).
+ * Returns X_calc in bytes per second (not scaled).
*/
u32 tfrc_calc_x(u16 s, u32 R, u32 p)
{
@@ -630,17 +628,17 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
return ~0U;
}
- if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
+ if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
DCCP_WARN("Value of p (%d) below resolution. "
"Substituting %d\n", p, TFRC_SMALLEST_P);
index = 0;
- } else /* 0.0001 <= p <= 0.05 */
+ } else /* 0.0001 <= p <= 0.05 */
index = p/TFRC_SMALLEST_P - 1;
f = tfrc_calc_x_lookup[index][1];
- } else { /* 0.05 < p <= 1.00 */
+ } else { /* 0.05 < p <= 1.00 */
index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
f = tfrc_calc_x_lookup[index][0];
@@ -659,12 +657,10 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
return scaled_div32(result, f);
}
-EXPORT_SYMBOL_GPL(tfrc_calc_x);
-
-/*
+/**
* tfrc_calc_x_reverse_lookup - try to find p given f(p)
- *
* @fvalue: function value to match, scaled by 1000000
+ *
* Returns closest match for p, also scaled by 1000000
*/
u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
@@ -676,11 +672,11 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
/* Error cases. */
if (fvalue < tfrc_calc_x_lookup[0][1]) {
- DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
- return tfrc_calc_x_lookup[0][1];
+ DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
+ return TFRC_SMALLEST_P;
}
if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
- DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
+ DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
return 1000000;
}
@@ -694,4 +690,16 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
-EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
+/**
+ * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+ if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
+ return 0;
+ if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
+ return 1000000;
+ return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 287a62bc2e0..c67816647cc 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -23,9 +23,9 @@
* DCCP - specific warning and debugging macros.
*/
#define DCCP_WARN(fmt, a...) LIMIT_NETDEBUG(KERN_WARNING "%s: " fmt, \
- __FUNCTION__, ##a)
+ __func__, ##a)
#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
- __FILE__, __LINE__, __FUNCTION__)
+ __FILE__, __LINE__, __func__)
#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
DCCP_BUG("\"%s\" holds (exception!)", \
@@ -36,22 +36,24 @@
printk(fmt, ##args); \
} while(0)
#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
- "%s: " fmt, __FUNCTION__, ##a)
+ "%s: " fmt, __func__, ##a)
#ifdef CONFIG_IP_DCCP_DEBUG
-extern int dccp_debug;
+extern bool dccp_debug;
#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
+#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
#else
#define dccp_pr_debug(format, a...)
#define dccp_pr_debug_cat(format, a...)
+#define dccp_debug(format, a...)
#endif
extern struct inet_hashinfo dccp_hashinfo;
-extern atomic_t dccp_orphan_count;
+extern struct percpu_counter dccp_orphan_count;
-extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+void dccp_time_wait(struct sock *sk, int state, int timeo);
/*
* Set safe upper bounds for header and option length. Since Data Offset is 8
@@ -61,16 +63,19 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
* Hence a safe upper bound for the maximum option length is 1020-28 = 992
*/
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
#define DCCP_MAX_PACKET_HDR 28
#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
+
#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
/* RFC 1122, 4.2.3.1 initial RTO value */
-#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
/*
* The maximum back-off value for retransmissions. This is needed for
@@ -79,7 +84,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* - feature-negotiation retransmission (sec. 6.6.3),
* - Acks in client-PARTOPEN state (sec. 8.1.5).
*/
-#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
+#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
/*
* RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
@@ -88,19 +93,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
-/* Maximal interval between probes for local resources. */
-#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
-
/* sysctl variables for DCCP */
extern int sysctl_dccp_request_retries;
extern int sysctl_dccp_retries1;
extern int sysctl_dccp_retries2;
-extern int sysctl_dccp_feat_sequence_window;
-extern int sysctl_dccp_feat_rx_ccid;
-extern int sysctl_dccp_feat_tx_ccid;
-extern int sysctl_dccp_feat_ack_ratio;
-extern int sysctl_dccp_feat_send_ack_vector;
-extern int sysctl_dccp_feat_send_ndp_count;
extern int sysctl_dccp_tx_qlen;
extern int sysctl_dccp_sync_ratelimit;
@@ -153,6 +149,30 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
return after48(seq1, seq2) ? seq1 : seq2;
}
+/**
+ * dccp_loss_count - Approximate the number of lost data packets in a burst loss
+ * @s1: last known sequence number before the loss ('hole')
+ * @s2: first sequence number seen after the 'hole'
+ * @ndp: NDP count on packet with sequence number @s2
+ */
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
+{
+ s64 delta = dccp_delta_seqno(s1, s2);
+
+ WARN_ON(delta < 0);
+ delta -= ndp + 1;
+
+ return delta > 0 ? delta : 0;
+}
+
+/**
+ * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+ return dccp_loss_count(s1, s2, ndp) == 0;
+}
+
enum {
DCCP_MIB_NUM = 0,
DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
@@ -175,17 +195,12 @@ enum {
#define DCCP_MIB_MAX __DCCP_MIB_MAX
struct dccp_mib {
unsigned long mibs[DCCP_MIB_MAX];
-} __SNMP_MIB_ALIGN__;
+};
DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-#define DCCP_ADD_STATS_BH(field, val) \
- SNMP_ADD_STATS_BH(dccp_statistics, field, val)
-#define DCCP_ADD_STATS_USER(field, val) \
- SNMP_ADD_STATS_USER(dccp_statistics, field, val)
/*
* Checksumming routines
@@ -209,99 +224,108 @@ static inline void dccp_csum_outgoing(struct sk_buff *skb)
skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
}
-extern void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
-extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
+int dccp_retransmit_skb(struct sock *sk);
-extern void dccp_send_ack(struct sock *sk);
-extern void dccp_reqsk_send_ack(struct sk_buff *sk, struct request_sock *rsk);
+void dccp_send_ack(struct sock *sk);
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk);
-extern void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type);
+void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type);
-extern void dccp_write_xmit(struct sock *sk, int block);
-extern void dccp_write_space(struct sock *sk);
+/*
+ * TX Packet Dequeueing Interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+bool dccp_qpolicy_full(struct sock *sk);
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+struct sk_buff *dccp_qpolicy_top(struct sock *sk);
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
-extern void dccp_init_xmit_timers(struct sock *sk);
+/*
+ * TX Packet Output and TX Timers
+ */
+void dccp_write_xmit(struct sock *sk);
+void dccp_write_space(struct sock *sk);
+void dccp_flush_write_queue(struct sock *sk, long *time_budget);
+
+void dccp_init_xmit_timers(struct sock *sk);
static inline void dccp_clear_xmit_timers(struct sock *sk)
{
inet_csk_clear_xmit_timers(sk);
}
-extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
-extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
+const char *dccp_packet_name(const int type);
-extern void dccp_set_state(struct sock *sk, const int state);
-extern void dccp_done(struct sock *sk);
+void dccp_set_state(struct sock *sk, const int state);
+void dccp_done(struct sock *sk);
-extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
+int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+ struct sk_buff const *skb);
-extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-extern struct sock *dccp_create_openreq_child(struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb);
+struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
-extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst);
-extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct request_sock **prev);
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev);
-extern int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb);
-extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len);
-extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len);
+int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb);
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned int len);
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len);
-extern int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
-extern int dccp_destroy_sock(struct sock *sk);
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
+void dccp_destroy_sock(struct sock *sk);
-extern void dccp_close(struct sock *sk, long timeout);
-extern struct sk_buff *dccp_make_response(struct sock *sk,
- struct dst_entry *dst,
- struct request_sock *req);
+void dccp_close(struct sock *sk, long timeout);
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req);
-extern int dccp_connect(struct sock *sk);
-extern int dccp_disconnect(struct sock *sk, int flags);
-extern int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-extern int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
+int dccp_connect(struct sock *sk);
+int dccp_disconnect(struct sock *sk, int flags);
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
#ifdef CONFIG_COMPAT
-extern int compat_dccp_getsockopt(struct sock *sk,
- int level, int optname,
- char __user *optval, int __user *optlen);
-extern int compat_dccp_setsockopt(struct sock *sk,
- int level, int optname,
- char __user *optval, int optlen);
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
#endif
-extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
-extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t size);
-extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len);
-extern void dccp_shutdown(struct sock *sk, int how);
-extern int inet_dccp_listen(struct socket *sock, int backlog);
-extern unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
-extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len);
-
-extern struct sk_buff *dccp_ctl_make_reset(struct socket *ctl,
- struct sk_buff *skb);
-extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
-extern void dccp_send_close(struct sock *sk, const int active);
-extern int dccp_invalid_packet(struct sk_buff *skb);
-extern u32 dccp_sample_rtt(struct sock *sk, long delta);
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t size);
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t len, int nonblock, int flags,
+ int *addr_len);
+void dccp_shutdown(struct sock *sk, int how);
+int inet_dccp_listen(struct socket *sock, int backlog);
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait);
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
+void dccp_send_close(struct sock *sk, const int active);
+int dccp_invalid_packet(struct sk_buff *skb);
+u32 dccp_sample_rtt(struct sock *sk, long delta);
static inline int dccp_bad_service_code(const struct sock *sk,
const __be32 service)
@@ -322,9 +346,16 @@ static inline int dccp_bad_service_code(const struct sock *sk,
* @dccpd_opt_len: total length of all options (5.8) in the packet
* @dccpd_seq: sequence number
* @dccpd_ack_seq: acknowledgment number subheader field value
+ *
* This is used for transmission as well as for reception.
*/
struct dccp_skb_cb {
+ union {
+ struct inet_skb_parm h4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_skb_parm h6;
+#endif
+ } header;
__u8 dccpd_type:4;
__u8 dccpd_ccval:4;
__u8 dccpd_reset_code,
@@ -389,52 +420,73 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
static inline void dccp_update_gsr(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_minisock *dmsk = dccp_msk(sk);
- dp->dccps_gsr = seq;
- dccp_set_seqno(&dp->dccps_swl,
- dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
- dccp_set_seqno(&dp->dccps_swh,
- dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
+ if (after48(seq, dp->dccps_gsr))
+ dp->dccps_gsr = seq;
+ /* Sequence validity window depends on remote Sequence Window (7.5.1) */
+ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+ /*
+ * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+ * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+ * always > 32, so for the first W/W' packets in the lifetime of a
+ * connection we always have to adjust SWL.
+ * A second reason why we are doing this is that the window depends on
+ * the feature-remote value of Sequence Window: nothing stops the peer
+ * from updating this value while we are busy adjusting SWL for the
+ * first W packets (we would have to count from scratch again then).
+ * Therefore it is safer to always make sure that the Sequence Window
+ * is not artificially extended by a peer who grows SWL downwards by
+ * continually updating the feature-remote Sequence-Window.
+ * If sequence numbers wrap it is bad luck. But that will take a while
+ * (48 bit), and this measure prevents Sequence-number attacks.
+ */
+ if (before48(dp->dccps_swl, dp->dccps_isr))
+ dp->dccps_swl = dp->dccps_isr;
+ dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
}
static inline void dccp_update_gss(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
- dp->dccps_awh = dp->dccps_gss = seq;
- dccp_set_seqno(&dp->dccps_awl,
- (dp->dccps_gss -
- dccp_msk(sk)->dccpms_sequence_window + 1));
+ dp->dccps_gss = seq;
+ /* Ack validity window depends on local Sequence Window value (7.5.1) */
+ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+ /* Adjust AWL so that it is not below ISS - see comment above for SWL */
+ if (before48(dp->dccps_awl, dp->dccps_iss))
+ dp->dccps_awl = dp->dccps_iss;
+ dp->dccps_awh = dp->dccps_gss;
+}
+
+static inline int dccp_ackvec_pending(const struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
+ !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
}
static inline int dccp_ack_pending(const struct sock *sk)
{
- const struct dccp_sock *dp = dccp_sk(sk);
- return dp->dccps_timestamp_echo != 0 ||
-#ifdef CONFIG_IP_DCCP_ACKVEC
- (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
-#endif
- inet_csk_ack_scheduled(sk);
+ return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
}
-extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
-extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
-extern int dccp_insert_option_elapsed_time(struct sock *sk,
- struct sk_buff *skb,
- u32 elapsed_time);
-extern u32 dccp_timestamp(void);
-extern void dccp_timestamping_init(void);
-extern int dccp_insert_option_timestamp(struct sock *sk,
- struct sk_buff *skb);
-extern int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- unsigned char option,
- const void *value, unsigned char len);
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
+int dccp_feat_finalise_settings(struct dccp_sock *dp);
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
+int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
+ struct sk_buff *skb);
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
+void dccp_feat_list_purge(struct list_head *fn_list);
+
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
+u32 dccp_timestamp(void);
+void dccp_timestamping_init(void);
+int dccp_insert_option(struct sk_buff *skb, unsigned char option,
+ const void *value, unsigned char len);
#ifdef CONFIG_SYSCTL
-extern int dccp_sysctl_init(void);
-extern void dccp_sysctl_exit(void);
+int dccp_sysctl_init(void);
+void dccp_sysctl_exit(void);
#else
static inline int dccp_sysctl_init(void)
{
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index d8a3509b26f..028fc43aacb 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -29,11 +29,14 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_backoff = icsk->icsk_backoff;
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
- if (dccp_msk(sk)->dccpms_send_ack_vector)
+ if (dp->dccps_hc_rx_ackvec != NULL)
info->tcpi_options |= TCPI_OPT_SACK;
- ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
- ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+ if (dp->dccps_hc_rx_ccid != NULL)
+ ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+
+ if (dp->dccps_hc_tx_ccid != NULL)
+ ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
}
static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -45,11 +48,23 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
dccp_get_info(sk, _info);
}
-static struct inet_diag_handler dccp_diag_handler = {
- .idiag_hashinfo = &dccp_hashinfo,
+static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
+}
+
+static int dccp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler dccp_diag_handler = {
+ .dump = dccp_diag_dump,
+ .dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
- .idiag_type = DCCPDIAG_GETSOCK,
- .idiag_info_size = sizeof(struct tcp_info),
+ .idiag_type = IPPROTO_DCCP,
};
static int __init dccp_diag_init(void)
@@ -68,4 +83,4 @@ module_exit(dccp_diag_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCP inet_diag handler");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, DCCPDIAG_GETSOCK);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 4a4f6ce4498..9733ddbc96c 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1,11 +1,18 @@
/*
* net/dccp/feat.c
*
- * An implementation of the DCCP protocol
- * Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *
+ * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ * Rewrote from scratch, some bits from earlier code by
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
*
* ASSUMPTIONS
* -----------
+ * o Feature negotiation is coordinated with connection setup (as in TCP), wild
+ * changes of parameters of an established connection are not supported.
+ * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
* o All currently known SP features have 1-byte quantities. If in the future
* extensions of RFCs 4340..42 define features with item lengths larger than
* one byte, a feature-specific extension of the code will be required.
@@ -15,635 +22,1540 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
-
#include <linux/module.h>
-
+#include <linux/slab.h>
#include "ccid.h"
#include "feat.h"
-#define DCCP_FEAT_SP_NOAGREE (-123)
-
-int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
- u8 *val, u8 len, gfp_t gfp)
-{
- struct dccp_opt_pend *opt;
-
- dccp_feat_debug(type, feature, *val);
-
- if (len > 3) {
- DCCP_WARN("invalid length %d\n", len);
- return 1;
- }
- /* XXX add further sanity checks */
-
- /* check if that feature is already being negotiated */
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- /* ok we found a negotiation for this option already */
- if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
- dccp_pr_debug("Replacing old\n");
- /* replace */
- BUG_ON(opt->dccpop_val == NULL);
- kfree(opt->dccpop_val);
- opt->dccpop_val = val;
- opt->dccpop_len = len;
- opt->dccpop_conf = 0;
- return 0;
- }
- }
-
- /* negotiation for a new feature */
- opt = kmalloc(sizeof(*opt), gfp);
- if (opt == NULL)
- return -ENOMEM;
-
- opt->dccpop_type = type;
- opt->dccpop_feat = feature;
- opt->dccpop_len = len;
- opt->dccpop_val = val;
- opt->dccpop_conf = 0;
- opt->dccpop_sc = NULL;
-
- BUG_ON(opt->dccpop_val == NULL);
-
- list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
- return 0;
-}
+/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
+unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
+int sysctl_dccp_rx_ccid __read_mostly = 2,
+ sysctl_dccp_tx_ccid __read_mostly = 2;
-EXPORT_SYMBOL_GPL(dccp_feat_change);
-
-static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
+/*
+ * Feature activation handlers.
+ *
+ * These all use an u64 argument, to provide enough room for NN/SP features. At
+ * this stage the negotiated values have been checked to be within their range.
+ */
+static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
- /* figure out if we are changing our CCID or the peer's */
- const int rx = type == DCCPO_CHANGE_R;
- const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
- struct ccid *new_ccid;
-
- /* Check if nothing is being changed. */
- if (ccid_nr == new_ccid_nr)
- return 0;
+ struct ccid *new_ccid = ccid_new(ccid, sk, rx);
- new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC);
if (new_ccid == NULL)
return -ENOMEM;
if (rx) {
ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
dp->dccps_hc_rx_ccid = new_ccid;
- dmsk->dccpms_rx_ccid = new_ccid_nr;
} else {
ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
dp->dccps_hc_tx_ccid = new_ccid;
- dmsk->dccpms_tx_ccid = new_ccid_nr;
}
+ return 0;
+}
+
+static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (rx) {
+ dp->dccps_r_seq_win = seq_win;
+ /* propagate changes to update SWL/SWH */
+ dccp_update_gsr(sk, dp->dccps_gsr);
+ } else {
+ dp->dccps_l_seq_win = seq_win;
+ /* propagate changes to update AWL */
+ dccp_update_gss(sk, dp->dccps_gss);
+ }
+ return 0;
+}
+static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
+{
+ if (rx)
+ dccp_sk(sk)->dccps_r_ack_ratio = ratio;
+ else
+ dccp_sk(sk)->dccps_l_ack_ratio = ratio;
return 0;
}
-static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
+static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
{
- dccp_feat_debug(type, feat, val);
+ struct dccp_sock *dp = dccp_sk(sk);
- switch (feat) {
- case DCCPF_CCID:
- return dccp_feat_update_ccid(sk, type, val);
- default:
- dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n",
- dccp_feat_typename(type), feat);
- break;
+ if (rx) {
+ if (enable && dp->dccps_hc_rx_ackvec == NULL) {
+ dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
+ if (dp->dccps_hc_rx_ackvec == NULL)
+ return -ENOMEM;
+ } else if (!enable) {
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ }
}
return 0;
}
-static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
- u8 *rpref, u8 rlen)
+static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
+{
+ if (!rx)
+ dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
+ return 0;
+}
+
+/*
+ * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
+ * `rx' holds when the sending peer informs about his partial coverage via a
+ * ChangeR() option. In the other case, we are the sender and the receiver
+ * announces its coverage via ChangeL() options. The policy here is to honour
+ * such communication by enabling the corresponding partial coverage - but only
+ * if it has not been set manually before; the warning here means that all
+ * packets will be dropped.
+ */
+static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
{
struct dccp_sock *dp = dccp_sk(sk);
- u8 *spref, slen, *res = NULL;
- int i, j, rc, agree = 1;
- BUG_ON(rpref == NULL);
+ if (rx)
+ dp->dccps_pcrlen = cscov;
+ else {
+ if (dp->dccps_pcslen == 0)
+ dp->dccps_pcslen = cscov;
+ else if (cscov > dp->dccps_pcslen)
+ DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
+ dp->dccps_pcslen, (u8)cscov);
+ }
+ return 0;
+}
+
+static const struct {
+ u8 feat_num; /* DCCPF_xxx */
+ enum dccp_feat_type rxtx; /* RX or TX */
+ enum dccp_feat_type reconciliation; /* SP or NN */
+ u8 default_value; /* as in 6.4 */
+ int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
+/*
+ * Lookup table for location and type of features (from RFC 4340/4342)
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ * | Feature | Location | Reconc. | Initial | Section |
+ * | | RX | TX | SP | NN | Value | Reference |
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ * | DCCPF_CCID | | X | X | | 2 | 10 |
+ * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
+ * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
+ * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
+ * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
+ * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
+ * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
+ * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
+ * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
+ * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ */
+} dccp_feat_table[] = {
+ { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
+ { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
+ { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
+ { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
+ { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
+ { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
+ { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
+ { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
+ { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
+ { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
+};
+#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
+
+/**
+ * dccp_feat_index - Hash function to map feature number into array position
+ * Returns consecutive array index or -1 if the feature is not understood.
+ */
+static int dccp_feat_index(u8 feat_num)
+{
+ /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
+ if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
+ return feat_num - 1;
- /* check if we are the black sheep */
- if (dp->dccps_role == DCCP_ROLE_CLIENT) {
- spref = rpref;
- slen = rlen;
- rpref = opt->dccpop_val;
- rlen = opt->dccpop_len;
- } else {
- spref = opt->dccpop_val;
- slen = opt->dccpop_len;
+ /*
+ * Other features: add cases for new feature types here after adding
+ * them to the above table.
+ */
+ switch (feat_num) {
+ case DCCPF_SEND_LEV_RATE:
+ return DCCP_FEAT_SUPPORTED_MAX - 1;
}
+ return -1;
+}
+
+static u8 dccp_feat_type(u8 feat_num)
+{
+ int idx = dccp_feat_index(feat_num);
+
+ if (idx < 0)
+ return FEAT_UNKNOWN;
+ return dccp_feat_table[idx].reconciliation;
+}
+
+static int dccp_feat_default_value(u8 feat_num)
+{
+ int idx = dccp_feat_index(feat_num);
/*
- * Now we have server preference list in spref and client preference in
- * rpref
+ * There are no default values for unknown features, so encountering a
+ * negative index here indicates a serious problem somewhere else.
*/
- BUG_ON(spref == NULL);
- BUG_ON(rpref == NULL);
+ DCCP_BUG_ON(idx < 0);
- /* FIXME sanity check vals */
+ return idx < 0 ? 0 : dccp_feat_table[idx].default_value;
+}
- /* Are values in any order? XXX Lame "algorithm" here */
- for (i = 0; i < slen; i++) {
- for (j = 0; j < rlen; j++) {
- if (spref[i] == rpref[j]) {
- res = &spref[i];
- break;
- }
- }
- if (res)
- break;
- }
+/*
+ * Debugging and verbose-printing section
+ */
+static const char *dccp_feat_fname(const u8 feat)
+{
+ static const char *const feature_names[] = {
+ [DCCPF_RESERVED] = "Reserved",
+ [DCCPF_CCID] = "CCID",
+ [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
+ [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
+ [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
+ [DCCPF_ACK_RATIO] = "Ack Ratio",
+ [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
+ [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
+ [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
+ [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
+ };
+ if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
+ return feature_names[DCCPF_RESERVED];
- /* we didn't agree on anything */
- if (res == NULL) {
- /* confirm previous value */
- switch (opt->dccpop_feat) {
- case DCCPF_CCID:
- /* XXX did i get this right? =P */
- if (opt->dccpop_type == DCCPO_CHANGE_L)
- res = &dccp_msk(sk)->dccpms_tx_ccid;
- else
- res = &dccp_msk(sk)->dccpms_rx_ccid;
- break;
+ if (feat == DCCPF_SEND_LEV_RATE)
+ return "Send Loss Event Rate";
+ if (feat >= DCCPF_MIN_CCID_SPECIFIC)
+ return "CCID-specific";
- default:
- DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat);
- /* XXX implement res */
- return -EFAULT;
- }
+ return feature_names[feat];
+}
- dccp_pr_debug("Don't agree... reconfirming %d\n", *res);
- agree = 0; /* this is used for mandatory options... */
+static const char *const dccp_feat_sname[] = {
+ "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE",
+};
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_feat_oname(const u8 opt)
+{
+ switch (opt) {
+ case DCCPO_CHANGE_L: return "Change_L";
+ case DCCPO_CONFIRM_L: return "Confirm_L";
+ case DCCPO_CHANGE_R: return "Change_R";
+ case DCCPO_CONFIRM_R: return "Confirm_R";
}
+ return NULL;
+}
- /* need to put result and our preference list */
- rlen = 1 + opt->dccpop_len;
- rpref = kmalloc(rlen, GFP_ATOMIC);
- if (rpref == NULL)
- return -ENOMEM;
+static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
+{
+ u8 i, type = dccp_feat_type(feat_num);
+
+ if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
+ dccp_pr_debug_cat("(NULL)");
+ else if (type == FEAT_SP)
+ for (i = 0; i < val->sp.len; i++)
+ dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
+ else if (type == FEAT_NN)
+ dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
+ else
+ dccp_pr_debug_cat("unknown type %u", type);
+}
- *rpref = *res;
- memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len);
+static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
+{
+ u8 type = dccp_feat_type(feat_num);
+ dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
+
+ if (type == FEAT_NN)
+ fval.nn = dccp_decode_value_var(list, len);
+ dccp_feat_printval(feat_num, &fval);
+}
- /* put it in the "confirm queue" */
- if (opt->dccpop_sc == NULL) {
- opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC);
- if (opt->dccpop_sc == NULL) {
- kfree(rpref);
- return -ENOMEM;
+static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
+{
+ dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote",
+ dccp_feat_fname(entry->feat_num));
+ dccp_feat_printval(entry->feat_num, &entry->val);
+ dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
+ entry->needs_confirm ? "(Confirm pending)" : "");
+}
+
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \
+ dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
+ dccp_feat_printvals(feat, val, len); \
+ dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
+
+#define dccp_feat_print_fnlist(fn_list) { \
+ const struct dccp_feat_entry *___entry; \
+ \
+ dccp_pr_debug("List Dump:\n"); \
+ list_for_each_entry(___entry, fn_list, node) \
+ dccp_feat_print_entry(___entry); \
+}
+#else /* ! CONFIG_IP_DCCP_DEBUG */
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
+#define dccp_feat_print_fnlist(fn_list)
+#endif
+
+static int __dccp_feat_activate(struct sock *sk, const int idx,
+ const bool is_local, dccp_feat_val const *fval)
+{
+ bool rx;
+ u64 val;
+
+ if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
+ return -1;
+ if (dccp_feat_table[idx].activation_hdlr == NULL)
+ return 0;
+
+ if (fval == NULL) {
+ val = dccp_feat_table[idx].default_value;
+ } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
+ if (fval->sp.vec == NULL) {
+ /*
+ * This can happen when an empty Confirm is sent
+ * for an SP (i.e. known) feature. In this case
+ * we would be using the default anyway.
+ */
+ DCCP_CRIT("Feature #%d undefined: using default", idx);
+ val = dccp_feat_table[idx].default_value;
+ } else {
+ val = fval->sp.vec[0];
}
} else {
- /* recycle the confirm slot */
- BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
- kfree(opt->dccpop_sc->dccpoc_val);
- dccp_pr_debug("recycling confirm slot\n");
- }
- memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
-
- opt->dccpop_sc->dccpoc_val = rpref;
- opt->dccpop_sc->dccpoc_len = rlen;
-
- /* update the option on our side [we are about to send the confirm] */
- rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
- if (rc) {
- kfree(opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc);
- opt->dccpop_sc = NULL;
- return rc;
+ val = fval->nn;
}
- dccp_pr_debug("Will confirm %d\n", *rpref);
+ /* Location is RX if this is a local-RX or remote-TX feature */
+ rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
- /* say we want to change to X but we just got a confirm X, suppress our
- * change
- */
- if (!opt->dccpop_conf) {
- if (*opt->dccpop_val == *res)
- opt->dccpop_conf = 1;
- dccp_pr_debug("won't ask for change of same feature\n");
+ dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
+ dccp_feat_fname(dccp_feat_table[idx].feat_num),
+ fval ? "" : "default ", (unsigned long long)val);
+
+ return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
+}
+
+/**
+ * dccp_feat_activate - Activate feature value on socket
+ * @sk: fully connected DCCP socket (after handshake is complete)
+ * @feat_num: feature to activate, one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ *
+ * For general use this function is preferable over __dccp_feat_activate().
+ */
+static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
+ dccp_feat_val const *fval)
+{
+ return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
+}
+
+/* Test for "Req'd" feature (RFC 4340, 6.4) */
+static inline int dccp_feat_must_be_understood(u8 feat_num)
+{
+ return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
+ feat_num == DCCPF_SEQUENCE_WINDOW;
+}
+
+/* copy constructor, fval must not already contain allocated memory */
+static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
+{
+ fval->sp.len = len;
+ if (fval->sp.len > 0) {
+ fval->sp.vec = kmemdup(val, len, gfp_any());
+ if (fval->sp.vec == NULL) {
+ fval->sp.len = 0;
+ return -ENOBUFS;
+ }
}
+ return 0;
+}
- return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */
+static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
+{
+ if (unlikely(val == NULL))
+ return;
+ if (dccp_feat_type(feat_num) == FEAT_SP)
+ kfree(val->sp.vec);
+ memset(val, 0, sizeof(*val));
}
-static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
+static struct dccp_feat_entry *
+ dccp_feat_clone_entry(struct dccp_feat_entry const *original)
{
- struct dccp_minisock *dmsk = dccp_msk(sk);
- struct dccp_opt_pend *opt;
- int rc = 1;
- u8 t;
+ struct dccp_feat_entry *new;
+ u8 type = dccp_feat_type(original->feat_num);
- /*
- * We received a CHANGE. We gotta match it against our own preference
- * list. If we got a CHANGE_R it means it's a change for us, so we need
- * to compare our CHANGE_L list.
- */
- if (type == DCCPO_CHANGE_L)
- t = DCCPO_CHANGE_R;
- else
- t = DCCPO_CHANGE_L;
+ if (type == FEAT_UNKNOWN)
+ return NULL;
- /* find our preference list for this feature */
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- if (opt->dccpop_type != t || opt->dccpop_feat != feature)
- continue;
+ new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
+ if (new == NULL)
+ return NULL;
- /* find the winner from the two preference lists */
- rc = dccp_feat_reconcile(sk, opt, val, len);
- break;
+ if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
+ original->val.sp.vec,
+ original->val.sp.len)) {
+ kfree(new);
+ return NULL;
}
+ return new;
+}
- /* We didn't deal with the change. This can happen if we have no
- * preference list for the feature. In fact, it just shouldn't
- * happen---if we understand a feature, we should have a preference list
- * with at least the default value.
- */
- BUG_ON(rc == 1);
+static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
+{
+ if (entry != NULL) {
+ dccp_feat_val_destructor(entry->feat_num, &entry->val);
+ kfree(entry);
+ }
+}
- return rc;
+/*
+ * List management functions
+ *
+ * Feature negotiation lists rely on and maintain the following invariants:
+ * - each feat_num in the list is known, i.e. we know its type and default value
+ * - each feat_num/is_local combination is unique (old entries are overwritten)
+ * - SP values are always freshly allocated
+ * - list is sorted in increasing order of feature number (faster lookup)
+ */
+static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
+ u8 feat_num, bool is_local)
+{
+ struct dccp_feat_entry *entry;
+
+ list_for_each_entry(entry, fn_list, node) {
+ if (entry->feat_num == feat_num && entry->is_local == is_local)
+ return entry;
+ else if (entry->feat_num > feat_num)
+ break;
+ }
+ return NULL;
}
-static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
+/**
+ * dccp_feat_entry_new - Central list update routine (called by all others)
+ * @head: list to add to
+ * @feat: feature number
+ * @local: whether the local (1) or remote feature with number @feat is meant
+ *
+ * This is the only constructor and serves to ensure the above invariants.
+ */
+static struct dccp_feat_entry *
+ dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
{
- struct dccp_opt_pend *opt;
- struct dccp_minisock *dmsk = dccp_msk(sk);
- u8 *copy;
- int rc;
+ struct dccp_feat_entry *entry;
+
+ list_for_each_entry(entry, head, node)
+ if (entry->feat_num == feat && entry->is_local == local) {
+ dccp_feat_val_destructor(entry->feat_num, &entry->val);
+ return entry;
+ } else if (entry->feat_num > feat) {
+ head = &entry->node;
+ break;
+ }
- /* NN features must be Change L (sec. 6.3.2) */
- if (type != DCCPO_CHANGE_L) {
- dccp_pr_debug("received %s for NN feature %d\n",
- dccp_feat_typename(type), feature);
- return -EFAULT;
+ entry = kmalloc(sizeof(*entry), gfp_any());
+ if (entry != NULL) {
+ entry->feat_num = feat;
+ entry->is_local = local;
+ list_add_tail(&entry->node, head);
}
+ return entry;
+}
- /* XXX sanity check opt val */
+/**
+ * dccp_feat_push_change - Add/overwrite a Change option in the list
+ * @fn_list: feature-negotiation list to update
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @needs_mandatory: whether to use Mandatory feature negotiation options
+ * @fval: pointer to NN/SP value to be inserted (will be copied)
+ */
+static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
+ u8 mandatory, dccp_feat_val *fval)
+{
+ struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
- /* copy option so we can confirm it */
- opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
- if (opt == NULL)
+ if (new == NULL)
return -ENOMEM;
- copy = kmemdup(val, len, GFP_ATOMIC);
- if (copy == NULL) {
- kfree(opt);
- return -ENOMEM;
- }
+ new->feat_num = feat;
+ new->is_local = local;
+ new->state = FEAT_INITIALISING;
+ new->needs_confirm = false;
+ new->empty_confirm = false;
+ new->val = *fval;
+ new->needs_mandatory = mandatory;
- opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */
- opt->dccpop_feat = feature;
- opt->dccpop_val = copy;
- opt->dccpop_len = len;
+ return 0;
+}
- /* change feature */
- rc = dccp_feat_update(sk, type, feature, *val);
- if (rc) {
- kfree(opt->dccpop_val);
- kfree(opt);
- return rc;
- }
+/**
+ * dccp_feat_push_confirm - Add a Confirm entry to the FN list
+ * @fn_list: feature-negotiation list to add to
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is being confirmed
+ * @fval: pointer to NN/SP value to be inserted or NULL
+ *
+ * Returns 0 on success, a Reset code for further processing otherwise.
+ */
+static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
+ dccp_feat_val *fval)
+{
+ struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
- dccp_feat_debug(type, feature, *copy);
+ if (new == NULL)
+ return DCCP_RESET_CODE_TOO_BUSY;
- list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
+ new->feat_num = feat;
+ new->is_local = local;
+ new->state = FEAT_STABLE; /* transition in 6.6.2 */
+ new->needs_confirm = true;
+ new->empty_confirm = (fval == NULL);
+ new->val.nn = 0; /* zeroes the whole structure */
+ if (!new->empty_confirm)
+ new->val = *fval;
+ new->needs_mandatory = false;
return 0;
}
-static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
- u8 type, u8 feature)
+static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
{
- /* XXX check if other confirms for that are queued and recycle slot */
- struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
+ return dccp_feat_push_confirm(fn_list, feat, local, NULL);
+}
- if (opt == NULL) {
- /* XXX what do we do? Ignoring should be fine. It's a change
- * after all =P
- */
- return;
- }
+static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
+{
+ list_del(&entry->node);
+ dccp_feat_entry_destructor(entry);
+}
- switch (type) {
- case DCCPO_CHANGE_L:
- opt->dccpop_type = DCCPO_CONFIRM_R;
- break;
- case DCCPO_CHANGE_R:
- opt->dccpop_type = DCCPO_CONFIRM_L;
- break;
- default:
- DCCP_WARN("invalid type %d\n", type);
- kfree(opt);
- return;
+void dccp_feat_list_purge(struct list_head *fn_list)
+{
+ struct dccp_feat_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, fn_list, node)
+ dccp_feat_entry_destructor(entry);
+ INIT_LIST_HEAD(fn_list);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
+
+/* generate @to as full clone of @from - @to must not contain any nodes */
+int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
+{
+ struct dccp_feat_entry *entry, *new;
+
+ INIT_LIST_HEAD(to);
+ list_for_each_entry(entry, from, node) {
+ new = dccp_feat_clone_entry(entry);
+ if (new == NULL)
+ goto cloning_failed;
+ list_add_tail(&new->node, to);
}
- opt->dccpop_feat = feature;
- opt->dccpop_val = NULL;
- opt->dccpop_len = 0;
+ return 0;
- /* change feature */
- dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature);
+cloning_failed:
+ dccp_feat_list_purge(to);
+ return -ENOMEM;
+}
- list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
+/**
+ * dccp_feat_valid_nn_length - Enforce length constraints on NN options
+ * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
+ * incoming options are accepted as long as their values are valid.
+ */
+static u8 dccp_feat_valid_nn_length(u8 feat_num)
+{
+ if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */
+ return 2;
+ if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */
+ return 6;
+ return 0;
+}
+
+static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
+{
+ switch (feat_num) {
+ case DCCPF_ACK_RATIO:
+ return val <= DCCPF_ACK_RATIO_MAX;
+ case DCCPF_SEQUENCE_WINDOW:
+ return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
+ }
+ return 0; /* feature unknown - so we can't tell */
}
-static void dccp_feat_flush_confirm(struct sock *sk)
+/* check that SP values are within the ranges defined in RFC 4340 */
+static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
{
- struct dccp_minisock *dmsk = dccp_msk(sk);
- /* Check if there is anything to confirm in the first place */
- int yes = !list_empty(&dmsk->dccpms_conf);
+ switch (feat_num) {
+ case DCCPF_CCID:
+ return val == DCCPC_CCID2 || val == DCCPC_CCID3;
+ /* Type-check Boolean feature values: */
+ case DCCPF_SHORT_SEQNOS:
+ case DCCPF_ECN_INCAPABLE:
+ case DCCPF_SEND_ACK_VECTOR:
+ case DCCPF_SEND_NDP_COUNT:
+ case DCCPF_DATA_CHECKSUM:
+ case DCCPF_SEND_LEV_RATE:
+ return val < 2;
+ case DCCPF_MIN_CSUM_COVER:
+ return val < 16;
+ }
+ return 0; /* feature unknown */
+}
- if (!yes) {
- struct dccp_opt_pend *opt;
+static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
+{
+ if (sp_list == NULL || sp_len < 1)
+ return 0;
+ while (sp_len--)
+ if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
+ return 0;
+ return 1;
+}
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- if (opt->dccpop_conf) {
- yes = 1;
- break;
+/**
+ * dccp_feat_insert_opts - Generate FN options from current list state
+ * @skb: next sk_buff to be sent to the peer
+ * @dp: for client during handshake and general negotiation
+ * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
+ */
+int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
+{
+ struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+ struct dccp_feat_entry *pos, *next;
+ u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
+ bool rpt;
+
+ /* put entries into @skb in the order they appear in the list */
+ list_for_each_entry_safe_reverse(pos, next, fn, node) {
+ opt = dccp_feat_genopt(pos);
+ type = dccp_feat_type(pos->feat_num);
+ rpt = false;
+
+ if (pos->empty_confirm) {
+ len = 0;
+ ptr = NULL;
+ } else {
+ if (type == FEAT_SP) {
+ len = pos->val.sp.len;
+ ptr = pos->val.sp.vec;
+ rpt = pos->needs_confirm;
+ } else if (type == FEAT_NN) {
+ len = dccp_feat_valid_nn_length(pos->feat_num);
+ ptr = nn_in_nbo;
+ dccp_encode_value_var(pos->val.nn, ptr, len);
+ } else {
+ DCCP_BUG("unknown feature %u", pos->feat_num);
+ return -1;
}
}
+ dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
+
+ if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
+ return -1;
+ if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
+ return -1;
+
+ if (skb->sk->sk_state == DCCP_OPEN &&
+ (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
+ /*
+ * Confirms don't get retransmitted (6.6.3) once the
+ * connection is in state OPEN
+ */
+ dccp_feat_list_pop(pos);
+ } else {
+ /*
+ * Enter CHANGING after transmitting the Change
+ * option (6.6.2).
+ */
+ if (pos->state == FEAT_INITIALISING)
+ pos->state = FEAT_CHANGING;
+ }
}
+ return 0;
+}
- if (!yes)
- return;
+/**
+ * __feat_register_nn - Register new NN value on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an NN feature from %dccp_feature_numbers
+ * @mandatory: use Mandatory option if 1
+ * @nn_val: value to register (restricted to 4 bytes)
+ *
+ * Note that NN features are local by definition (RFC 4340, 6.3.2).
+ */
+static int __feat_register_nn(struct list_head *fn, u8 feat,
+ u8 mandatory, u64 nn_val)
+{
+ dccp_feat_val fval = { .nn = nn_val };
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ /* Don't bother with default values, they will be activated anyway. */
+ if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
+ return 0;
- /* OK there is something to confirm... */
- /* XXX check if packet is in flight? Send delayed ack?? */
- if (sk->sk_state == DCCP_OPEN)
- dccp_send_ack(sk);
+ return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
}
-int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
+/**
+ * __feat_register_sp - Register new SP value/list on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an SP feature from %dccp_feature_numbers
+ * @is_local: whether the local (1) or the remote (0) @feat is meant
+ * @mandatory: use Mandatory option if 1
+ * @sp_val: SP value followed by optional preference list
+ * @sp_len: length of @sp_val in bytes
+ */
+static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
+ u8 mandatory, u8 const *sp_val, u8 sp_len)
{
- int rc;
+ dccp_feat_val fval;
- dccp_feat_debug(type, feature, *val);
+ if (dccp_feat_type(feat) != FEAT_SP ||
+ !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
+ return -EINVAL;
- /* figure out if it's SP or NN feature */
- switch (feature) {
- /* deal with SP features */
- case DCCPF_CCID:
- rc = dccp_feat_sp(sk, type, feature, val, len);
- break;
+ /* Avoid negotiating alien CCIDs by only advertising supported ones */
+ if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
+ return -EOPNOTSUPP;
- /* deal with NN features */
- case DCCPF_ACK_RATIO:
- rc = dccp_feat_nn(sk, type, feature, val, len);
- break;
+ if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
+ return -ENOMEM;
- /* XXX implement other features */
- default:
- dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n",
- dccp_feat_typename(type), feature);
- rc = -EFAULT;
- break;
+ return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
+}
+
+/**
+ * dccp_feat_register_sp - Register requests to change SP feature values
+ * @sk: client or listening socket
+ * @feat: one of %dccp_feature_numbers
+ * @is_local: whether the local (1) or remote (0) @feat is meant
+ * @list: array of preferred values, in descending order of preference
+ * @len: length of @list in bytes
+ */
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len)
+{ /* any changes must be registered before establishing the connection */
+ if (sk->sk_state != DCCP_CLOSED)
+ return -EISCONN;
+ if (dccp_feat_type(feat) != FEAT_SP)
+ return -EINVAL;
+ return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
+ 0, list, len);
+}
+
+/**
+ * dccp_feat_nn_get - Query current/pending value of NN feature
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ *
+ * For a known NN feature, returns value currently being negotiated, or
+ * current (confirmed) value if no negotiation is going on.
+ */
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
+{
+ if (dccp_feat_type(feat) == FEAT_NN) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *entry;
+
+ entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
+ if (entry != NULL)
+ return entry->val.nn;
+
+ switch (feat) {
+ case DCCPF_ACK_RATIO:
+ return dp->dccps_l_ack_ratio;
+ case DCCPF_SEQUENCE_WINDOW:
+ return dp->dccps_l_seq_win;
+ }
+ }
+ DCCP_BUG("attempt to look up unsupported feature %u", feat);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
+
+/**
+ * dccp_feat_signal_nn_change - Update NN values for an established connection
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * @nn_val: the new value to use
+ *
+ * This function is used to communicate NN updates out-of-band.
+ */
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ dccp_feat_val fval = { .nn = nn_val };
+ struct dccp_feat_entry *entry;
+
+ if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+ return 0;
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ if (nn_val == dccp_feat_nn_get(sk, feat))
+ return 0; /* already set or negotiation under way */
+
+ entry = dccp_feat_list_lookup(fn, feat, 1);
+ if (entry != NULL) {
+ dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
+ (unsigned long long)entry->val.nn,
+ (unsigned long long)nn_val);
+ dccp_feat_list_pop(entry);
}
- /* check if there were problems changing features */
- if (rc) {
- /* If we don't agree on SP, we sent a confirm for old value.
- * However we propagate rc to caller in case option was
- * mandatory
+ inet_csk_schedule_ack(sk);
+ return dccp_feat_push_change(fn, feat, 1, 0, &fval);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
+
+/*
+ * Tracking features whose value depend on the choice of CCID
+ *
+ * This is designed with an extension in mind so that a list walk could be done
+ * before activating any features. However, the existing framework was found to
+ * work satisfactorily up until now, the automatic verification is left open.
+ * When adding new CCIDs, add a corresponding dependency table here.
+ */
+static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
+{
+ static const struct ccid_dependency ccid2_dependencies[2][2] = {
+ /*
+ * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
+ * feature and Send Ack Vector is an RX feature, `is_local'
+ * needs to be reversed.
*/
- if (rc != DCCP_FEAT_SP_NOAGREE)
- dccp_feat_empty_confirm(dccp_msk(sk), type, feature);
+ { /* Dependencies of the receiver-side (remote) CCID2 */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = true,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ },
+ { /* Dependencies of the sender-side (local) CCID2 */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ }
+ };
+ static const struct ccid_dependency ccid3_dependencies[2][5] = {
+ { /*
+ * Dependencies of the receiver-side CCID3
+ */
+ { /* locally disable Ack Vectors */
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 0
+ },
+ { /* see below why Send Loss Event Rate is on */
+ .dependent_feat = DCCPF_SEND_LEV_RATE,
+ .is_local = true,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { /* NDP Count is needed as per RFC 4342, 6.1.1 */
+ .dependent_feat = DCCPF_SEND_NDP_COUNT,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 },
+ },
+ { /*
+ * CCID3 at the TX side: we request that the HC-receiver
+ * will not send Ack Vectors (they will be ignored, so
+ * Mandatory is not set); we enable Send Loss Event Rate
+ * (Mandatory since the implementation does not support
+ * the Loss Intervals option of RFC 4342, 8.6).
+ * The last two options are for peer's information only.
+ */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = false,
+ .is_mandatory = false,
+ .val = 0
+ },
+ {
+ .dependent_feat = DCCPF_SEND_LEV_RATE,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { /* this CCID does not support Ack Ratio */
+ .dependent_feat = DCCPF_ACK_RATIO,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 0
+ },
+ { /* tell receiver we are sending NDP counts */
+ .dependent_feat = DCCPF_SEND_NDP_COUNT,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ }
+ };
+ switch (ccid) {
+ case DCCPC_CCID2:
+ return ccid2_dependencies[is_local];
+ case DCCPC_CCID3:
+ return ccid3_dependencies[is_local];
+ default:
+ return NULL;
}
+}
- /* generate the confirm [if required] */
- dccp_feat_flush_confirm(sk);
-
+/**
+ * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
+ * @fn: feature-negotiation list to update
+ * @id: CCID number to track
+ * @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ *
+ * This function needs to be called after registering all other features.
+ */
+static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
+{
+ const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
+ int i, rc = (table == NULL);
+
+ for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
+ if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
+ rc = __feat_register_sp(fn, table[i].dependent_feat,
+ table[i].is_local,
+ table[i].is_mandatory,
+ &table[i].val, 1);
+ else
+ rc = __feat_register_nn(fn, table[i].dependent_feat,
+ table[i].is_mandatory,
+ table[i].val);
return rc;
}
-EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
+/**
+ * dccp_feat_finalise_settings - Finalise settings before starting negotiation
+ * @dp: client or listening socket (settings will be inherited)
+ *
+ * This is called after all registrations (socket initialisation, sysctls, and
+ * sockopt calls), and before sending the first packet containing Change options
+ * (ie. client-Request or server-Response), to ensure internal consistency.
+ */
+int dccp_feat_finalise_settings(struct dccp_sock *dp)
+{
+ struct list_head *fn = &dp->dccps_featneg;
+ struct dccp_feat_entry *entry;
+ int i = 2, ccids[2] = { -1, -1 };
-int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
- u8 *val, u8 len)
+ /*
+ * Propagating CCIDs:
+ * 1) not useful to propagate CCID settings if this host advertises more
+ * than one CCID: the choice of CCID may still change - if this is
+ * the client, or if this is the server and the client sends
+ * singleton CCID values.
+ * 2) since is that propagate_ccid changes the list, we defer changing
+ * the sorted list until after the traversal.
+ */
+ list_for_each_entry(entry, fn, node)
+ if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
+ ccids[entry->is_local] = entry->val.sp.vec[0];
+ while (i--)
+ if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
+ return -1;
+ dccp_feat_print_fnlist(fn);
+ return 0;
+}
+
+/**
+ * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
+ * It is the server which resolves the dependencies once the CCID has been
+ * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
+ */
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
{
- u8 t;
- struct dccp_opt_pend *opt;
- struct dccp_minisock *dmsk = dccp_msk(sk);
- int found = 0;
- int all_confirmed = 1;
+ struct list_head *fn = &dreq->dreq_featneg;
+ struct dccp_feat_entry *entry;
+ u8 is_local, ccid;
- dccp_feat_debug(type, feature, *val);
+ for (is_local = 0; is_local <= 1; is_local++) {
+ entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
- /* locate our change request */
- switch (type) {
- case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break;
- case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break;
- default: DCCP_WARN("invalid type %d\n", type);
- return 1;
+ if (entry != NULL && !entry->empty_confirm)
+ ccid = entry->val.sp.vec[0];
+ else
+ ccid = dccp_feat_default_value(DCCPF_CCID);
+ if (dccp_feat_propagate_ccid(fn, ccid, is_local))
+ return -1;
}
- /* XXX sanity check feature value */
+ return 0;
+}
+
+/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
+static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
+{
+ u8 c, s;
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- if (!opt->dccpop_conf && opt->dccpop_type == t &&
- opt->dccpop_feat == feature) {
- found = 1;
- dccp_pr_debug("feature %d found\n", opt->dccpop_feat);
+ for (s = 0; s < slen; s++)
+ for (c = 0; c < clen; c++)
+ if (servlist[s] == clilist[c])
+ return servlist[s];
+ return -1;
+}
- /* XXX do sanity check */
+/**
+ * dccp_feat_prefer - Move preferred entry to the start of array
+ * Reorder the @array_len elements in @array so that @preferred_value comes
+ * first. Returns >0 to indicate that @preferred_value does occur in @array.
+ */
+static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
+{
+ u8 i, does_occur = 0;
- opt->dccpop_conf = 1;
+ if (array != NULL) {
+ for (i = 0; i < array_len; i++)
+ if (array[i] == preferred_value) {
+ array[i] = array[0];
+ does_occur++;
+ }
+ if (does_occur)
+ array[0] = preferred_value;
+ }
+ return does_occur;
+}
- /* We got a confirmation---change the option */
- dccp_feat_update(sk, opt->dccpop_type,
- opt->dccpop_feat, *val);
+/**
+ * dccp_feat_reconcile - Reconcile SP preference lists
+ * @fval: SP list to reconcile into
+ * @arr: received SP preference list
+ * @len: length of @arr in bytes
+ * @is_server: whether this side is the server (and @fv is the server's list)
+ * @reorder: whether to reorder the list in @fv after reconciling with @arr
+ * When successful, > 0 is returned and the reconciled list is in @fval.
+ * A value of 0 means that negotiation failed (no shared entry).
+ */
+static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
+ bool is_server, bool reorder)
+{
+ int rc;
- /* XXX check the return value of dccp_feat_update */
- break;
- }
+ if (!fv->sp.vec || !arr) {
+ DCCP_CRIT("NULL feature value or array");
+ return 0;
+ }
+
+ if (is_server)
+ rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
+ else
+ rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
+
+ if (!reorder)
+ return rc;
+ if (rc < 0)
+ return 0;
+
+ /*
+ * Reorder list: used for activating features and in dccp_insert_fn_opt.
+ */
+ return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
+}
+
+/**
+ * dccp_feat_change_recv - Process incoming ChangeL/R options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether the Change was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is the server (1) or the client (0)
+ */
+static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len, const bool server)
+{
+ u8 defval, type = dccp_feat_type(feat);
+ const bool local = (opt == DCCPO_CHANGE_R);
+ struct dccp_feat_entry *entry;
+ dccp_feat_val fval;
- if (!opt->dccpop_conf)
- all_confirmed = 0;
+ if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */
+ goto unknown_feature_or_value;
+
+ dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+ /*
+ * Negotiation of NN features: Change R is invalid, so there is no
+ * simultaneous negotiation; hence we do not look up in the list.
+ */
+ if (type == FEAT_NN) {
+ if (local || len > sizeof(fval.nn))
+ goto unknown_feature_or_value;
+
+ /* 6.3.2: "The feature remote MUST accept any valid value..." */
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto unknown_feature_or_value;
+
+ return dccp_feat_push_confirm(fn, feat, local, &fval);
}
- /* fix re-transmit timer */
- /* XXX gotta make sure that no option negotiation occurs during
- * connection shutdown. Consider that the CLOSEREQ is sent and timer is
- * on. if all options are confirmed it might kill timer which should
- * remain alive until close is received.
+ /*
+ * Unidirectional/simultaneous negotiation of SP features (6.3.1)
*/
- if (all_confirmed) {
- dccp_pr_debug("clear feat negotiation timer %p\n", sk);
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL) {
+ /*
+ * No particular preferences have been registered. We deal with
+ * this situation by assuming that all valid values are equally
+ * acceptable, and apply the following checks:
+ * - if the peer's list is a singleton, we accept a valid value;
+ * - if we are the server, we first try to see if the peer (the
+ * client) advertises the default value. If yes, we use it,
+ * otherwise we accept the preferred value;
+ * - else if we are the client, we use the first list element.
+ */
+ if (dccp_feat_clone_sp_val(&fval, val, 1))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ if (len > 1 && server) {
+ defval = dccp_feat_default_value(feat);
+ if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
+ fval.sp.vec[0] = defval;
+ } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
+ kfree(fval.sp.vec);
+ goto unknown_feature_or_value;
+ }
+
+ /* Treat unsupported CCIDs like invalid values */
+ if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
+ kfree(fval.sp.vec);
+ goto not_valid_or_not_known;
+ }
+
+ return dccp_feat_push_confirm(fn, feat, local, &fval);
+
+ } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */
+ return 0;
}
- if (!found)
- dccp_pr_debug("%s(%d, ...) never requested\n",
- dccp_feat_typename(type), feature);
+ if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
+ entry->empty_confirm = false;
+ } else if (is_mandatory) {
+ return DCCP_RESET_CODE_MANDATORY_ERROR;
+ } else if (entry->state == FEAT_INITIALISING) {
+ /*
+ * Failed simultaneous negotiation (server only): try to `save'
+ * the connection by checking whether entry contains the default
+ * value for @feat. If yes, send an empty Confirm to signal that
+ * the received Change was not understood - which implies using
+ * the default value.
+ * If this also fails, we use Reset as the last resort.
+ */
+ WARN_ON(!server);
+ defval = dccp_feat_default_value(feat);
+ if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
+ return DCCP_RESET_CODE_OPTION_ERROR;
+ entry->empty_confirm = true;
+ }
+ entry->needs_confirm = true;
+ entry->needs_mandatory = false;
+ entry->state = FEAT_STABLE;
return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
+unknown_feature_or_value:
+ if (!is_mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
-void dccp_feat_clean(struct dccp_minisock *dmsk)
+not_valid_or_not_known:
+ return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_confirm_recv - Process received Confirm options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is server (1) or client (0)
+ */
+static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len, const bool server)
{
- struct dccp_opt_pend *opt, *next;
+ u8 *plist, plen, type = dccp_feat_type(feat);
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
- list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending,
- dccpop_node) {
- BUG_ON(opt->dccpop_val == NULL);
- kfree(opt->dccpop_val);
+ dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
- if (opt->dccpop_sc != NULL) {
- BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
- kfree(opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc);
- }
+ if (entry == NULL) { /* nothing queued: ignore or handle error */
+ if (is_mandatory && type == FEAT_UNKNOWN)
+ return DCCP_RESET_CODE_MANDATORY_ERROR;
+
+ if (!local && type == FEAT_NN) /* 6.3.2 */
+ goto confirmation_failed;
+ return 0;
+ }
+
+ if (entry->state != FEAT_CHANGING) /* 6.6.2 */
+ return 0;
- kfree(opt);
+ if (len == 0) {
+ if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
+ goto confirmation_failed;
+ /*
+ * Empty Confirm during connection setup: this means reverting
+ * to the `old' value, which in this case is the default. Since
+ * we handle default values automatically when no other values
+ * have been set, we revert to the old value by removing this
+ * entry from the list.
+ */
+ dccp_feat_list_pop(entry);
+ return 0;
}
- INIT_LIST_HEAD(&dmsk->dccpms_pending);
- list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
- BUG_ON(opt == NULL);
- if (opt->dccpop_val != NULL)
- kfree(opt->dccpop_val);
- kfree(opt);
+ if (type == FEAT_NN) {
+ if (len > sizeof(entry->val.nn))
+ goto confirmation_failed;
+
+ if (entry->val.nn == dccp_decode_value_var(val, len))
+ goto confirmation_succeeded;
+
+ DCCP_WARN("Bogus Confirm for non-existing value\n");
+ goto confirmation_failed;
}
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_clean);
+ /*
+ * Parsing SP Confirms: the first element of @val is the preferred
+ * SP value which the peer confirms, the remainder depends on @len.
+ * Note that only the confirmed value need to be a valid SP value.
+ */
+ if (!dccp_feat_is_valid_sp_val(feat, *val))
+ goto confirmation_failed;
+
+ if (len == 1) { /* peer didn't supply a preference list */
+ plist = val;
+ plen = len;
+ } else { /* preferred value + preference list */
+ plist = val + 1;
+ plen = len - 1;
+ }
+
+ /* Check whether the peer got the reconciliation right (6.6.8) */
+ if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
+ DCCP_WARN("Confirm selected the wrong value %u\n", *val);
+ return DCCP_RESET_CODE_OPTION_ERROR;
+ }
+ entry->val.sp.vec[0] = *val;
+
+confirmation_succeeded:
+ entry->state = FEAT_STABLE;
+ return 0;
+
+confirmation_failed:
+ DCCP_WARN("Confirmation failed\n");
+ return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
-/* this is to be called only when a listening sock creates its child. It is
- * assumed by the function---the confirm is not duplicated, but rather it is
- * "passed on".
+/**
+ * dccp_feat_handle_nn_established - Fast-path reception of NN options
+ * @sk: socket of an established DCCP connection
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
+ * @feat: NN number, one of %dccp_feature_numbers
+ * @val: NN value
+ * @len: length of @val in bytes
+ *
+ * This function combines the functionality of change_recv/confirm_recv, with
+ * the following differences (reset codes are the same):
+ * - cleanup after receiving the Confirm;
+ * - values are directly activated after successful parsing;
+ * - deliberately restricted to NN features.
+ * The restriction to NN features is essential since SP features can have non-
+ * predictable outcomes (depending on the remote configuration), and are inter-
+ * dependent (CCIDs for instance cause further dependencies).
*/
-int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
+static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len)
{
- struct dccp_minisock *olddmsk = dccp_msk(oldsk);
- struct dccp_minisock *newdmsk = dccp_msk(newsk);
- struct dccp_opt_pend *opt;
- int rc = 0;
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry;
+ u8 type = dccp_feat_type(feat);
+ dccp_feat_val fval;
- INIT_LIST_HEAD(&newdmsk->dccpms_pending);
- INIT_LIST_HEAD(&newdmsk->dccpms_conf);
+ dccp_feat_print_opt(opt, feat, val, len, mandatory);
- list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) {
- struct dccp_opt_pend *newopt;
- /* copy the value of the option */
- u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC);
+ /* Ignore non-mandatory unknown and non-NN features */
+ if (type == FEAT_UNKNOWN) {
+ if (local && !mandatory)
+ return 0;
+ goto fast_path_unknown;
+ } else if (type != FEAT_NN) {
+ return 0;
+ }
- if (val == NULL)
- goto out_clean;
+ /*
+ * We don't accept empty Confirms, since in fast-path feature
+ * negotiation the values are enabled immediately after sending
+ * the Change option.
+ * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
+ */
+ if (len == 0 || len > sizeof(fval.nn))
+ goto fast_path_unknown;
- newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC);
- if (newopt == NULL) {
- kfree(val);
- goto out_clean;
- }
+ if (opt == DCCPO_CHANGE_L) {
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto fast_path_unknown;
+
+ if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
+ dccp_feat_activate(sk, feat, local, &fval))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ /* set the `Ack Pending' flag to piggyback a Confirm */
+ inet_csk_schedule_ack(sk);
- /* insert the option */
- newopt->dccpop_val = val;
- list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
+ } else if (opt == DCCPO_CONFIRM_R) {
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL || entry->state != FEAT_CHANGING)
+ return 0;
- /* XXX what happens with backlogs and multiple connections at
- * once...
+ fval.nn = dccp_decode_value_var(val, len);
+ /*
+ * Just ignore a value that doesn't match our current value.
+ * If the option changes twice within two RTTs, then at least
+ * one CONFIRM will be received for the old value after a
+ * new CHANGE was sent.
*/
- /* the master socket no longer needs to worry about confirms */
- opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */
+ if (fval.nn != entry->val.nn)
+ return 0;
- /* reset state for a new socket */
- opt->dccpop_conf = 0;
- }
+ /* Only activate after receiving the Confirm option (6.6.1). */
+ dccp_feat_activate(sk, feat, local, &fval);
- /* XXX not doing anything about the conf queue */
+ /* It has been confirmed - so remove the entry */
+ dccp_feat_list_pop(entry);
-out:
- return rc;
+ } else {
+ DCCP_WARN("Received illegal option %u\n", opt);
+ goto fast_path_failed;
+ }
+ return 0;
-out_clean:
- dccp_feat_clean(newdmsk);
- rc = -ENOMEM;
- goto out;
-}
+fast_path_unknown:
+ if (!mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
-EXPORT_SYMBOL_GPL(dccp_feat_clone);
+fast_path_failed:
+ return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
-static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat,
- u8 *val, u8 len)
+/**
+ * dccp_feat_parse_options - Process Feature-Negotiation Options
+ * @sk: for general use and used by the client during connection setup
+ * @dreq: used by the server during connection setup
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: value contents of @opt
+ * @len: length of @val in bytes
+ *
+ * Returns 0 on success, a Reset code for ending the connection otherwise.
+ */
+int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+ u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
{
- int rc = -ENOMEM;
- u8 *copy = kmemdup(val, len, GFP_KERNEL);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+ bool server = false;
- if (copy != NULL) {
- rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL);
- if (rc)
- kfree(copy);
+ switch (sk->sk_state) {
+ /*
+ * Negotiation during connection setup
+ */
+ case DCCP_LISTEN:
+ server = true; /* fall through */
+ case DCCP_REQUESTING:
+ switch (opt) {
+ case DCCPO_CHANGE_L:
+ case DCCPO_CHANGE_R:
+ return dccp_feat_change_recv(fn, mandatory, opt, feat,
+ val, len, server);
+ case DCCPO_CONFIRM_R:
+ case DCCPO_CONFIRM_L:
+ return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
+ val, len, server);
+ }
+ break;
+ /*
+ * Support for exchanging NN options on an established connection.
+ */
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
+ val, len);
}
- return rc;
+ return 0; /* ignore FN options in all other states */
}
-int dccp_feat_init(struct dccp_minisock *dmsk)
+/**
+ * dccp_feat_init - Seed feature negotiation with host-specific defaults
+ * This initialises global defaults, depending on the value of the sysctls.
+ * These can later be overridden by registering changes via setsockopt calls.
+ * The last link in the chain is finalise_settings, to make sure that between
+ * here and the start of actual feature negotiation no inconsistencies enter.
+ *
+ * All features not appearing below use either defaults or are otherwise
+ * later adjusted through dccp_feat_finalise_settings().
+ */
+int dccp_feat_init(struct sock *sk)
{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ u8 on = 1, off = 0;
int rc;
+ struct {
+ u8 *val;
+ u8 len;
+ } tx, rx;
+
+ /* Non-negotiable (NN) features */
+ rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
+ sysctl_dccp_sequence_window);
+ if (rc)
+ return rc;
- INIT_LIST_HEAD(&dmsk->dccpms_pending);
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
+ /* Server-priority (SP) features */
- /* CCID L */
- rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID,
- &dmsk->dccpms_tx_ccid, 1);
+ /* Advertise that short seqnos are not supported (7.6.1) */
+ rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
if (rc)
- goto out;
+ return rc;
+
+ /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
+ rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
+ if (rc)
+ return rc;
+
+ /*
+ * We advertise the available list of CCIDs and reorder according to
+ * preferences, to avoid failure resulting from negotiating different
+ * singleton values (which always leads to failure).
+ * These settings can still (later) be overridden via sockopts.
+ */
+ if (ccid_get_builtin_ccids(&tx.val, &tx.len) ||
+ ccid_get_builtin_ccids(&rx.val, &rx.len))
+ return -ENOBUFS;
+
+ if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
+ !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
+ goto free_ccid_lists;
- /* CCID R */
- rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID,
- &dmsk->dccpms_rx_ccid, 1);
+ rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
if (rc)
- goto out;
+ goto free_ccid_lists;
- /* Ack ratio */
- rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
- &dmsk->dccpms_ack_ratio, 1);
-out:
+ rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
+
+free_ccid_lists:
+ kfree(tx.val);
+ kfree(rx.val);
return rc;
}
-EXPORT_SYMBOL_GPL(dccp_feat_init);
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-const char *dccp_feat_typename(const u8 type)
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
{
- switch(type) {
- case DCCPO_CHANGE_L: return("ChangeL");
- case DCCPO_CONFIRM_L: return("ConfirmL");
- case DCCPO_CHANGE_R: return("ChangeR");
- case DCCPO_CONFIRM_R: return("ConfirmR");
- /* the following case must not appear in feature negotation */
- default: dccp_pr_debug("unknown type %d [BUG!]\n", type);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *cur, *next;
+ int idx;
+ dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
+ [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
+ };
+
+ list_for_each_entry(cur, fn_list, node) {
+ /*
+ * An empty Confirm means that either an unknown feature type
+ * or an invalid value was present. In the first case there is
+ * nothing to activate, in the other the default value is used.
+ */
+ if (cur->empty_confirm)
+ continue;
+
+ idx = dccp_feat_index(cur->feat_num);
+ if (idx < 0) {
+ DCCP_BUG("Unknown feature %u", cur->feat_num);
+ goto activation_failed;
+ }
+ if (cur->state != FEAT_STABLE) {
+ DCCP_CRIT("Negotiation of %s %s failed in state %s",
+ cur->is_local ? "local" : "remote",
+ dccp_feat_fname(cur->feat_num),
+ dccp_feat_sname[cur->state]);
+ goto activation_failed;
+ }
+ fvals[idx][cur->is_local] = &cur->val;
}
- return NULL;
-}
-EXPORT_SYMBOL_GPL(dccp_feat_typename);
+ /*
+ * Activate in decreasing order of index, so that the CCIDs are always
+ * activated as the last feature. This avoids the case where a CCID
+ * relies on the initialisation of one or more features that it depends
+ * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
+ */
+ for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
+ if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
+ __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
+ DCCP_CRIT("Could not activate %d", idx);
+ goto activation_failed;
+ }
-const char *dccp_feat_name(const u8 feat)
-{
- static const char *feature_names[] = {
- [DCCPF_RESERVED] = "Reserved",
- [DCCPF_CCID] = "CCID",
- [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
- [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
- [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
- [DCCPF_ACK_RATIO] = "Ack Ratio",
- [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
- [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
- [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
- [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
- };
- if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
- return feature_names[DCCPF_RESERVED];
+ /* Clean up Change options which have been confirmed already */
+ list_for_each_entry_safe(cur, next, fn_list, node)
+ if (!cur->needs_confirm)
+ dccp_feat_list_pop(cur);
- if (feat >= DCCPF_MIN_CCID_SPECIFIC)
- return "CCID-specific";
+ dccp_pr_debug("Activation OK\n");
+ return 0;
- return feature_names[feat];
+activation_failed:
+ /*
+ * We clean up everything that may have been allocated, since
+ * it is difficult to track at which stage negotiation failed.
+ * This is ok, since all allocation functions below are robust
+ * against NULL arguments.
+ */
+ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ return -1;
}
-
-EXPORT_SYMBOL_GPL(dccp_feat_name);
-#endif /* CONFIG_IP_DCCP_DEBUG */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index e272222c7ac..0e75cebb218 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -3,38 +3,135 @@
/*
* net/dccp/feat.h
*
- * An implementation of the DCCP protocol
+ * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
* Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
*/
-
#include <linux/types.h>
#include "dccp.h"
-#ifdef CONFIG_IP_DCCP_DEBUG
-extern const char *dccp_feat_typename(const u8 type);
-extern const char *dccp_feat_name(const u8 feat);
+/*
+ * Known limit values
+ */
+/* Ack Ratio takes 2-byte integer values (11.3) */
+#define DCCPF_ACK_RATIO_MAX 0xFFFF
+/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
+#define DCCPF_SEQ_WMIN 32
+#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
+/* Maximum number of SP values that fit in a single (Confirm) option */
+#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
+
+enum dccp_feat_type {
+ FEAT_AT_RX = 1, /* located at RX side of half-connection */
+ FEAT_AT_TX = 2, /* located at TX side of half-connection */
+ FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
+ FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
+ FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
+};
+
+enum dccp_feat_state {
+ FEAT_DEFAULT = 0, /* using default values from 6.4 */
+ FEAT_INITIALISING, /* feature is being initialised */
+ FEAT_CHANGING, /* Change sent but not confirmed yet */
+ FEAT_UNSTABLE, /* local modification in state CHANGING */
+ FEAT_STABLE /* both ends (think they) agree */
+};
+
+/**
+ * dccp_feat_val - Container for SP or NN feature values
+ * @nn: single NN value
+ * @sp.vec: single SP value plus optional preference list
+ * @sp.len: length of @sp.vec in bytes
+ */
+typedef union {
+ u64 nn;
+ struct {
+ u8 *vec;
+ u8 len;
+ } sp;
+} dccp_feat_val;
+
+/**
+ * struct feat_entry - Data structure to perform feature negotiation
+ * @val: feature's current value (SP features may have preference list)
+ * @state: feature's current state
+ * @feat_num: one of %dccp_feature_numbers
+ * @needs_mandatory: whether Mandatory options should be sent
+ * @needs_confirm: whether to send a Confirm instead of a Change
+ * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
+ * @is_local: feature location (1) or feature-remote (0)
+ * @node: list pointers, entries arranged in FIFO order
+ */
+struct dccp_feat_entry {
+ dccp_feat_val val;
+ enum dccp_feat_state state:8;
+ u8 feat_num;
+
+ bool needs_mandatory,
+ needs_confirm,
+ empty_confirm,
+ is_local;
+
+ struct list_head node;
+};
-static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
+static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
{
- dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type),
- dccp_feat_name(feat), feat, val);
+ if (entry->needs_confirm)
+ return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
+ return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
}
-#else
-#define dccp_feat_debug(type, feat, val)
-#endif /* CONFIG_IP_DCCP_DEBUG */
-
-extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
- u8 *val, u8 len, gfp_t gfp);
-extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
- u8 *val, u8 len);
-extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
- u8 *val, u8 len);
-extern void dccp_feat_clean(struct dccp_minisock *dmsk);
-extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
-extern int dccp_feat_init(struct dccp_minisock *dmsk);
+/**
+ * struct ccid_dependency - Track changes resulting from choosing a CCID
+ * @dependent_feat: one of %dccp_feature_numbers
+ * @is_local: local (1) or remote (0) @dependent_feat
+ * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
+ * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
+ */
+struct ccid_dependency {
+ u8 dependent_feat;
+ bool is_local:1,
+ is_mandatory:1;
+ u8 val;
+};
+
+/*
+ * Sysctls to seed defaults for feature negotiation
+ */
+extern unsigned long sysctl_dccp_sequence_window;
+extern int sysctl_dccp_rx_ccid;
+extern int sysctl_dccp_tx_ccid;
+
+int dccp_feat_init(struct sock *sk);
+void dccp_feat_initialise_sysctls(void);
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len);
+int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
+ u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
+int dccp_feat_clone_list(struct list_head const *, struct list_head *);
+
+/*
+ * Encoding variable-length options and their maximum length.
+ *
+ * This affects NN options (SP options are all u8) and other variable-length
+ * options (see table 3 in RFC 4340). The limit is currently given the Sequence
+ * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
+ * options consume less than 6 bytes (timestamps are 4 bytes).
+ * When updating this constant (e.g. due to new internet drafts / RFCs), make
+ * sure that you also update all code which refers to it.
+ */
+#define DCCP_OPTVAL_MAXLEN 6
+
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
+u64 dccp_decode_value_var(const u8 *bf, const u8 len);
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
+
+int dccp_insert_option_mandatory(struct sk_buff *skb);
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
+ bool repeat_first);
#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 08392ed86c2..3c8ec7d4a34 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -12,6 +12,7 @@
#include <linux/dccp.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/sock.h>
@@ -27,7 +28,7 @@ static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
static void dccp_fin(struct sock *sk, struct sk_buff *skb)
@@ -123,9 +124,9 @@ static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
return queued;
}
-static u8 dccp_reset_code_convert(const u8 code)
+static u16 dccp_reset_code_convert(const u8 code)
{
- const u8 error_code[] = {
+ const u16 error_code[] = {
[DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
[DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
[DCCP_RESET_CODE_ABORTED] = ECONNRESET,
@@ -147,7 +148,7 @@ static u8 dccp_reset_code_convert(const u8 code)
static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
{
- u8 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
+ u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
sk->sk_err = err;
@@ -159,13 +160,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
}
-static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
{
- struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
- if (dccp_msk(sk)->dccpms_send_ack_vector)
- dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ if (av == NULL)
+ return;
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ dccp_ackvec_input(av, skb);
}
static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -238,7 +241,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
dccp_update_gsr(sk, seqno);
if (dh->dccph_type != DCCP_PKT_SYNC &&
- (ackno != DCCP_PKT_WITHOUT_ACK_SEQ))
+ ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+ after48(ackno, dp->dccps_gar))
dp->dccps_gar = ackno;
} else {
unsigned long now = jiffies;
@@ -256,9 +260,9 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
*/
if (time_before(now, (dp->dccps_rate_last +
sysctl_dccp_sync_ratelimit)))
- return 0;
+ return -1;
- DCCP_WARN("DCCP: Step 6 failed for %s packet, "
+ DCCP_WARN("Step 6 failed for %s packet, "
"(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
"(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
"sending SYNC...\n", dccp_packet_name(dh->dccph_type),
@@ -281,7 +285,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
}
static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+ const struct dccp_hdr *dh, const unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -362,24 +366,15 @@ discard:
}
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+ const struct dccp_hdr *dh, const unsigned int len)
{
- struct dccp_sock *dp = dccp_sk(sk);
-
if (dccp_check_seqno(sk, skb))
goto discard;
if (dccp_parse_options(sk, NULL, skb))
- goto discard;
-
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
+ return 1;
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
+ dccp_handle_ackvec_processing(sk, skb);
dccp_deliver_input_to_ccids(sk, skb);
return __dccp_rcv_established(sk, skb, dh, len);
@@ -393,7 +388,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_established);
static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
- const unsigned len)
+ const unsigned int len)
{
/*
* Step 4: Prepare sequence numbers in REQUEST
@@ -411,50 +406,43 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct dccp_sock *dp = dccp_sk(sk);
long tstamp = dccp_timestamp();
- /* Stop the REQUEST timer */
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
- BUG_TRAP(sk->sk_send_head != NULL);
- __kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
-
if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
dp->dccps_awl, dp->dccps_awh)) {
dccp_pr_debug("invalid ackno: S.AWL=%llu, "
- "P.ackno=%llu, S.AWH=%llu \n",
+ "P.ackno=%llu, S.AWH=%llu\n",
(unsigned long long)dp->dccps_awl,
(unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
(unsigned long long)dp->dccps_awh);
goto out_invalid_packet;
}
+ /*
+ * If option processing (Step 8) failed, return 1 here so that
+ * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
+ * the option type and is set in dccp_parse_options().
+ */
if (dccp_parse_options(sk, NULL, skb))
- goto out_invalid_packet;
+ return 1;
- /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
dp->dccps_options_received.dccpor_timestamp_echo));
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto out_invalid_packet; /* FIXME: change error code */
+ /* Stop the REQUEST timer */
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ WARN_ON(sk->sk_send_head == NULL);
+ kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
- dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_update_gsr(sk, dp->dccps_isr);
/*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- *
- * AWL was adjusted in dccp_v4_connect -acme
+ * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+ * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+ * is done as part of activating the feature values below, since
+ * these settings depend on the local/remote Sequence Window
+ * features, which were undefined or not confirmed until now.
*/
- dccp_set_seqno(&dp->dccps_swl,
- max48(dp->dccps_swl, dp->dccps_isr));
+ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -475,6 +463,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
*/
dccp_set_state(sk, DCCP_PARTOPEN);
+ /*
+ * If feature negotiation was successful, activate features now;
+ * an activation failure means that this host could not activate
+ * one ore more features (e.g. insufficient memory), which would
+ * leave at least one feature in an undefined state.
+ */
+ if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
+ goto unable_to_proceed;
+
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
@@ -509,13 +506,25 @@ out_invalid_packet:
/* dccp_v4_do_rcv will send a reset */
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
return 1;
+
+unable_to_proceed:
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
+ /*
+ * We mark this socket as no longer usable, so that the loop in
+ * dccp_sendmsg() terminates and the application gets notified.
+ */
+ dccp_set_state(sk, DCCP_CLOSED);
+ sk->sk_err = ECOMM;
+ return 1;
}
static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
- const unsigned len)
+ const unsigned int len)
{
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
int queued = 0;
switch (dh->dccph_type) {
@@ -540,7 +549,14 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
if (sk->sk_state == DCCP_PARTOPEN)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+ if (likely(sample)) {
+ long delta = dccp_timestamp() - sample;
+
+ dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
+ }
+
+ dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_set_state(sk, DCCP_OPEN);
if (dh->dccph_type == DCCP_PKT_DATAACK ||
@@ -556,7 +572,7 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
}
int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len)
+ struct dccp_hdr *dh, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -590,8 +606,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
skb) < 0)
return 1;
-
- /* FIXME: do congestion control initialization */
goto discard;
}
if (dh->dccph_type == DCCP_PKT_RESET)
@@ -600,30 +614,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Caller (dccp_v4_do_rcv) will send Reset */
dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
return 1;
+ } else if (sk->sk_state == DCCP_CLOSED) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
}
- if (sk->sk_state != DCCP_REQUESTING) {
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- /*
- * Step 8: Process options and mark acknowledgeable
- */
- if (dccp_parse_options(sk, NULL, skb))
- goto discard;
-
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
+ /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
+ if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
+ goto discard;
- dccp_deliver_input_to_ccids(sk, skb);
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_RESPONSE) ||
+ (dp->dccps_role == DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_REQUEST) ||
+ (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+ goto discard;
}
+ /* Step 8: Process options */
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
/*
* Step 9: Process Reset
* If P.type == Reset,
@@ -631,44 +651,22 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* S.state := TIMEWAIT
* Set TIMEWAIT timer
* Drop packet and return
- */
+ */
if (dh->dccph_type == DCCP_PKT_RESET) {
dccp_rcv_reset(sk, skb);
return 0;
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_RESPONSE) ||
- (dp->dccps_role == DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_REQUEST) ||
- (sk->sk_state == DCCP_RESPOND &&
- dh->dccph_type == DCCP_PKT_DATA)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+ } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
if (dccp_rcv_closereq(sk, skb))
return 0;
goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+ } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
if (dccp_rcv_close(sk, skb))
return 0;
goto discard;
}
switch (sk->sk_state) {
- case DCCP_CLOSED:
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
-
case DCCP_REQUESTING:
- /* FIXME: do congestion control initialization */
-
queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
if (queued >= 0)
return queued;
@@ -676,8 +674,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
__kfree_skb(skb);
return 0;
- case DCCP_RESPOND:
case DCCP_PARTOPEN:
+ /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
+ dccp_handle_ackvec_processing(sk, skb);
+ dccp_deliver_input_to_ccids(sk, skb);
+ /* fall through */
+ case DCCP_RESPOND:
queued = dccp_rcv_respond_partopen_state_process(sk, skb,
dh, len);
break;
@@ -708,6 +710,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
/**
* dccp_sample_rtt - Validate and finalise computation of RTT sample
* @delta: number of microseconds between packet and acknowledgment
+ *
* The routine is kept generic to work in different contexts. It should be
* called immediately when the ACK used for the RTT sample arrives.
*/
@@ -727,5 +730,3 @@ u32 dccp_sample_rtt(struct sock *sk, long delta)
return delta;
}
-
-EXPORT_SYMBOL_GPL(dccp_sample_rtt);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 474075adbde..6ca645c4b48 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -12,6 +12,7 @@
#include <linux/dccp.h>
#include <linux/icmp.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/random.h>
@@ -25,6 +26,7 @@
#include <net/timewait_sock.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include "ackvec.h"
#include "ccid.h"
@@ -32,21 +34,22 @@
#include "feat.h"
/*
- * This is the global socket data structure used for responding to
+ * The per-net dccp.v4_ctl_sk socket is used for responding to
* the Out-of-the-blue (OOTB) packets. A control sock will be created
* for this socket at the initialization time.
*/
-static struct socket *dccp_v4_ctl_socket;
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct rtable *rt;
+ __be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
- int tmp;
+ struct flowi4 *fl4;
+ struct rtable *rt;
int err;
+ struct ip_options_rcu *inet_opt;
dp->dccps_role = DCCP_ROLE_CLIENT;
@@ -57,37 +60,43 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
- if (inet->opt != NULL && inet->opt->srr) {
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt != NULL && inet_opt->opt.srr) {
if (daddr == 0)
return -EINVAL;
- nexthop = inet->opt->faddr;
+ nexthop = inet_opt->opt.faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_DCCP,
- inet->sport, usin->sin_port, sk, 1);
- if (tmp < 0)
- return tmp;
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
- if (inet->opt == NULL || !inet->opt->srr)
- daddr = rt->rt_dst;
+ if (inet_opt == NULL || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
- if (inet->saddr == 0)
- inet->saddr = rt->rt_src;
- inet->rcv_saddr = inet->saddr;
+ if (inet->inet_saddr == 0)
+ inet->inet_saddr = fl4->saddr;
+ inet->inet_rcv_saddr = inet->inet_saddr;
- inet->dport = usin->sin_port;
- inet->daddr = daddr;
+ inet->inet_dport = usin->sin_port;
+ inet->inet_daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet->opt != NULL)
- inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
/*
* Socket identity is still unknown (sport may be zero).
* However we set state to DCCP_REQUESTING and not releasing socket
@@ -99,17 +108,21 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err != 0)
goto failure;
- err = ip_route_newports(&rt, IPPROTO_DCCP, inet->sport, inet->dport,
- sk);
- if (err != 0)
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto failure;
-
+ }
/* OK, now commit destination to socket. */
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
- dp->dccps_iss = secure_dccp_sequence_number(inet->saddr, inet->daddr,
- inet->sport, inet->dport);
- inet->id = dp->dccps_iss ^ jiffies;
+ dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ inet->inet_dport);
+ inet->inet_id = dp->dccps_iss ^ jiffies;
err = dccp_connect(sk);
rt = NULL;
@@ -124,7 +137,7 @@ failure:
dccp_set_state(sk, DCCP_CLOSED);
ip_rt_put(rt);
sk->sk_route_caps = 0;
- inet->dport = 0;
+ inet->inet_dport = 0;
goto out;
}
@@ -148,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
if (sk->sk_state == DCCP_LISTEN)
return;
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
return;
- dst->ops->update_pmtu(dst, mtu);
-
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
@@ -168,6 +174,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
dccp_sync_mss(sk, mtu);
@@ -182,6 +189,14 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
} /* else let the usual retransmit timer handle it */
}
+static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
/*
* This routine is called by the ICMP module when it gets some sort of error
* condition. If err < 0 then the socket should be closed and the error
@@ -197,8 +212,8 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
static void dccp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (struct iphdr *)skb->data;
- const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
- (iph->ihl << 2));
+ const u8 offset = iph->ihl << 2;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
struct dccp_sock *dp;
struct inet_sock *inet;
const int type = icmp_hdr(skb)->type;
@@ -206,16 +221,19 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
struct sock *sk;
__u64 seq;
int err;
+ struct net *net = dev_net(skb->dev);
- if (skb->len < (iph->ihl << 2) + 8) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ if (skb->len < offset + sizeof(*dh) ||
+ skb->len < offset + __dccp_basic_hdr_len(dh)) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return;
}
- sk = inet_lookup(&init_net, &dccp_hashinfo, iph->daddr, dh->dccph_dport,
- iph->saddr, dh->dccph_sport, inet_iif(skb));
+ sk = inet_lookup(net, &dccp_hashinfo,
+ iph->daddr, dh->dccph_dport,
+ iph->saddr, dh->dccph_sport, inet_iif(skb));
if (sk == NULL) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return;
}
@@ -229,7 +247,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
* servers this needs to be solved differently.
*/
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == DCCP_CLOSED)
goto out;
@@ -237,12 +255,15 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
dp = dccp_sk(sk);
seq = dccp_hdr_seq(dh);
if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
- !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
switch (type) {
+ case ICMP_REDIRECT:
+ dccp_do_redirect(skb, sk);
+ goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
@@ -282,10 +303,11 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
* ICMPs are not backlogged, hence we cannot get an established
* socket here.
*/
- BUG_TRAP(!req->sk);
+ WARN_ON(req->sk);
- if (seq != dccp_rsk(req)->dreq_iss) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ if (!between48(seq, dccp_rsk(req)->dreq_iss,
+ dccp_rsk(req)->dreq_gss)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
/*
@@ -344,13 +366,15 @@ static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
}
-void dccp_v4_send_check(struct sock *sk, int unused, struct sk_buff *skb)
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
const struct inet_sock *inet = inet_sk(sk);
struct dccp_hdr *dh = dccp_hdr(skb);
dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb, inet->saddr, inet->daddr);
+ dh->dccph_checksum = dccp_v4_csum_finish(skb,
+ inet->inet_saddr,
+ inet->inet_daddr);
}
EXPORT_SYMBOL_GPL(dccp_v4_send_check);
@@ -380,39 +404,45 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto exit;
-
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto exit;
-
- sk_setup_caps(newsk, dst);
+ goto exit_nonewsk;
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->daddr = ireq->rmt_addr;
- newinet->rcv_saddr = ireq->loc_addr;
- newinet->saddr = ireq->loc_addr;
- newinet->opt = ireq->opt;
+ newinet->inet_daddr = ireq->ir_rmt_addr;
+ newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ newinet->inet_opt = ireq->opt;
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
- newinet->id = jiffies;
+ newinet->inet_id = jiffies;
+
+ if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
+ goto put_and_exit;
+
+ sk_setup_caps(newsk, dst);
dccp_sync_mss(newsk, dst_mtu(dst));
- __inet_hash_nolisten(newsk);
- __inet_inherit_port(sk, newsk);
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ __inet_hash_nolisten(newsk, NULL);
return newsk;
exit_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-exit:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
dst_release(dst);
+exit:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto exit;
}
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
@@ -430,7 +460,7 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
if (req != NULL)
return dccp_check_req(sk, skb, req, prev);
- nsk = inet_lookup_established(&init_net, &dccp_hashinfo,
+ nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
iph->saddr, dh->dccph_sport,
iph->daddr, dh->dccph_dport,
inet_iif(skb));
@@ -446,40 +476,40 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
return sk;
}
-static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
+static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
struct rtable *rt;
- struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif,
- .nl_u = { .ip4_u =
- { .daddr = ip_hdr(skb)->saddr,
- .saddr = ip_hdr(skb)->daddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = dccp_hdr(skb)->dccph_dport,
- .dport = dccp_hdr(skb)->dccph_sport }
- }
- };
-
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .flowi4_oif = inet_iif(skb),
+ .daddr = iph->saddr,
+ .saddr = iph->daddr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = sk->sk_protocol,
+ .fl4_sport = dccp_hdr(skb)->dccph_dport,
+ .fl4_dport = dccp_hdr(skb)->dccph_sport,
+ };
+
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
- return &rt->u.dst;
+ return &rt->dst;
}
-static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
{
int err = -1;
struct sk_buff *skb;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
- /* First, grab a route. */
-
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
+ dst = inet_csk_route_req(sk, &fl4, req);
+ if (dst == NULL)
goto out;
skb = dccp_make_response(sk, dst, req);
@@ -487,11 +517,10 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
const struct inet_request_sock *ireq = inet_rsk(req);
struct dccp_hdr *dh = dccp_hdr(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
- ireq->rmt_addr);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
+ dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
}
@@ -507,31 +536,33 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
const struct iphdr *rxiph;
struct sk_buff *skb;
struct dst_entry *dst;
+ struct net *net = dev_net(skb_dst(rxskb)->dev);
+ struct sock *ctl_sk = net->dccp.v4_ctl_sk;
/* Never send a reset in response to a reset. */
if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
return;
- if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+ if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
return;
- dst = dccp_v4_route_skb(dccp_v4_ctl_socket->sk, rxskb);
+ dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
if (dst == NULL)
return;
- skb = dccp_ctl_make_reset(dccp_v4_ctl_socket, rxskb);
+ skb = dccp_ctl_make_reset(ctl_sk, rxskb);
if (skb == NULL)
goto out;
rxiph = ip_hdr(rxskb);
dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
rxiph->daddr);
- skb->dst = dst_clone(dst);
+ skb_dst_set(skb, dst_clone(dst));
- bh_lock_sock(dccp_v4_ctl_socket->sk);
- err = ip_build_and_send_pkt(skb, dccp_v4_ctl_socket->sk,
+ bh_lock_sock(ctl_sk);
+ err = ip_build_and_send_pkt(skb, ctl_sk,
rxiph->daddr, rxiph->saddr, NULL);
- bh_unlock_sock(dccp_v4_ctl_socket->sk);
+ bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
@@ -543,9 +574,15 @@ out:
static void dccp_v4_reqsk_destructor(struct request_sock *req)
{
+ dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
kfree(inet_rsk(req)->opt);
}
+void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+{
+}
+EXPORT_SYMBOL(dccp_syn_ack_timeout);
+
static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct dccp_request_sock),
@@ -553,6 +590,7 @@ static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
.send_ack = dccp_reqsk_send_ack,
.destructor = dccp_v4_reqsk_destructor,
.send_reset = dccp_v4_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
};
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -564,8 +602,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
/* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
- if (((struct rtable *)skb->dst)->rt_flags &
- (RTCF_BROADCAST | RTCF_MULTICAST))
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
return 0; /* discard, don't send a reset here */
if (dccp_bad_service_code(sk, service)) {
@@ -590,11 +627,12 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = reqsk_alloc(&dccp_request_sock_ops);
+ req = inet_reqsk_alloc(&dccp_request_sock_ops);
if (req == NULL)
goto drop;
- dccp_reqsk_init(req, skb);
+ if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+ goto drop_and_free;
dreq = dccp_rsk(req);
if (dccp_parse_options(sk, dreq, skb))
@@ -604,23 +642,23 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
ireq = inet_rsk(req);
- ireq->loc_addr = ip_hdr(skb)->daddr;
- ireq->rmt_addr = ip_hdr(skb)->saddr;
- ireq->opt = NULL;
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
/*
* Step 3: Process LISTEN state
*
* Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
*
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
+ * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
*/
dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
dreq->dreq_iss = dccp_v4_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v4_send_response(sk, req, NULL))
+ if (dccp_v4_send_response(sk, req))
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
@@ -740,8 +778,8 @@ int dccp_invalid_packet(struct sk_buff *skb)
* If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
* has short sequence numbers), drop packet and return
*/
- if (dh->dccph_type >= DCCP_PKT_DATA &&
- dh->dccph_type <= DCCP_PKT_DATAACK && dh->dccph_x == 0) {
+ if ((dh->dccph_type < DCCP_PKT_DATA ||
+ dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
dccp_packet_name(dh->dccph_type));
return 1;
@@ -792,12 +830,10 @@ static int dccp_v4_rcv(struct sk_buff *skb)
DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
- dccp_pr_debug("%8.8s "
- "src=%u.%u.%u.%u@%-5d "
- "dst=%u.%u.%u.%u@%-5d seq=%llu",
+ dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
dccp_packet_name(dh->dccph_type),
- NIPQUAD(iph->saddr), ntohs(dh->dccph_sport),
- NIPQUAD(iph->daddr), ntohs(dh->dccph_dport),
+ &iph->saddr, ntohs(dh->dccph_sport),
+ &iph->daddr, ntohs(dh->dccph_dport),
(unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
if (dccp_packet_without_ack(skb)) {
@@ -811,9 +847,8 @@ static int dccp_v4_rcv(struct sk_buff *skb)
/* Step 2:
* Look up flow ID in table and get corresponding socket */
- sk = __inet_lookup(&init_net, &dccp_hashinfo,
- iph->saddr, dh->dccph_sport,
- iph->daddr, dh->dccph_dport, inet_iif(skb));
+ sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+ dh->dccph_sport, dh->dccph_dport);
/*
* Step 2:
* If no socket ...
@@ -881,7 +916,7 @@ discard_and_relse:
goto discard_it;
}
-static struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
.queue_xmit = ip_queue_xmit,
.send_check = dccp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -917,8 +952,6 @@ static struct timewait_sock_ops dccp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct inet_timewait_sock),
};
-DEFINE_PROTO_INUSE(dccp_v4)
-
static struct proto dccp_v4_prot = {
.name = "DCCP",
.owner = THIS_MODULE,
@@ -941,20 +974,22 @@ static struct proto dccp_v4_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
.rsk_prot = &dccp_request_sock_ops,
.twsk_prot = &dccp_timewait_sock_ops,
- .hashinfo = &dccp_hashinfo,
+ .h.hashinfo = &dccp_hashinfo,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_dccp_setsockopt,
.compat_getsockopt = compat_dccp_getsockopt,
#endif
- REF_PROTO_INUSE(dccp_v4)
};
-static struct net_protocol dccp_v4_protocol = {
+static const struct net_protocol dccp_v4_protocol = {
.handler = dccp_v4_rcv,
.err_handler = dccp_v4_err,
.no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
};
static const struct proto_ops inet_dccp_ops = {
@@ -989,11 +1024,28 @@ static struct inet_protosw dccp_v4_protosw = {
.protocol = IPPROTO_DCCP,
.prot = &dccp_v4_prot,
.ops = &inet_dccp_ops,
- .capability = -1,
- .no_check = 0,
.flags = INET_PROTOSW_ICSK,
};
+static int __net_init dccp_v4_init_net(struct net *net)
+{
+ if (dccp_hashinfo.bhash == NULL)
+ return -ESOCKTNOSUPPORT;
+
+ return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET,
+ SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v4_exit_net(struct net *net)
+{
+ inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
+}
+
+static struct pernet_operations dccp_v4_ops = {
+ .init = dccp_v4_init_net,
+ .exit = dccp_v4_exit_net,
+};
+
static int __init dccp_v4_init(void)
{
int err = proto_register(&dccp_v4_prot, 1);
@@ -1007,13 +1059,12 @@ static int __init dccp_v4_init(void)
inet_register_protosw(&dccp_v4_protosw);
- err = inet_csk_ctl_sock_create(&dccp_v4_ctl_socket, PF_INET,
- SOCK_DCCP, IPPROTO_DCCP);
+ err = register_pernet_subsys(&dccp_v4_ops);
if (err)
- goto out_unregister_protosw;
+ goto out_destroy_ctl_sock;
out:
return err;
-out_unregister_protosw:
+out_destroy_ctl_sock:
inet_unregister_protosw(&dccp_v4_protosw);
inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
out_proto_unregister:
@@ -1023,6 +1074,7 @@ out_proto_unregister:
static void __exit dccp_v4_exit(void)
{
+ unregister_pernet_subsys(&dccp_v4_ops);
inet_unregister_protosw(&dccp_v4_protosw);
inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
proto_unregister(&dccp_v4_prot);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 490333d47c7..4db3c2a1679 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/xfrm.h>
#include <net/addrconf.h>
@@ -28,16 +29,16 @@
#include <net/transp_v6.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include "dccp.h"
#include "ipv6.h"
#include "feat.h"
-/* Socket used for sending RSTs and ACKs */
-static struct socket *dccp_v6_ctl_socket;
+/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */
-static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
-static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
static void dccp_v6_hash(struct sock *sk)
{
@@ -47,36 +48,29 @@ static void dccp_v6_hash(struct sock *sk)
return;
}
local_bh_disable();
- __inet6_hash(sk);
+ __inet6_hash(sk, NULL);
local_bh_enable();
}
}
/* add pseudo-header to DCCP checksum stored in skb->csum */
static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
- struct in6_addr *saddr,
- struct in6_addr *daddr)
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
{
return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
}
-static inline void dccp_v6_send_check(struct sock *sk, int unused_value,
- struct sk_buff *skb)
+static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_hdr *dh = dccp_hdr(skb);
dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &np->daddr);
+ dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
}
-static inline __u32 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport )
-{
- return secure_tcpv6_sequence_number(saddr, daddr, sport, dport);
-}
-
-static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
+static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
{
return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32,
@@ -86,20 +80,31 @@ static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
}
static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __be32 info)
+ u8 type, u8 code, int offset, __be32 info)
{
- struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ struct dccp_sock *dp;
struct ipv6_pinfo *np;
struct sock *sk;
int err;
__u64 seq;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->len < offset + sizeof(*dh) ||
+ skb->len < offset + __dccp_basic_hdr_len(dh)) {
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return;
+ }
- sk = inet6_lookup(&init_net, &dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
+ sk = inet6_lookup(net, &dccp_hashinfo,
+ &hdr->daddr, dh->dccph_dport,
&hdr->saddr, dh->dccph_sport, inet6_iif(skb));
if (sk == NULL) {
- ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
return;
}
@@ -110,64 +115,51 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == DCCP_CLOSED)
goto out;
+ dp = dccp_sk(sk);
+ seq = dccp_hdr_seq(dh);
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+ !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
np = inet6_sk(sk);
+ if (type == NDISC_REDIRECT) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ goto out;
+ }
+
if (type == ICMPV6_PKT_TOOBIG) {
struct dst_entry *dst = NULL;
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+
if (sock_owned_by_user(sk))
goto out;
if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
goto out;
- /* icmp should have updated the destination cache entry */
- dst = __sk_dst_check(sk, np->dst_cookie);
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
-
- /* BUGGG_FUTURE: Again, it is not clear how
- to handle rthdr case. Ignore this complexity
- for now.
- */
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
- security_sk_classify_flow(sk, &fl);
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- err = xfrm_lookup(&dst, &fl, sk, 0);
- if (err < 0) {
- sk->sk_err_soft = -err;
- goto out;
- }
- } else
- dst_hold(dst);
+ dst = inet6_csk_update_pmtu(sk, ntohl(info));
+ if (!dst)
+ goto out;
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
dccp_sync_mss(sk, dst_mtu(dst));
- } /* else let the usual retransmit timer handle it */
- dst_release(dst);
goto out;
}
icmpv6_err_convert(type, code, &err);
- seq = dccp_hdr_seq(dh);
/* Might be for an request_sock */
switch (sk->sk_state) {
struct request_sock *req, **prev;
@@ -185,10 +177,11 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
* ICMPs are not backlogged, hence we cannot get an established
* socket here.
*/
- BUG_TRAP(req->sk == NULL);
+ WARN_ON(req->sk != NULL);
- if (seq != dccp_rsk(req)->dreq_iss) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ if (!between48(seq, dccp_rsk(req)->dreq_iss,
+ dccp_rsk(req)->dreq_gss)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -224,48 +217,34 @@ out:
}
-static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
- struct ipv6_txoptions *opt = NULL;
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
int err = -1;
+ struct dst_entry *dst;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = ireq6->iif;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
- security_req_classify_flow(req, &fl);
-
- if (dst == NULL) {
- opt = np->opt;
-
- if (opt != NULL && opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowlabel = 0;
+ fl6.flowi6_oif = ireq->ir_iif;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- err = xfrm_lookup(&dst, &fl, sk, 0);
- if (err < 0)
- goto done;
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto done;
}
skb = dccp_make_response(sk, dst, req);
@@ -273,31 +252,32 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
struct dccp_hdr *dh = dccp_hdr(skb);
dh->dccph_checksum = dccp_v6_csum_finish(skb,
- &ireq6->loc_addr,
- &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt, 0);
+ &ireq->ir_v6_loc_addr,
+ &ireq->ir_v6_rmt_addr);
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
err = net_xmit_eval(err);
}
done:
- if (opt != NULL && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
dst_release(dst);
return err;
}
static void dccp_v6_reqsk_destructor(struct request_sock *req)
{
- if (inet6_rsk(req)->pktopts != NULL)
- kfree_skb(inet6_rsk(req)->pktopts);
+ dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree_skb(inet_rsk(req)->pktopts);
}
static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
{
- struct ipv6hdr *rxip6h;
+ const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
- struct flowi fl;
+ struct flowi6 fl6;
+ struct net *net = dev_net(skb_dst(rxskb)->dev);
+ struct sock *ctl_sk = net->dccp.v6_ctl_sk;
+ struct dst_entry *dst;
if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
return;
@@ -305,7 +285,7 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
if (!ipv6_unicast_destination(rxskb))
return;
- skb = dccp_ctl_make_reset(dccp_v6_ctl_socket, rxskb);
+ skb = dccp_ctl_make_reset(ctl_sk, rxskb);
if (skb == NULL)
return;
@@ -313,24 +293,24 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
&rxip6h->daddr);
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &rxip6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &rxip6h->daddr);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = rxip6h->saddr;
+ fl6.saddr = rxip6h->daddr;
- fl.proto = IPPROTO_DCCP;
- fl.oif = inet6_iif(rxskb);
- fl.fl_ip_dport = dccp_hdr(skb)->dccph_dport;
- fl.fl_ip_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, &fl);
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.flowi6_oif = inet6_iif(rxskb);
+ fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
+ fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
+ security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
- if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
- if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
- ip6_xmit(dccp_v6_ctl_socket->sk, skb, &fl, NULL, 0);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- return;
- }
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(skb, dst);
+ ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ return;
}
kfree_skb(skb);
@@ -343,6 +323,7 @@ static struct request_sock_ops dccp6_request_sock_ops = {
.send_ack = dccp_reqsk_send_ack,
.destructor = dccp_v6_reqsk_destructor,
.send_reset = dccp_v6_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
};
static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
@@ -360,7 +341,7 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
if (req != NULL)
return dccp_check_req(sk, skb, req, prev);
- nsk = __inet6_lookup_established(&init_net, &dccp_hashinfo,
+ nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
&iph->saddr, dh->dccph_sport,
&iph->daddr, ntohs(dh->dccph_dport),
inet6_iif(skb));
@@ -380,7 +361,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct request_sock *req;
struct dccp_request_sock *dreq;
- struct inet6_request_sock *ireq6;
+ struct inet_request_sock *ireq;
struct ipv6_pinfo *np = inet6_sk(sk);
const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -409,7 +390,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (req == NULL)
goto drop;
- dccp_reqsk_init(req, skb);
+ if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+ goto drop_and_free;
dreq = dccp_rsk(req);
if (dccp_parse_options(sk, dreq, skb))
@@ -418,37 +400,37 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
- ireq6 = inet6_rsk(req);
- ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
- ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
- ireq6->pktopts = NULL;
+ ireq = inet_rsk(req);
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
if (ipv6_opt_accepted(sk, skb) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
atomic_inc(&skb->users);
- ireq6->pktopts = skb;
+ ireq->pktopts = skb;
}
- ireq6->iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = sk->sk_bound_dev_if;
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- ireq6->iif = inet6_iif(skb);
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = inet6_iif(skb);
/*
* Step 3: Process LISTEN state
*
* Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
*
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
+ * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
*/
dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
dreq->dreq_iss = dccp_v6_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v6_send_response(sk, req, NULL))
+ if (dccp_v6_send_response(sk, req))
goto drop_and_free;
inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
@@ -466,13 +448,11 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
struct request_sock *req,
struct dst_entry *dst)
{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct inet_sock *newinet;
- struct dccp_sock *newdp;
struct dccp6_sock *newdp6;
struct sock *newsk;
- struct ipv6_txoptions *opt;
if (skb->protocol == htons(ETH_P_IP)) {
/*
@@ -483,20 +463,17 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
return NULL;
newdp6 = (struct dccp6_sock *)newsk;
- newdp = dccp_sk(newsk);
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
- newinet->daddr);
+ ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr);
- ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
- newinet->saddr);
+ ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
- ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+ newsk->sk_v6_rcv_saddr = newnp->saddr;
inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
@@ -520,44 +497,32 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
return newsk;
}
- opt = np->opt;
if (sk_acceptq_is_full(sk))
goto out_overflow;
if (dst == NULL) {
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- if (opt != NULL && opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
- security_sk_classify_flow(sk, &fl);
-
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst))
goto out;
}
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto out;
+ goto out_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -571,31 +536,30 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newdp6 = (struct dccp6_sock *)newsk;
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
- newdp = dccp_sk(newsk);
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
- ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
- ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
- newsk->sk_bound_dev_if = ireq6->iif;
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newnp->saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
/* Now IPv6 options...
First: no IPv4 options.
*/
- newinet->opt = NULL;
+ newinet->inet_opt = NULL;
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
/* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (ireq6->pktopts != NULL) {
- newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
- kfree_skb(ireq6->pktopts);
- ireq6->pktopts = NULL;
+ if (ireq->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
if (newnp->pktoptions)
skb_set_owner_r(newnp->pktoptions, newsk);
}
@@ -609,11 +573,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
* Yes, keeping reference count would be much more clever, but we make
* one more one thing there: reattach optmem to newsk.
*/
- if (opt != NULL) {
- newnp->opt = ipv6_dup_options(newsk, opt);
- if (opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- }
+ if (np->opt != NULL)
+ newnp->opt = ipv6_dup_options(newsk, np->opt);
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newnp->opt != NULL)
@@ -622,20 +583,24 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
dccp_sync_mss(newsk, dst_mtu(dst));
- newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+ newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+ newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
- __inet6_hash(newsk);
- inet_inherit_port(sk, newsk);
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto out;
+ }
+ __inet6_hash(newsk, NULL);
return newsk;
out_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-out:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
- if (opt != NULL && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
dst_release(dst);
+out:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
}
@@ -791,10 +756,8 @@ static int dccp_v6_rcv(struct sk_buff *skb)
/* Step 2:
* Look up flow ID in table and get corresponding socket */
- sk = __inet6_lookup(&init_net, &dccp_hashinfo, &ipv6_hdr(skb)->saddr,
- dh->dccph_sport,
- &ipv6_hdr(skb)->daddr, ntohs(dh->dccph_dport),
- inet6_iif(skb));
+ sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+ dh->dccph_sport, dh->dccph_dport);
/*
* Step 2:
* If no socket ...
@@ -867,8 +830,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p = NULL, final;
- struct flowi fl;
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
int err;
@@ -881,17 +844,16 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+ fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
- ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
fl6_sock_release(flowlabel);
}
}
@@ -924,8 +886,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
return -EINVAL;
}
- ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
+ sk->sk_v6_daddr = usin->sin6_addr;
+ np->flow_label = fl6.flowlabel;
/*
* DCCP over IPv4
@@ -952,58 +914,40 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
icsk->icsk_af_ops = &dccp_ipv6_af_ops;
sk->sk_backlog_rcv = dccp_v6_do_rcv;
goto failure;
- } else {
- ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
- inet->saddr);
- ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
- inet->rcv_saddr);
}
+ ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+ ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr);
return err;
}
- if (!ipv6_addr_any(&np->rcv_saddr))
- saddr = &np->rcv_saddr;
-
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->sport;
- security_sk_classify_flow(sk, &fl);
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ saddr = &sk->sk_v6_rcv_saddr;
- if (np->opt != NULL && np->opt->srcrt != NULL) {
- const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto failure;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto failure;
}
if (saddr == NULL) {
- saddr = &fl.fl6_src;
- ipv6_addr_copy(&np->rcv_saddr, saddr);
+ saddr = &fl6.saddr;
+ sk->sk_v6_rcv_saddr = *saddr;
}
/* set the source address */
- ipv6_addr_copy(&np->saddr, saddr);
- inet->rcv_saddr = LOOPBACK4_IPV6;
+ np->saddr = *saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
__ip6_dst_store(sk, dst, NULL, NULL);
@@ -1012,7 +956,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
np->opt->opt_nflen);
- inet->dport = usin->sin6_port;
+ inet->inet_dport = usin->sin6_port;
dccp_set_state(sk, DCCP_REQUESTING);
err = inet6_hash_connect(&dccp_death_row, sk);
@@ -1020,8 +964,9 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
goto late_failure;
dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->sport, inet->dport);
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport);
err = dccp_connect(sk);
if (err)
goto late_failure;
@@ -1032,12 +977,12 @@ late_failure:
dccp_set_state(sk, DCCP_CLOSED);
__sk_dst_reset(sk);
failure:
- inet->dport = 0;
+ inet->inet_dport = 0;
sk->sk_route_caps = 0;
return err;
}
-static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
.queue_xmit = inet6_csk_xmit,
.send_check = dccp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
@@ -1058,7 +1003,7 @@ static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
/*
* DCCP over IPv4 via INET6 API
*/
-static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = dccp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -1092,18 +1037,16 @@ static int dccp_v6_init_sock(struct sock *sk)
return err;
}
-static int dccp_v6_destroy_sock(struct sock *sk)
+static void dccp_v6_destroy_sock(struct sock *sk)
{
dccp_destroy_sock(sk);
- return inet6_destroy_sock(sk);
+ inet6_destroy_sock(sk);
}
static struct timewait_sock_ops dccp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct dccp6_timewait_sock),
};
-DEFINE_PROTO_INUSE(dccp_v6)
-
static struct proto dccp_v6_prot = {
.name = "DCCPv6",
.owner = THIS_MODULE,
@@ -1126,23 +1069,23 @@ static struct proto dccp_v6_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp6_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
.rsk_prot = &dccp6_request_sock_ops,
.twsk_prot = &dccp6_timewait_sock_ops,
- .hashinfo = &dccp_hashinfo,
+ .h.hashinfo = &dccp_hashinfo,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_dccp_setsockopt,
.compat_getsockopt = compat_dccp_getsockopt,
#endif
- REF_PROTO_INUSE(dccp_v6)
};
-static struct inet6_protocol dccp_v6_protocol = {
+static const struct inet6_protocol dccp_v6_protocol = {
.handler = dccp_v6_rcv,
.err_handler = dccp_v6_err,
.flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
};
-static struct proto_ops inet6_dccp_ops = {
+static const struct proto_ops inet6_dccp_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -1172,10 +1115,28 @@ static struct inet_protosw dccp_v6_protosw = {
.protocol = IPPROTO_DCCP,
.prot = &dccp_v6_prot,
.ops = &inet6_dccp_ops,
- .capability = -1,
.flags = INET_PROTOSW_ICSK,
};
+static int __net_init dccp_v6_init_net(struct net *net)
+{
+ if (dccp_hashinfo.bhash == NULL)
+ return -ESOCKTNOSUPPORT;
+
+ return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6,
+ SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v6_exit_net(struct net *net)
+{
+ inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
+}
+
+static struct pernet_operations dccp_v6_ops = {
+ .init = dccp_v6_init_net,
+ .exit = dccp_v6_exit_net,
+};
+
static int __init dccp_v6_init(void)
{
int err = proto_register(&dccp_v6_prot, 1);
@@ -1189,13 +1150,13 @@ static int __init dccp_v6_init(void)
inet6_register_protosw(&dccp_v6_protosw);
- err = inet_csk_ctl_sock_create(&dccp_v6_ctl_socket, PF_INET6,
- SOCK_DCCP, IPPROTO_DCCP);
+ err = register_pernet_subsys(&dccp_v6_ops);
if (err != 0)
- goto out_unregister_protosw;
+ goto out_destroy_ctl_sock;
out:
return err;
-out_unregister_protosw:
+
+out_destroy_ctl_sock:
inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
inet6_unregister_protosw(&dccp_v6_protosw);
out_unregister_proto:
@@ -1205,6 +1166,7 @@ out_unregister_proto:
static void __exit dccp_v6_exit(void)
{
+ unregister_pernet_subsys(&dccp_v6_ops);
inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
inet6_unregister_protosw(&dccp_v6_protosw);
proto_unregister(&dccp_v6_prot);
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
index 6eef81fdbe5..af259e15e7f 100644
--- a/net/dccp/ipv6.h
+++ b/net/dccp/ipv6.h
@@ -25,12 +25,10 @@ struct dccp6_sock {
struct dccp6_request_sock {
struct dccp_request_sock dccp;
- struct inet6_request_sock inet6;
};
struct dccp6_timewait_sock {
struct inet_timewait_sock inet;
- struct inet6_timewait_sock tw6;
};
#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 027d1814e1a..c69eb9c4fbb 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -11,6 +11,7 @@
*/
#include <linux/dccp.h>
+#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/timer.h>
@@ -42,16 +43,6 @@ struct inet_timewait_death_row dccp_death_row = {
EXPORT_SYMBOL_GPL(dccp_death_row);
-void dccp_minisock_init(struct dccp_minisock *dmsk)
-{
- dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
- dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid;
- dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid;
- dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio;
- dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
- dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count;
-}
-
void dccp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
@@ -62,15 +53,12 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
const struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -109,13 +97,12 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
* (* Generate a new socket and switch to that socket *)
* Set S := new socket for this port pair
*/
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
if (newsk != NULL) {
- const struct dccp_request_sock *dreq = dccp_rsk(req);
+ struct dccp_request_sock *dreq = dccp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct dccp_sock *newdp = dccp_sk(newsk);
- struct dccp_minisock *newdmsk = dccp_msk(newsk);
newdp->dccps_role = DCCP_ROLE_SERVER;
newdp->dccps_hc_rx_ackvec = NULL;
@@ -125,65 +112,34 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
- if (dccp_feat_clone(sk, newsk))
- goto out_free;
-
- if (newdmsk->dccpms_send_ack_vector) {
- newdp->dccps_hc_rx_ackvec =
- dccp_ackvec_alloc(GFP_ATOMIC);
- if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
- goto out_free;
- }
-
- newdp->dccps_hc_rx_ccid =
- ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
- newsk, GFP_ATOMIC);
- newdp->dccps_hc_tx_ccid =
- ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
- newsk, GFP_ATOMIC);
- if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
- newdp->dccps_hc_tx_ccid == NULL)) {
- dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
- ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
- ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
-out_free:
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- return NULL;
- }
-
+ INIT_LIST_HEAD(&newdp->dccps_featneg);
/*
* Step 3: Process LISTEN state
*
* Choose S.ISS (initial seqno) or set from Init Cookies
* Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Set S.ISR, S.GSR from packet (or Init Cookies)
+ *
+ * Setting AWL/AWH and SWL/SWH happens as part of the feature
+ * activation below, as these windows all depend on the local
+ * and remote Sequence Window feature values (7.5.2).
*/
-
- /* See dccp_v4_conn_request */
- newdmsk->dccpms_sequence_window = req->rcv_wnd;
-
- newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
- dccp_update_gsr(newsk, dreq->dreq_isr);
-
newdp->dccps_iss = dreq->dreq_iss;
- dccp_update_gss(newsk, dreq->dreq_iss);
+ newdp->dccps_gss = dreq->dreq_gss;
+ newdp->dccps_gar = newdp->dccps_iss;
+ newdp->dccps_isr = dreq->dreq_isr;
+ newdp->dccps_gsr = dreq->dreq_gsr;
/*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
+ * Activate features: initialise CCIDs, sequence windows etc.
*/
- dccp_set_seqno(&newdp->dccps_swl,
- max48(newdp->dccps_swl, newdp->dccps_isr));
- dccp_set_seqno(&newdp->dccps_awl,
- max48(newdp->dccps_awl, newdp->dccps_iss));
-
+ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ return NULL;
+ }
dccp_init_xmit_timers(newsk);
DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
@@ -207,16 +163,15 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
/* Check for retransmitted REQUEST */
if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
- if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) {
+ if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
dccp_pr_debug("Retransmitted REQUEST\n");
- dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
/*
* Send another RESPONSE packet
* To protect against Request floods, increment retrans
* counter (backoff, monitored by dccp_response_timer).
*/
- req->retrans++;
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ inet_rtx_syn_ack(sk, req);
}
/* Network Duplicate, discard packet */
return NULL;
@@ -229,12 +184,14 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
goto drop;
/* Invalid ACK */
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dreq->dreq_iss) {
+ if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dreq->dreq_iss, dreq->dreq_gss)) {
dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
- "dreq_iss=%llu\n",
+ "dreq_iss=%llu, dreq_gss=%llu\n",
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long) dreq->dreq_iss);
+ (unsigned long long) dreq->dreq_iss,
+ (unsigned long long) dreq->dreq_gss);
goto drop;
}
@@ -280,13 +237,13 @@ int dccp_child_process(struct sock *parent, struct sock *child,
/* Wakeup parent, send SIGIO */
if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
- sk_add_backlog(child, skb);
+ __sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
@@ -296,21 +253,26 @@ int dccp_child_process(struct sock *parent, struct sock *child,
EXPORT_SYMBOL_GPL(dccp_child_process);
-void dccp_reqsk_send_ack(struct sk_buff *skb, struct request_sock *rsk)
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk)
{
DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
}
EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
-void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
+int dccp_reqsk_init(struct request_sock *req,
+ struct dccp_sock const *dp, struct sk_buff const *skb)
{
struct dccp_request_sock *dreq = dccp_rsk(req);
- inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
- inet_rsk(req)->acked = 0;
- req->rcv_wnd = sysctl_dccp_feat_sequence_window;
- dreq->dreq_timestamp_echo = 0;
+ inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
+ inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
+ inet_rsk(req)->acked = 0;
+ dreq->dreq_timestamp_echo = 0;
+
+ /* inherit feature negotiation options from listening socket */
+ return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
}
EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index d2a84a2fece..9bce31886bd 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -23,23 +23,20 @@
#include "dccp.h"
#include "feat.h"
-int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
-int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO;
-int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
-int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT;
-
-static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
+u64 dccp_decode_value_var(const u8 *bf, const u8 len)
{
- u32 value = 0;
+ u64 value = 0;
+ if (len >= DCCP_OPTVAL_MAXLEN)
+ value += ((u64)*bf++) << 40;
+ if (len > 4)
+ value += ((u64)*bf++) << 32;
if (len > 3)
- value += *bf++ << 24;
+ value += ((u64)*bf++) << 24;
if (len > 2)
- value += *bf++ << 16;
+ value += ((u64)*bf++) << 16;
if (len > 1)
- value += *bf++ << 8;
+ value += ((u64)*bf++) << 8;
if (len > 0)
value += *bf;
@@ -57,14 +54,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
struct dccp_sock *dp = dccp_sk(sk);
const struct dccp_hdr *dh = dccp_hdr(skb);
const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
- u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
unsigned char *opt_ptr = options;
const unsigned char *opt_end = (unsigned char *)dh +
(dh->dccph_doff * 4);
struct dccp_options_received *opt_recv = &dp->dccps_options_received;
unsigned char opt, len;
- unsigned char *value;
+ unsigned char *uninitialized_var(value);
u32 elapsed_time;
__be32 opt_val;
int rc;
@@ -81,11 +77,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
/* Check if this isn't a single byte option */
if (opt > DCCPO_MAX_RESERVED) {
if (opt_ptr == opt_end)
- goto out_invalid_option;
+ goto out_nonsensical_length;
len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
+ if (len < 2)
+ goto out_nonsensical_length;
/*
* Remove the type and len fields, leaving
* just the value size
@@ -95,21 +91,16 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
opt_ptr += len;
if (opt_ptr > opt_end)
- goto out_invalid_option;
+ goto out_nonsensical_length;
}
/*
- * CCID-Specific Options (from RFC 4340, sec. 10.3):
- *
- * Option numbers 128 through 191 are for options sent from the
- * HC-Sender to the HC-Receiver; option numbers 192 through 255
- * are for options sent from the HC-Receiver to the HC-Sender.
- *
* CCID-specific options are ignored during connection setup, as
* negotiation may still be in progress (see RFC 4340, 10.3).
- *
+ * The same applies to Ack Vectors, as these depend on the CCID.
*/
- if (dreq != NULL && opt >= 128)
+ if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
+ opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
goto ignore_option;
switch (opt) {
@@ -122,50 +113,22 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
mandatory = 1;
break;
case DCCPO_NDP_COUNT:
- if (len > 3)
+ if (len > 6)
goto out_invalid_option;
opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
- dccp_pr_debug("%s rx opt: NDP count=%d\n", dccp_role(sk),
- opt_recv->dccpor_ndp);
- break;
- case DCCPO_CHANGE_L:
- /* fall through */
- case DCCPO_CHANGE_R:
- if (pkt_type == DCCP_PKT_DATA)
- break;
- if (len < 2)
- goto out_invalid_option;
- rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
- len - 1);
- /*
- * When there is a change error, change_recv is
- * responsible for dealing with it. i.e. reply with an
- * empty confirm.
- * If the change was mandatory, then we need to die.
- */
- if (rc && mandatory)
- goto out_invalid_option;
- break;
- case DCCPO_CONFIRM_L:
- /* fall through */
- case DCCPO_CONFIRM_R:
- if (pkt_type == DCCP_PKT_DATA)
- break;
- if (len < 2) /* FIXME this disallows empty confirm */
- goto out_invalid_option;
- if (dccp_feat_confirm_recv(sk, opt, *value,
- value + 1, len - 1))
- goto out_invalid_option;
+ dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
+ (unsigned long long)opt_recv->dccpor_ndp);
break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
+ case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
+ if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
break;
-
- if (dccp_msk(sk)->dccpms_send_ack_vector &&
- dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
+ if (len == 0)
goto out_invalid_option;
+ rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
+ *value, value + 1, len - 1);
+ if (rc)
+ goto out_featneg_failed;
break;
case DCCPO_TIMESTAMP:
if (len != 4)
@@ -193,6 +156,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
dccp_role(sk), ntohl(opt_val),
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ /* schedule an Ack in case this sender is quiescent */
+ inet_csk_schedule_ack(sk);
break;
case DCCPO_TIMESTAMP_ECHO:
if (len != 4 && len != 6 && len != 8)
@@ -249,23 +214,25 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
dccp_role(sk), elapsed_time);
break;
- case 128 ... 191: {
- const u16 idx = value - options;
-
+ case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
- case 192 ... 255: {
- const u16 idx = value - options;
-
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
+ break;
+ /*
+ * Ack vectors are processed by the TX CCID if it is
+ * interested. The RX CCID need not parse Ack Vectors,
+ * since it is only interested in clearing old state.
+ * Fall through.
+ */
+ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
default:
DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -281,20 +248,30 @@ ignore_option:
if (mandatory)
goto out_invalid_option;
+out_nonsensical_length:
+ /* RFC 4340, 5.8: ignore option and all remaining option space */
return 0;
out_invalid_option:
DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
- DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
+ rc = DCCP_RESET_CODE_OPTION_ERROR;
+out_featneg_failed:
+ DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
+ DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
return -1;
}
EXPORT_SYMBOL_GPL(dccp_parse_options);
-static void dccp_encode_value_var(const u32 value, unsigned char *to,
- const unsigned int len)
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
{
+ if (len >= DCCP_OPTVAL_MAXLEN)
+ *to++ = (value & 0xFF0000000000ull) >> 40;
+ if (len > 4)
+ *to++ = (value & 0xFF00000000ull) >> 32;
if (len > 3)
*to++ = (value & 0xFF000000) >> 24;
if (len > 2)
@@ -305,14 +282,15 @@ static void dccp_encode_value_var(const u32 value, unsigned char *to,
*to++ = (value & 0xFF);
}
-static inline int dccp_ndp_len(const int ndp)
+static inline u8 dccp_ndp_len(const u64 ndp)
{
- return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3;
+ if (likely(ndp <= 0xFF))
+ return 1;
+ return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
}
-int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- const unsigned char option,
- const void *value, const unsigned char len)
+int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
+ const void *value, const unsigned char len)
{
unsigned char *to;
@@ -334,7 +312,7 @@ EXPORT_SYMBOL_GPL(dccp_insert_option);
static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
- int ndp = dp->dccps_ndp_count;
+ u64 ndp = dp->dccps_ndp_count;
if (dccp_non_data_packet(skb))
++dp->dccps_ndp_count;
@@ -365,49 +343,15 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time)
return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
}
-int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb,
- u32 elapsed_time)
-{
- const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- const int len = 2 + elapsed_time_len;
- unsigned char *to;
-
- if (elapsed_time_len == 0)
- return 0;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- *to++ = DCCPO_ELAPSED_TIME;
- *to++ = len;
-
- if (elapsed_time_len == 2) {
- const __be16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else {
- const __be32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
- }
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
-
-int dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
{
__be32 now = htonl(dccp_timestamp());
/* yes this will overflow but that is the point as we want a
* 10 usec 32 bit timer which mean it wraps every 11.9 hours */
- return dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+ return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
}
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
struct dccp_request_sock *dreq,
struct sk_buff *skb)
@@ -452,92 +396,141 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
return 0;
}
-static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
- u8 *val, u8 len)
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
{
- u8 *to;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const u16 buflen = dccp_ackvec_buflen(av);
+ /* Figure out how many options do we need to represent the ackvec */
+ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+ u16 len = buflen + 2 * nr_opts;
+ u8 i, nonce = 0;
+ const unsigned char *tail, *from;
+ unsigned char *to;
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) {
- DCCP_WARN("packet too small for feature %d option!\n", feat);
+ if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+ dccp_packet_name(dcb->dccpd_type));
return -1;
}
+ /*
+ * Since Ack Vectors are variable-length, we can not always predict
+ * their size. To catch exception cases where the space is running out
+ * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+ */
+ if (len > DCCPAV_MIN_OPTLEN &&
+ len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+ DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+ "MPS=%u ==> reduce payload size?\n", len, skb->len,
+ dcb->dccpd_opt_len, dp->dccps_mss_cache);
+ dp->dccps_sync_scheduled = 1;
+ return 0;
+ }
+ dcb->dccpd_opt_len += len;
- DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3;
+ to = skb_push(skb, len);
+ len = buflen;
+ from = av->av_buf + av->av_buf_head;
+ tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
- to = skb_push(skb, len + 3);
- *to++ = type;
- *to++ = len + 3;
- *to++ = feat;
+ for (i = 0; i < nr_opts; ++i) {
+ int copylen = len;
- if (len)
- memcpy(to, val, len);
+ if (len > DCCP_SINGLE_OPT_MAXLEN)
+ copylen = DCCP_SINGLE_OPT_MAXLEN;
- dccp_pr_debug("%s(%s (%d), ...), length %d\n",
- dccp_feat_typename(type),
- dccp_feat_name(feat), feat, len);
+ /*
+ * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+ * its type; ack_nonce is the sum of all individual buf_nonce's.
+ */
+ nonce ^= av->av_buf_nonce[i];
+
+ *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+ *to++ = copylen + 2;
+
+ /* Check if buf_head wraps */
+ if (from + copylen > tail) {
+ const u16 tailsize = tail - from;
+
+ memcpy(to, from, tailsize);
+ to += tailsize;
+ len -= tailsize;
+ copylen -= tailsize;
+ from = av->av_buf;
+ }
+
+ memcpy(to, from, copylen);
+ from += copylen;
+ to += copylen;
+ len -= copylen;
+ }
+ /*
+ * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+ */
+ if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
+ return -ENOBUFS;
return 0;
}
-static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_insert_option_mandatory - Mandatory option (5.8.2)
+ * Note that since we are using skb_push, this function needs to be called
+ * _after_ inserting the option it is supposed to influence (stack order).
+ */
+int dccp_insert_option_mandatory(struct sk_buff *skb)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
- struct dccp_opt_pend *opt, *next;
- int change = 0;
-
- /* confirm any options [NN opts] */
- list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
- dccp_insert_feat_opt(skb, opt->dccpop_type,
- opt->dccpop_feat, opt->dccpop_val,
- opt->dccpop_len);
- /* fear empty confirms */
- if (opt->dccpop_val)
- kfree(opt->dccpop_val);
- kfree(opt);
- }
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
-
- /* see which features we need to send */
- list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
- /* see if we need to send any confirm */
- if (opt->dccpop_sc) {
- dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
- opt->dccpop_feat,
- opt->dccpop_sc->dccpoc_val,
- opt->dccpop_sc->dccpoc_len);
-
- BUG_ON(!opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc->dccpoc_val);
- kfree(opt->dccpop_sc);
- opt->dccpop_sc = NULL;
- }
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
+ return -1;
- /* any option not confirmed, re-send it */
- if (!opt->dccpop_conf) {
- dccp_insert_feat_opt(skb, opt->dccpop_type,
- opt->dccpop_feat, opt->dccpop_val,
- opt->dccpop_len);
- change++;
- }
+ DCCP_SKB_CB(skb)->dccpd_opt_len++;
+ *skb_push(skb, 1) = DCCPO_MANDATORY;
+ return 0;
+}
+
+/**
+ * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
+ * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
+ * @feat: one out of %dccp_feature_numbers
+ * @val: NN value or SP array (preferred element first) to copy
+ * @len: true length of @val in bytes (excluding first element repetition)
+ * @repeat_first: whether to copy the first element of @val twice
+ *
+ * The last argument is used to construct Confirm options, where the preferred
+ * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
+ * lists are kept such that the preferred entry is always first, so we only need
+ * to copy twice, and avoid the overhead of cloning into a bigger array.
+ */
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+ u8 *val, u8 len, bool repeat_first)
+{
+ u8 tot_len, *to;
+
+ /* take the `Feature' field and possible repetition into account */
+ if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
+ DCCP_WARN("length %u for feature %u too large\n", len, feat);
+ return -1;
}
- /* Retransmit timer.
- * If this is the master listening sock, we don't set a timer on it. It
- * should be fine because if the dude doesn't receive our RESPONSE
- * [which will contain the CHANGE] he will send another REQUEST which
- * will "retrnasmit" the change.
- */
- if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
- dccp_pr_debug("reset feat negotiation timer %p\n", sk);
+ if (unlikely(val == NULL || len == 0))
+ len = repeat_first = false;
+ tot_len = 3 + repeat_first + len;
- /* XXX don't reset the timer on re-transmissions. I.e. reset it
- * only when sending new stuff i guess. Currently the timer
- * never backs off because on re-transmission it just resets it!
- */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("packet too small for feature %d option!\n", feat);
+ return -1;
}
+ DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
+ to = skb_push(skb, tot_len);
+ *to++ = type;
+ *to++ = tot_len;
+ *to++ = feat;
+
+ if (repeat_first)
+ *to++ = *val;
+ if (len)
+ memcpy(to, val, len);
return 0;
}
@@ -556,19 +549,30 @@ static void dccp_insert_option_padding(struct sk_buff *skb)
int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
- if (dmsk->dccpms_send_ndp_count &&
- dccp_insert_option_ndp(sk, skb))
+ if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
return -1;
- if (!dccp_packet_without_ack(skb)) {
- if (dmsk->dccpms_send_ack_vector &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
- dccp_insert_option_ackvec(sk, skb))
+ if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
+
+ /* Feature Negotiation */
+ if (dccp_feat_insert_opts(dp, NULL, skb))
return -1;
+
+ if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
+ /*
+ * Obtain RTT sample from Request/Response exchange.
+ * This is currently used for TFRC initialisation.
+ */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
+ } else if (dccp_ackvec_pending(sk) &&
+ dccp_insert_option_ackvec(sk, skb)) {
+ return -1;
+ }
}
if (dp->dccps_hc_rx_insert_options) {
@@ -577,21 +581,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
dp->dccps_hc_rx_insert_options = 0;
}
- /* Feature negotiation */
- /* Data packets can't do feat negotiation */
- if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
- DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK &&
- dccp_insert_options_feat(sk, skb))
- return -1;
-
- /*
- * Obtain RTT sample from Request/Response exchange.
- * This is currently used in CCID 3 initialisation.
- */
- if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST &&
- dccp_insert_option_timestamp(sk, skb))
- return -1;
-
if (dp->dccps_timestamp_echo != 0 &&
dccp_insert_option_timestamp_echo(dp, NULL, skb))
return -1;
@@ -604,6 +593,13 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
{
DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+ if (dccp_feat_insert_opts(NULL, dreq, skb))
+ return -1;
+
+ /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
if (dreq->dreq_timestamp_echo != 0 &&
dccp_insert_option_timestamp_echo(NULL, dreq, skb))
return -1;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 3b763db3d86..0248e8a3460 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -13,6 +13,7 @@
#include <linux/dccp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/inet_sock.h>
#include <net/sock.h>
@@ -26,11 +27,13 @@ static inline void dccp_event_ack_sent(struct sock *sk)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
-static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
+static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
skb_set_owner_w(skb, sk);
WARN_ON(sk->sk_send_head);
sk->sk_send_head = skb;
+ return skb_clone(sk->sk_send_head, gfp_any());
}
/*
@@ -42,7 +45,7 @@ static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (likely(skb != NULL)) {
- const struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -53,8 +56,11 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
dccp_packet_hdr_len(dcb->dccpd_type);
int err, set_ack = 1;
u64 ackno = dp->dccps_gsr;
-
- dccp_inc_seqno(&dp->dccps_gss);
+ /*
+ * Increment GSS here already in case the option code needs it.
+ * Update GSS for real only if option processing below succeeds.
+ */
+ dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
switch (dcb->dccpd_type) {
case DCCP_PKT_DATA:
@@ -66,6 +72,9 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
case DCCP_PKT_REQUEST:
set_ack = 0;
+ /* Use ISS on the first (non-retransmitted) Request. */
+ if (icsk->icsk_retransmits == 0)
+ dcb->dccpd_seq = dp->dccps_iss;
/* fall through */
case DCCP_PKT_SYNC:
@@ -84,8 +93,6 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
break;
}
- dcb->dccpd_seq = dp->dccps_gss;
-
if (dccp_insert_options(sk, skb)) {
kfree_skb(skb);
return -EPROTO;
@@ -95,15 +102,15 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
/* Build DCCP header and checksum it. */
dh = dccp_zeroed_hdr(skb, dccp_header_size);
dh->dccph_type = dcb->dccpd_type;
- dh->dccph_sport = inet->sport;
- dh->dccph_dport = inet->dport;
+ dh->dccph_sport = inet->inet_sport;
+ dh->dccph_dport = inet->inet_dport;
dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
dh->dccph_ccval = dcb->dccpd_ccval;
dh->dccph_cscov = dp->dccps_pcslen;
/* XXX For now we're using only 48 bits sequence numbers */
dh->dccph_x = 1;
- dp->dccps_awh = dp->dccps_gss;
+ dccp_update_gss(sk, dcb->dccpd_seq);
dccp_hdr_set_seq(dh, dp->dccps_gss);
if (set_ack)
dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
@@ -112,6 +119,11 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
case DCCP_PKT_REQUEST:
dccp_hdr_request(skb)->dccph_req_service =
dp->dccps_service;
+ /*
+ * Limit Ack window to ISS <= P.ackno <= GSS, so that
+ * only Responses to Requests we sent are considered.
+ */
+ dp->dccps_awl = dp->dccps_iss;
break;
case DCCP_PKT_RESET:
dccp_hdr_reset(skb)->dccph_reset_code =
@@ -119,22 +131,21 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
break;
}
- icsk->icsk_af_ops->send_check(sk, 0, skb);
+ icsk->icsk_af_ops->send_check(sk, skb);
if (set_ack)
dccp_event_ack_sent(sk);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
return net_xmit_eval(err);
}
return -ENOBUFS;
}
/**
- * dccp_determine_ccmps - Find out about CCID-specfic packet-size limits
+ * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
* We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
* since the RX CCID is restricted to feedback packets (Acks), which are small
* in comparison with the data traffic. A value of 0 means "no current CCMPS".
@@ -153,21 +164,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
u32 ccmps = dccp_determine_ccmps(dp);
- int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
+ u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
/* Account for header lengths and IPv4/v6 option overhead */
cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
/*
- * FIXME: this should come from the CCID infrastructure, where, say,
- * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
- * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
- * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
- * make it a multiple of 4
+ * Leave enough headroom for common DCCP header options.
+ * This only considers options which may appear on DCCP-Data packets, as
+ * per table 3 in RFC 4340, 5.8. When running out of space for other
+ * options (eg. Ack Vector which can take up to 255 bytes), it is better
+ * to schedule a separate Ack. Thus we leave headroom for the following:
+ * - 1 byte for Slow Receiver (11.6)
+ * - 6 bytes for Timestamp (13.1)
+ * - 10 bytes for Timestamp Echo (13.3)
+ * - 8 bytes for NDP count (7.7, when activated)
+ * - 6 bytes for Data Checksum (9.3)
+ * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
*/
-
- cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
+ cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+ (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
@@ -180,119 +197,197 @@ EXPORT_SYMBOL_GPL(dccp_sync_mss);
void dccp_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
- read_unlock(&sk->sk_callback_lock);
+ rcu_read_unlock();
}
/**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * dccp_wait_for_ccid - Await CCID send permission
* @sk: socket to wait for
- * @skb: current skb to pass on for waiting
- * @delay: sleep timeout in milliseconds (> 0)
- * This function is called by default when the socket is closed, and
- * when a non-zero linger time is set on the socket. For consistency
+ * @delay: timeout in jiffies
+ *
+ * This is used by CCIDs which need to delay the send time in process context.
*/
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
{
- struct dccp_sock *dp = dccp_sk(sk);
DEFINE_WAIT(wait);
- unsigned long jiffdelay;
- int rc;
+ long remaining;
- do {
- dccp_pr_debug("delayed send by %d msec\n", delay);
- jiffdelay = msecs_to_jiffies(delay);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending++;
+ release_sock(sk);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ remaining = schedule_timeout(delay);
- sk->sk_write_pending++;
- release_sock(sk);
- schedule_timeout(jiffdelay);
- lock_sock(sk);
- sk->sk_write_pending--;
+ lock_sock(sk);
+ sk->sk_write_pending--;
+ finish_wait(sk_sleep(sk), &wait);
- if (sk->sk_err)
- goto do_error;
- if (signal_pending(current))
- goto do_interrupted;
+ if (signal_pending(current) || sk->sk_err)
+ return -1;
+ return remaining;
+}
- rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- } while ((delay = rc) > 0);
-out:
- finish_wait(sk->sk_sleep, &wait);
- return rc;
-
-do_error:
- rc = -EPIPE;
- goto out;
-do_interrupted:
- rc = -EINTR;
- goto out;
+/**
+ * dccp_xmit_packet - Send data packet under control of CCID
+ * Transmits next-queued payload and informs CCID to account for the packet.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+ int err, len;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb = dccp_qpolicy_pop(sk);
+
+ if (unlikely(skb == NULL))
+ return;
+ len = skb->len;
+
+ if (sk->sk_state == DCCP_PARTOPEN) {
+ const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+ /*
+ * See 8.1.5 - Handshake Completion.
+ *
+ * For robustness we resend Confirm options until the client has
+ * entered OPEN. During the initial feature negotiation, the MPS
+ * is smaller than usual, reduced by the Change/Confirm options.
+ */
+ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+ DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+ dccp_send_ack(sk);
+ dccp_feat_list_purge(&dp->dccps_featneg);
+ }
+
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else if (dccp_ack_pending(sk)) {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
+ }
+
+ err = dccp_transmit_skb(sk, skb);
+ if (err)
+ dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+ /*
+ * Register this one as sent even if an error occurred. To the remote
+ * end a local packet drop is indistinguishable from network loss, i.e.
+ * any local drop will eventually be reported via receiver feedback.
+ */
+ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+ /*
+ * If the CCID needs to transfer additional header options out-of-band
+ * (e.g. Ack Vectors or feature-negotiation options), it activates this
+ * flag to schedule a Sync. The Sync will automatically incorporate all
+ * currently pending header options, thus clearing the backlog.
+ */
+ if (dp->dccps_sync_scheduled)
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
}
-void dccp_write_xmit(struct sock *sk, int block)
+/**
+ * dccp_flush_write_queue - Drain queue at end of connection
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
{
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
+ long delay, rc;
+
+ while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+ rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- while ((skb = skb_peek(&sk->sk_write_queue))) {
- int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- if (err > 0) {
- if (!block) {
- sk_reset_timer(sk, &dp->dccps_xmit_timer,
- msecs_to_jiffies(err)+jiffies);
- break;
- } else
- err = dccp_wait_for_ccid(sk, skb, err);
- if (err && err != -EINTR)
- DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ /*
+ * If the CCID determines when to send, the next sending
+ * time is unknown or the CCID may not even send again
+ * (e.g. remote host crashes or lost Ack packets).
+ */
+ DCCP_WARN("CCID did not manage to send all packets\n");
+ return;
+ case CCID_PACKET_DELAY:
+ delay = msecs_to_jiffies(rc);
+ if (delay > *time_budget)
+ return;
+ rc = dccp_wait_for_ccid(sk, delay);
+ if (rc < 0)
+ return;
+ *time_budget -= (delay - rc);
+ /* check again if we can send now */
+ break;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ skb_dequeue(&sk->sk_write_queue);
+ kfree_skb(skb);
+ dccp_pr_debug("packet discarded due to err=%ld\n", rc);
}
+ }
+}
+
+void dccp_write_xmit(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
- skb_dequeue(&sk->sk_write_queue);
- if (err == 0) {
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int len = skb->len;
+ while ((skb = dccp_qpolicy_top(sk))) {
+ int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- if (sk->sk_state == DCCP_PARTOPEN) {
- /* See 8.1.5. Handshake Completion */
- inet_csk_schedule_ack(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- } else if (dccp_ack_pending(sk))
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- else
- dcb->dccpd_type = DCCP_PKT_DATA;
-
- err = dccp_transmit_skb(sk, skb);
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
- if (err)
- DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
- err);
- } else {
- dccp_pr_debug("packet discarded due to err=%d\n", err);
- kfree_skb(skb);
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ return;
+ case CCID_PACKET_DELAY:
+ sk_reset_timer(sk, &dp->dccps_xmit_timer,
+ jiffies + msecs_to_jiffies(rc));
+ return;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ dccp_qpolicy_drop(sk, skb);
+ dccp_pr_debug("packet discarded due to err=%d\n", rc);
}
}
}
-int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
+ * There are only four retransmittable packet types in DCCP:
+ * - Request in client-REQUEST state (sec. 8.1.1),
+ * - CloseReq in server-CLOSEREQ state (sec. 8.3),
+ * - Close in node-CLOSING state (sec. 8.3),
+ * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()).
+ * This function expects sk->sk_send_head to contain the original skb.
+ */
+int dccp_retransmit_skb(struct sock *sk)
{
+ WARN_ON(sk->sk_send_head == NULL);
+
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
return -EHOSTUNREACH; /* Routing failure or similar. */
- return dccp_transmit_skb(sk, (skb_cloned(skb) ?
- pskb_copy(skb, GFP_ATOMIC):
- skb_clone(skb, GFP_ATOMIC)));
+ /* this count is used to distinguish original and retransmitted skb */
+ inet_csk(sk)->icsk_retransmits++;
+
+ return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
}
struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
@@ -311,30 +406,32 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, sk->sk_prot->max_header);
- skb->dst = dst_clone(dst);
+ skb_dst_set(skb, dst_clone(dst));
dreq = dccp_rsk(req);
- if (inet_rsk(req)->acked) /* increase ISS upon retransmission */
- dccp_inc_seqno(&dreq->dreq_iss);
+ if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
+ dccp_inc_seqno(&dreq->dreq_gss);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
- DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
+ DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
- if (dccp_insert_options_rsk(dreq, skb)) {
- kfree_skb(skb);
- return NULL;
- }
+ /* Resolve feature dependencies resulting from choice of CCID */
+ if (dccp_feat_server_ccid_dependencies(dreq))
+ goto response_failed;
+
+ if (dccp_insert_options_rsk(dreq, skb))
+ goto response_failed;
/* Build and checksum header */
dh = dccp_zeroed_hdr(skb, dccp_header_size);
- dh->dccph_sport = inet_sk(sk)->sport;
- dh->dccph_dport = inet_rsk(req)->rmt_port;
+ dh->dccph_sport = htons(inet_rsk(req)->ir_num);
+ dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
dh->dccph_doff = (dccp_header_size +
DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
dh->dccph_type = DCCP_PKT_RESPONSE;
dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dreq->dreq_iss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr);
+ dccp_hdr_set_seq(dh, dreq->dreq_gss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
dccp_csum_outgoing(skb);
@@ -343,12 +440,15 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
inet_rsk(req)->acked = 1;
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
return skb;
+response_failed:
+ kfree_skb(skb);
+ return NULL;
}
EXPORT_SYMBOL_GPL(dccp_make_response);
/* answer offending packet in @rcv_skb with Reset from control socket @ctl */
-struct sk_buff *dccp_ctl_make_reset(struct socket *ctl, struct sk_buff *rcv_skb)
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
{
struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
@@ -358,11 +458,11 @@ struct sk_buff *dccp_ctl_make_reset(struct socket *ctl, struct sk_buff *rcv_skb)
struct dccp_hdr_reset *dhr;
struct sk_buff *skb;
- skb = alloc_skb(ctl->sk->sk_prot->max_header, GFP_ATOMIC);
+ skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
if (skb == NULL)
return NULL;
- skb_reserve(skb, ctl->sk->sk_prot->max_header);
+ skb_reserve(skb, sk->sk_prot->max_header);
/* Swap the send and the receive. */
dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
@@ -427,8 +527,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void dccp_connect_init(struct sock *sk)
+int dccp_connect(struct sock *sk)
{
+ struct sk_buff *skb;
struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -438,31 +539,13 @@ static inline void dccp_connect_init(struct sock *sk)
dccp_sync_mss(sk, dst_mtu(dst));
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_update_gss(sk, dp->dccps_iss);
- dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
+ /* do not connect if feature negotiation setup fails */
+ if (dccp_feat_finalise_settings(dccp_sk(sk)))
+ return -EPROTO;
- /* S.GAR - greatest valid acknowledgement number received on a non-Sync;
- * initialized to S.ISS (sec. 8.5) */
+ /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
dp->dccps_gar = dp->dccps_iss;
- icsk->icsk_retransmits = 0;
-}
-
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- dccp_connect_init(sk);
-
skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
if (unlikely(skb == NULL))
return -ENOBUFS;
@@ -472,11 +555,11 @@ int dccp_connect(struct sock *sk)
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+ dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
/* Timer for repeating the REQUEST until an answer. */
+ icsk->icsk_retransmits = 0;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, DCCP_RTO_MAX);
return 0;
@@ -509,6 +592,7 @@ void dccp_send_ack(struct sock *sk)
EXPORT_SYMBOL_GPL(dccp_send_ack);
+#if 0
/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
void dccp_send_delayed_ack(struct sock *sk)
{
@@ -539,6 +623,7 @@ void dccp_send_delayed_ack(struct sock *sk)
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
+#endif
void dccp_send_sync(struct sock *sk, const u64 ackno,
const enum dccp_pkt_type pkt_type)
@@ -561,6 +646,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
+ /*
+ * Clear the flag in case the Sync was scheduled for out-of-band data,
+ * such as carrying a long Ack Vector.
+ */
+ dccp_sk(sk)->dccps_sync_scheduled = 0;
+
dccp_transmit_skb(sk, skb);
}
@@ -589,9 +680,7 @@ void dccp_send_close(struct sock *sk, const int active)
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
if (active) {
- dccp_write_xmit(sk, 1);
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, prio));
+ skb = dccp_skb_entail(sk, skb);
/*
* Retransmission timer for active-close: RFC 4340, 8.3 requires
* to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
@@ -604,6 +693,6 @@ void dccp_send_close(struct sock *sk, const int active)
*/
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
- } else
- dccp_transmit_skb(sk, skb);
+ }
+ dccp_transmit_skb(sk, skb);
}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 7053bb827bc..595ddf0459d 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/kfifo.h>
#include <linux/vmalloc.h>
+#include <linux/gfp.h>
#include <net/net_namespace.h>
#include "dccp.h"
@@ -42,67 +43,58 @@ static int bufsize = 64 * 1024;
static const char procname[] = "dccpprobe";
-struct {
- struct kfifo *fifo;
+static struct {
+ struct kfifo fifo;
spinlock_t lock;
wait_queue_head_t wait;
- struct timeval tstart;
+ struct timespec tstart;
} dccpw;
static void printl(const char *fmt, ...)
{
va_list args;
int len;
- struct timeval now;
+ struct timespec now;
char tbuf[256];
va_start(args, fmt);
- do_gettimeofday(&now);
+ getnstimeofday(&now);
- now.tv_sec -= dccpw.tstart.tv_sec;
- now.tv_usec -= dccpw.tstart.tv_usec;
- if (now.tv_usec < 0) {
- --now.tv_sec;
- now.tv_usec += 1000000;
- }
+ now = timespec_sub(now, dccpw.tstart);
len = sprintf(tbuf, "%lu.%06lu ",
(unsigned long) now.tv_sec,
- (unsigned long) now.tv_usec);
+ (unsigned long) now.tv_nsec / NSEC_PER_USEC);
len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
va_end(args);
- kfifo_put(dccpw.fifo, tbuf, len);
+ kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
wake_up(&dccpw.wait);
}
static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t size)
{
- const struct dccp_minisock *dmsk = dccp_msk(sk);
const struct inet_sock *inet = inet_sk(sk);
- const struct ccid3_hc_tx_sock *hctx;
-
- if (dmsk->dccpms_tx_ccid == DCCPC_CCID3)
- hctx = ccid3_hc_tx_sk(sk);
- else
- hctx = NULL;
-
- if (port == 0 || ntohs(inet->dport) == port ||
- ntohs(inet->sport) == port) {
- if (hctx)
- printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u "
- "%llu %llu %d\n",
- NIPQUAD(inet->saddr), ntohs(inet->sport),
- NIPQUAD(inet->daddr), ntohs(inet->dport), size,
- hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
- hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc,
- hctx->ccid3hctx_x_recv >> 6,
- hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi);
+ struct ccid3_hc_tx_sock *hc = NULL;
+
+ if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+ hc = ccid3_hc_tx_sk(sk);
+
+ if (port == 0 || ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port) {
+ if (hc)
+ printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport), size,
+ hc->tx_s, hc->tx_rtt, hc->tx_p,
+ hc->tx_x_calc, hc->tx_x_recv >> 6,
+ hc->tx_x >> 6, hc->tx_t_ipi);
else
- printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
- NIPQUAD(inet->saddr), ntohs(inet->sport),
- NIPQUAD(inet->daddr), ntohs(inet->dport), size);
+ printl("%pI4:%u %pI4:%u %d\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ size);
}
jprobe_return();
@@ -118,8 +110,8 @@ static struct jprobe dccp_send_probe = {
static int dccpprobe_open(struct inode *inode, struct file *file)
{
- kfifo_reset(dccpw.fifo);
- do_gettimeofday(&dccpw.tstart);
+ kfifo_reset(&dccpw.fifo);
+ getnstimeofday(&dccpw.tstart);
return 0;
}
@@ -140,12 +132,12 @@ static ssize_t dccpprobe_read(struct file *file, char __user *buf,
return -ENOMEM;
error = wait_event_interruptible(dccpw.wait,
- __kfifo_len(dccpw.fifo) != 0);
+ kfifo_len(&dccpw.fifo) != 0);
if (error)
goto out_free;
- cnt = kfifo_get(dccpw.fifo, tbuf, len);
- error = copy_to_user(buf, tbuf, cnt);
+ cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+ error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
out_free:
vfree(tbuf);
@@ -157,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
.owner = THIS_MODULE,
.open = dccpprobe_open,
.read = dccpprobe_read,
+ .llseek = noop_llseek,
};
static __init int dccpprobe_init(void)
@@ -165,31 +158,35 @@ static __init int dccpprobe_init(void)
init_waitqueue_head(&dccpw.wait);
spin_lock_init(&dccpw.lock);
- dccpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &dccpw.lock);
- if (IS_ERR(dccpw.fifo))
- return PTR_ERR(dccpw.fifo);
-
- if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops))
+ if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
+ return ret;
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
goto err0;
ret = register_jprobe(&dccp_send_probe);
+ if (ret) {
+ ret = request_module("dccp");
+ if (!ret)
+ ret = register_jprobe(&dccp_send_probe);
+ }
+
if (ret)
goto err1;
pr_info("DCCP watch registered (port=%d)\n", port);
return 0;
err1:
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
err0:
- kfifo_free(dccpw.fifo);
+ kfifo_free(&dccpw.fifo);
return ret;
}
module_init(dccpprobe_init);
static __exit void dccpprobe_exit(void)
{
- kfifo_free(dccpw.fifo);
- proc_net_remove(&init_net, procname);
+ kfifo_free(&dccpw.fifo);
+ remove_proc_entry(procname, init_net.proc_net);
unregister_jprobe(&dccp_send_probe);
}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e3f5d37b84b..de2c1e71930 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -20,6 +20,7 @@
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <net/checksum.h>
#include <net/inet_sock.h>
@@ -27,7 +28,6 @@
#include <net/xfrm.h>
#include <asm/ioctls.h>
-#include <asm/semaphore.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/delay.h>
@@ -41,21 +41,39 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
EXPORT_SYMBOL_GPL(dccp_statistics);
-atomic_t dccp_orphan_count = ATOMIC_INIT(0);
-
+struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);
-struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
- .lhash_lock = RW_LOCK_UNLOCKED,
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-};
-
+struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);
/* the maximum queue length for tx in packets. 0 is no limit */
int sysctl_dccp_tx_qlen __read_mostly = 5;
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state)
+{
+ static const char *const dccp_state_names[] = {
+ [DCCP_OPEN] = "OPEN",
+ [DCCP_REQUESTING] = "REQUESTING",
+ [DCCP_PARTOPEN] = "PARTOPEN",
+ [DCCP_LISTEN] = "LISTEN",
+ [DCCP_RESPOND] = "RESPOND",
+ [DCCP_CLOSING] = "CLOSING",
+ [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
+ [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
+ [DCCP_TIME_WAIT] = "TIME_WAIT",
+ [DCCP_CLOSED] = "CLOSED",
+ };
+
+ if (state >= DCCP_MAX_STATES)
+ return "INVALID STATE!";
+ else
+ return dccp_state_names[state];
+}
+#endif
+
void dccp_set_state(struct sock *sk, const int state)
{
const int oldstate = sk->sk_state;
@@ -68,6 +86,9 @@ void dccp_set_state(struct sock *sk, const int state)
case DCCP_OPEN:
if (oldstate != DCCP_OPEN)
DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+ /* Client retransmits all Confirm options until entering OPEN */
+ if (oldstate == DCCP_PARTOPEN)
+ dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
break;
case DCCP_CLOSED:
@@ -128,7 +149,7 @@ EXPORT_SYMBOL_GPL(dccp_done);
const char *dccp_packet_name(const int type)
{
- static const char *dccp_packet_names[] = {
+ static const char *const dccp_packet_names[] = {
[DCCP_PKT_REQUEST] = "REQUEST",
[DCCP_PKT_RESPONSE] = "RESPONSE",
[DCCP_PKT_DATA] = "DATA",
@@ -149,38 +170,11 @@ const char *dccp_packet_name(const int type)
EXPORT_SYMBOL_GPL(dccp_packet_name);
-const char *dccp_state_name(const int state)
-{
- static char *dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
- [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
-
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
-}
-
-EXPORT_SYMBOL_GPL(dccp_state_name);
-
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- dccp_minisock_init(&dp->dccps_minisock);
-
icsk->icsk_rto = DCCP_TIMEOUT_INIT;
icsk->icsk_syn_retries = sysctl_dccp_request_retries;
sk->sk_state = DCCP_CLOSED;
@@ -190,58 +184,22 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
dp->dccps_rate_last = jiffies;
dp->dccps_role = DCCP_ROLE_UNDEFINED;
dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
- dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
+ dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
dccp_init_xmit_timers(sk);
- /*
- * FIXME: We're hardcoding the CCID, and doing this at this point makes
- * the listening (master) sock get CCID control blocks, which is not
- * necessary, but for now, to not mess with the test userspace apps,
- * lets leave it here, later the real solution is to do this in a
- * setsockopt(CCIDs-I-want/accept). -acme
- */
- if (likely(ctl_sock_initialized)) {
- int rc = dccp_feat_init(dmsk);
-
- if (rc)
- return rc;
-
- if (dmsk->dccpms_send_ack_vector) {
- dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
- if (dp->dccps_hc_rx_ackvec == NULL)
- return -ENOMEM;
- }
- dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
- sk, GFP_KERNEL);
- dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
- sk, GFP_KERNEL);
- if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
- dp->dccps_hc_tx_ccid == NULL)) {
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- if (dmsk->dccpms_send_ack_vector) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
- return -ENOMEM;
- }
- } else {
- /* control socket doesn't need feat nego */
- INIT_LIST_HEAD(&dmsk->dccpms_pending);
- INIT_LIST_HEAD(&dmsk->dccpms_conf);
- }
-
+ INIT_LIST_HEAD(&dp->dccps_featneg);
+ /* control socket doesn't need feat nego */
+ if (likely(ctl_sock_initialized))
+ return dccp_feat_init(sk);
return 0;
}
EXPORT_SYMBOL_GPL(dccp_init_sock);
-int dccp_destroy_sock(struct sock *sk)
+void dccp_destroy_sock(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_minisock *dmsk = dccp_msk(sk);
/*
* DCCP doesn't use sk_write_queue, just sk_send_head
@@ -259,7 +217,7 @@ int dccp_destroy_sock(struct sock *sk)
kfree(dp->dccps_service_list);
dp->dccps_service_list = NULL;
- if (dmsk->dccpms_send_ack_vector) {
+ if (dp->dccps_hc_rx_ackvec != NULL) {
dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
dp->dccps_hc_rx_ackvec = NULL;
}
@@ -268,9 +226,7 @@ int dccp_destroy_sock(struct sock *sk)
dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
/* clean up feature negotiation state */
- dccp_feat_clean(dmsk);
-
- return 0;
+ dccp_feat_list_purge(&dp->dccps_featneg);
}
EXPORT_SYMBOL_GPL(dccp_destroy_sock);
@@ -280,6 +236,9 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
struct dccp_sock *dp = dccp_sk(sk);
dp->dccps_role = DCCP_ROLE_LISTEN;
+ /* do not start to listen if feature negotiation setup fails */
+ if (dccp_feat_finalise_settings(dp))
+ return -EPROTO;
return inet_csk_listen_start(sk, backlog);
}
@@ -312,13 +271,15 @@ int dccp_disconnect(struct sock *sk, int flags)
sk->sk_err = ECONNRESET;
dccp_clear_xmit_timers(sk);
+
__skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_write_queue);
if (sk->sk_send_head != NULL) {
__kfree_skb(sk->sk_send_head);
sk->sk_send_head = NULL;
}
- inet->dport = 0;
+ inet->inet_dport = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -330,7 +291,7 @@ int dccp_disconnect(struct sock *sk, int flags)
inet_csk_delack_init(sk);
__sk_dst_reset(sk);
- BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
@@ -351,7 +312,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
unsigned int mask;
struct sock *sk = sock->sk;
- poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -375,7 +336,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
mask |= POLLIN | POLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
set_bit(SOCK_ASYNC_NOSPACE,
@@ -386,7 +347,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
* wspace test but before the flags are set,
* IO signal will be lost.
*/
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ if (sk_stream_is_writeable(sk))
mask |= POLLOUT | POLLWRNORM;
}
}
@@ -433,7 +394,7 @@ out:
EXPORT_SYMBOL_GPL(dccp_ioctl);
static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_service_list *sl = NULL;
@@ -467,46 +428,88 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
return 0;
}
-/* byte 1 is feature. the rest is the preference list */
-static int dccp_setsockopt_change(struct sock *sk, int type,
- struct dccp_so_feat __user *optval)
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
{
- struct dccp_so_feat opt;
- u8 *val;
- int rc;
+ u8 *list, len;
+ int i, rc;
- if (copy_from_user(&opt, optval, sizeof(opt)))
- return -EFAULT;
+ if (cscov < 0 || cscov > 15)
+ return -EINVAL;
+ /*
+ * Populate a list of permissible values, in the range cscov...15. This
+ * is necessary since feature negotiation of single values only works if
+ * both sides incidentally choose the same value. Since the list starts
+ * lowest-value first, negotiation will pick the smallest shared value.
+ */
+ if (cscov == 0)
+ return 0;
+ len = 16 - cscov;
- val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
- if (!val)
- return -ENOMEM;
+ list = kmalloc(len, GFP_KERNEL);
+ if (list == NULL)
+ return -ENOBUFS;
- if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
- rc = -EFAULT;
- goto out_free_val;
- }
+ for (i = 0; i < len; i++)
+ list[i] = cscov++;
- rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat,
- val, opt.dccpsf_len, GFP_KERNEL);
- if (rc)
- goto out_free_val;
+ rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
-out:
+ if (rc == 0) {
+ if (rx)
+ dccp_sk(sk)->dccps_pcrlen = cscov;
+ else
+ dccp_sk(sk)->dccps_pcslen = cscov;
+ }
+ kfree(list);
return rc;
+}
+
+static int dccp_setsockopt_ccid(struct sock *sk, int type,
+ char __user *optval, unsigned int optlen)
+{
+ u8 *val;
+ int rc = 0;
+
+ if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+ return -EINVAL;
+
+ val = memdup_user(optval, optlen);
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+
+ lock_sock(sk);
+ if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
+ rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
+
+ if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
+ rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
+ release_sock(sk);
-out_free_val:
kfree(val);
- goto out;
+ return rc;
}
static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
struct dccp_sock *dp = dccp_sk(sk);
int val, err = 0;
- if (optlen < sizeof(int))
+ switch (optname) {
+ case DCCP_SOCKOPT_PACKET_SIZE:
+ DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_CHANGE_L:
+ case DCCP_SOCKOPT_CHANGE_R:
+ DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_CCID:
+ case DCCP_SOCKOPT_RX_CCID:
+ case DCCP_SOCKOPT_TX_CCID:
+ return dccp_setsockopt_ccid(sk, optname, optval, optlen);
+ }
+
+ if (optlen < (int)sizeof(int))
return -EINVAL;
if (get_user(val, (int __user *)optval))
@@ -517,58 +520,43 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
lock_sock(sk);
switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- err = 0;
- break;
- case DCCP_SOCKOPT_CHANGE_L:
- if (optlen != sizeof(struct dccp_so_feat))
- err = -EINVAL;
- else
- err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
- (struct dccp_so_feat __user *)
- optval);
- break;
- case DCCP_SOCKOPT_CHANGE_R:
- if (optlen != sizeof(struct dccp_so_feat))
- err = -EINVAL;
- else
- err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
- (struct dccp_so_feat __user *)
- optval);
- break;
case DCCP_SOCKOPT_SERVER_TIMEWAIT:
if (dp->dccps_role != DCCP_ROLE_SERVER)
err = -EOPNOTSUPP;
else
dp->dccps_server_timewait = (val != 0);
break;
- case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */
- if (val < 0 || val > 15)
+ case DCCP_SOCKOPT_SEND_CSCOV:
+ err = dccp_setsockopt_cscov(sk, val, false);
+ break;
+ case DCCP_SOCKOPT_RECV_CSCOV:
+ err = dccp_setsockopt_cscov(sk, val, true);
+ break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ if (sk->sk_state != DCCP_CLOSED)
+ err = -EISCONN;
+ else if (val < 0 || val >= DCCPQ_POLICY_MAX)
err = -EINVAL;
else
- dp->dccps_pcslen = val;
+ dp->dccps_qpolicy = val;
break;
- case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */
- if (val < 0 || val > 15)
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ if (val < 0)
err = -EINVAL;
- else {
- dp->dccps_pcrlen = val;
- /* FIXME: add feature negotiation,
- * ChangeL(MinimumChecksumCoverage, val) */
- }
+ else
+ dp->dccps_tx_qlen = val;
break;
default:
err = -ENOPROTOOPT;
break;
}
-
release_sock(sk);
+
return err;
}
int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
if (level != SOL_DCCP)
return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
@@ -581,7 +569,7 @@ EXPORT_SYMBOL_GPL(dccp_setsockopt);
#ifdef CONFIG_COMPAT
int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
if (level != SOL_DCCP)
return inet_csk_compat_setsockopt(sk, level, optname,
@@ -644,6 +632,18 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
case DCCP_SOCKOPT_GET_CUR_MPS:
val = dp->dccps_mss_cache;
break;
+ case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+ return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
+ case DCCP_SOCKOPT_TX_CCID:
+ val = ccid_get_current_tx_ccid(dp);
+ if (val < 0)
+ return -ENOPROTOOPT;
+ break;
+ case DCCP_SOCKOPT_RX_CCID:
+ val = ccid_get_current_rx_ccid(dp);
+ if (val < 0)
+ return -ENOPROTOOPT;
+ break;
case DCCP_SOCKOPT_SERVER_TIMEWAIT:
val = dp->dccps_server_timewait;
break;
@@ -653,6 +653,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
case DCCP_SOCKOPT_RECV_CSCOV:
val = dp->dccps_pcrlen;
break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ val = dp->dccps_qpolicy;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ val = dp->dccps_tx_qlen;
+ break;
case 128 ... 191:
return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
len, (u32 __user *)optval, optlen);
@@ -695,6 +701,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
#endif
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+ /*
+ * Assign an (opaque) qpolicy priority value to skb->priority.
+ *
+ * We are overloading this skb field for use with the qpolicy subystem.
+ * The skb->priority is normally used for the SO_PRIORITY option, which
+ * is initialised from sk_priority. Since the assignment of sk_priority
+ * to skb->priority happens later (on layer 3), we overload this field
+ * for use with queueing priorities as long as the skb is on layer 4.
+ * The default priority value (if nothing is set) is 0.
+ */
+ skb->priority = 0;
+
+ for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_DCCP)
+ continue;
+
+ if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+ !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+ return -EINVAL;
+
+ switch (cmsg->cmsg_type) {
+ case DCCP_SCM_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+ return -EINVAL;
+ skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -710,8 +757,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
lock_sock(sk);
- if (sysctl_dccp_tx_qlen &&
- (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
+ if (dccp_qpolicy_full(sk)) {
rc = -EAGAIN;
goto out_release;
}
@@ -739,8 +785,18 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (rc != 0)
goto out_discard;
- skb_queue_tail(&sk->sk_write_queue, skb);
- dccp_write_xmit(sk,0);
+ rc = dccp_msghdr_parse(msg, skb);
+ if (rc != 0)
+ goto out_discard;
+
+ dccp_qpolicy_push(sk, skb);
+ /*
+ * The xmit_timer is set if the TX CCID is rate-based and will expire
+ * when congestion control permits to release further packets into the
+ * network. Window-based CCIDs do not use this timer.
+ */
+ if (!timer_pending(&dp->dccps_xmit_timer))
+ dccp_write_xmit(sk);
out_release:
release_sock(sk);
return rc ? : len;
@@ -792,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
default:
dccp_pr_debug("packet_type=%s\n",
dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
}
verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
@@ -845,9 +901,11 @@ verify_sock_status:
len = -EFAULT;
break;
}
+ if (flags & MSG_TRUNC)
+ len = skb->len;
found_fin_ok:
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
break;
} while (1);
out:
@@ -956,23 +1014,35 @@ void dccp_close(struct sock *sk, long timeout)
if (data_was_unread) {
/* Unread data was tossed, send an appropriate Reset Code */
- DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
+ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
dccp_set_state(sk, DCCP_CLOSED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
} else if (sk->sk_state != DCCP_CLOSED) {
+ /*
+ * Normal connection termination. May need to wait if there are
+ * still packets in the TX queue that are delayed by the CCID.
+ */
+ dccp_flush_write_queue(sk, &timeout);
dccp_terminate_connection(sk);
}
+ /*
+ * Flush write queue. This may be necessary in several cases:
+ * - we have been closed by the peer but still have application data;
+ * - abortive termination (unread data or zero linger time),
+ * - normal termination but queue could not be flushed within time limit
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
sk_stream_wait_close(sk, timeout);
adjudge_to_death:
state = sk->sk_state;
sock_hold(sk);
sock_orphan(sk);
- atomic_inc(sk->sk_prot->orphan_count);
/*
* It is the last release_sock in its life. It will remove backlog.
@@ -984,7 +1054,9 @@ adjudge_to_death:
*/
local_bh_disable();
bh_lock_sock(sk);
- BUG_TRAP(!sock_owned_by_user(sk));
+ WARN_ON(sock_owned_by_user(sk));
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
/* Have we already been destroyed by a softirq or backlog? */
if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
@@ -1010,33 +1082,17 @@ void dccp_shutdown(struct sock *sk, int how)
EXPORT_SYMBOL_GPL(dccp_shutdown);
-static int __init dccp_mib_init(void)
+static inline int dccp_mib_init(void)
{
- int rc = -ENOMEM;
-
- dccp_statistics[0] = alloc_percpu(struct dccp_mib);
- if (dccp_statistics[0] == NULL)
- goto out;
-
- dccp_statistics[1] = alloc_percpu(struct dccp_mib);
- if (dccp_statistics[1] == NULL)
- goto out_free_one;
-
- rc = 0;
-out:
- return rc;
-out_free_one:
- free_percpu(dccp_statistics[0]);
- dccp_statistics[0] = NULL;
- goto out;
-
+ dccp_statistics = alloc_percpu(struct dccp_mib);
+ if (!dccp_statistics)
+ return -ENOMEM;
+ return 0;
}
-static void dccp_mib_exit(void)
+static inline void dccp_mib_exit(void)
{
- free_percpu(dccp_statistics[0]);
- free_percpu(dccp_statistics[1]);
- dccp_statistics[0] = dccp_statistics[1] = NULL;
+ free_percpu(dccp_statistics);
}
static int thash_entries;
@@ -1044,8 +1100,8 @@ module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
#ifdef CONFIG_IP_DCCP_DEBUG
-int dccp_debug;
-module_param(dccp_debug, bool, 0444);
+bool dccp_debug;
+module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
EXPORT_SYMBOL_GPL(dccp_debug);
@@ -1055,14 +1111,21 @@ static int __init dccp_init(void)
{
unsigned long goal;
int ehash_order, bhash_order, i;
- int rc = -ENOBUFS;
+ int rc;
+ BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
+ rc = percpu_counter_init(&dccp_orphan_count, 0);
+ if (rc)
+ goto out_fail;
+ rc = -ENOBUFS;
+ inet_hashinfo_init(&dccp_hashinfo);
dccp_hashinfo.bind_bucket_cachep =
kmem_cache_create("dccp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!dccp_hashinfo.bind_bucket_cachep)
- goto out;
+ goto out_free_percpu;
/*
* Size and allocate the main established and bind bucket
@@ -1070,10 +1133,10 @@ static int __init dccp_init(void)
*
* The methodology is similar to that of the buffer cache.
*/
- if (num_physpages >= (128 * 1024))
- goal = num_physpages >> (21 - PAGE_SHIFT);
+ if (totalram_pages >= (128 * 1024))
+ goal = totalram_pages >> (21 - PAGE_SHIFT);
else
- goal = num_physpages >> (23 - PAGE_SHIFT);
+ goal = totalram_pages >> (23 - PAGE_SHIFT);
if (thash_entries)
goal = (thash_entries *
@@ -1081,13 +1144,14 @@ static int __init dccp_init(void)
for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
;
do {
- dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
+ unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
sizeof(struct inet_ehash_bucket);
- while (dccp_hashinfo.ehash_size &
- (dccp_hashinfo.ehash_size - 1))
- dccp_hashinfo.ehash_size--;
+
+ while (hash_size & (hash_size - 1))
+ hash_size--;
+ dccp_hashinfo.ehash_mask = hash_size - 1;
dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
- __get_free_pages(GFP_ATOMIC, ehash_order);
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
} while (!dccp_hashinfo.ehash && --ehash_order > 0);
if (!dccp_hashinfo.ehash) {
@@ -1095,10 +1159,8 @@ static int __init dccp_init(void)
goto out_free_bind_bucket_cachep;
}
- for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
- INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
- INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
- }
+ for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
+ INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
if (inet_ehash_locks_alloc(&dccp_hashinfo))
goto out_free_dccp_ehash;
@@ -1112,7 +1174,7 @@ static int __init dccp_init(void)
bhash_order > 0)
continue;
dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC, bhash_order);
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
} while (!dccp_hashinfo.bhash && --bhash_order >= 0);
if (!dccp_hashinfo.bhash) {
@@ -1137,40 +1199,52 @@ static int __init dccp_init(void)
if (rc)
goto out_ackvec_exit;
+ rc = ccid_initialize_builtins();
+ if (rc)
+ goto out_sysctl_exit;
+
dccp_timestamping_init();
-out:
- return rc;
+
+ return 0;
+
+out_sysctl_exit:
+ dccp_sysctl_exit();
out_ackvec_exit:
dccp_ackvec_exit();
out_free_dccp_mib:
dccp_mib_exit();
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
- dccp_hashinfo.bhash = NULL;
out_free_dccp_locks:
inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
- dccp_hashinfo.ehash = NULL;
out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+out_free_percpu:
+ percpu_counter_destroy(&dccp_orphan_count);
+out_fail:
+ dccp_hashinfo.bhash = NULL;
+ dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL;
- goto out;
+ return rc;
}
static void __exit dccp_fini(void)
{
+ ccid_cleanup_builtins();
dccp_mib_exit();
free_pages((unsigned long)dccp_hashinfo.bhash,
get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket)));
free_pages((unsigned long)dccp_hashinfo.ehash,
- get_order(dccp_hashinfo.ehash_size *
+ get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket)));
inet_ehash_locks_free(&dccp_hashinfo);
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
dccp_ackvec_exit();
dccp_sysctl_exit();
+ percpu_counter_destroy(&dccp_orphan_count);
}
module_init(dccp_init);
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 00000000000..63c30bfa470
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ * net/dccp/qpolicy.c
+ *
+ * Policy-based packet dequeueing interface for DCCP.
+ *
+ * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ * Simple Dequeueing Policy:
+ * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+ skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_tx_qlen &&
+ sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+ return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ * Priority-based Dequeueing Policy:
+ * If tx_qlen is different from 0 and the queue has reached its upper bound
+ * of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *best = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (best == NULL || skb->priority > best->priority)
+ best = skb;
+ return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *worst = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (worst == NULL || skb->priority < worst->priority)
+ worst = skb;
+ return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+ if (qpolicy_simple_full(sk))
+ dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+ return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top: peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+ void (*push) (struct sock *sk, struct sk_buff *skb);
+ bool (*full) (struct sock *sk);
+ struct sk_buff* (*top) (struct sock *sk);
+ __be32 params;
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+ [DCCPQ_POLICY_SIMPLE] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_simple_full,
+ .top = qpolicy_simple_top,
+ .params = 0,
+ },
+ [DCCPQ_POLICY_PRIO] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_prio_full,
+ .top = qpolicy_prio_best_skb,
+ .params = DCCP_SCM_PRIORITY,
+ },
+};
+
+/*
+ * Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+ qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb != NULL) {
+ skb_unlink(skb, &sk->sk_write_queue);
+ kfree_skb(skb);
+ }
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+ struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+ if (skb != NULL) {
+ /* Clear any skb fields that we used internally */
+ skb->priority = 0;
+ skb_unlink(skb, &sk->sk_write_queue);
+ }
+ return skb;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+ /* check if exactly one bit is set */
+ if (!param || (param & (param - 1)))
+ return false;
+ return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 21295993fdb..53731e45403 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -18,76 +18,75 @@
#error This file should not be compiled without CONFIG_SYSCTL defined
#endif
+/* Boundary values */
+static int zero = 0,
+ one = 1,
+ u8_max = 0xFF;
+static unsigned long seqw_min = DCCPF_SEQ_WMIN,
+ seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
+
static struct ctl_table dccp_default_table[] = {
{
.procname = "seq_window",
- .data = &sysctl_dccp_feat_sequence_window,
- .maxlen = sizeof(sysctl_dccp_feat_sequence_window),
+ .data = &sysctl_dccp_sequence_window,
+ .maxlen = sizeof(sysctl_dccp_sequence_window),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
+ .extra2 = &seqw_max,
},
{
.procname = "rx_ccid",
- .data = &sysctl_dccp_feat_rx_ccid,
- .maxlen = sizeof(sysctl_dccp_feat_rx_ccid),
+ .data = &sysctl_dccp_rx_ccid,
+ .maxlen = sizeof(sysctl_dccp_rx_ccid),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max, /* RFC 4340, 10. */
},
{
.procname = "tx_ccid",
- .data = &sysctl_dccp_feat_tx_ccid,
- .maxlen = sizeof(sysctl_dccp_feat_tx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "ack_ratio",
- .data = &sysctl_dccp_feat_ack_ratio,
- .maxlen = sizeof(sysctl_dccp_feat_ack_ratio),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "send_ackvec",
- .data = &sysctl_dccp_feat_send_ack_vector,
- .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "send_ndp",
- .data = &sysctl_dccp_feat_send_ndp_count,
- .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count),
+ .data = &sysctl_dccp_tx_ccid,
+ .maxlen = sizeof(sysctl_dccp_tx_ccid),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max, /* RFC 4340, 10. */
},
{
.procname = "request_retries",
.data = &sysctl_dccp_request_retries,
.maxlen = sizeof(sysctl_dccp_request_retries),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &u8_max,
},
{
.procname = "retries1",
.data = &sysctl_dccp_retries1,
.maxlen = sizeof(sysctl_dccp_retries1),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max,
},
{
.procname = "retries2",
.data = &sysctl_dccp_retries2,
.maxlen = sizeof(sysctl_dccp_retries2),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max,
},
{
.procname = "tx_qlen",
.data = &sysctl_dccp_tx_qlen,
.maxlen = sizeof(sysctl_dccp_tx_qlen),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "sync_ratelimit",
@@ -97,13 +96,6 @@ static struct ctl_table dccp_default_table[] = {
.proc_handler = proc_dointvec_ms_jiffies,
},
- { .ctl_name = 0, }
-};
-
-static struct ctl_path dccp_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "dccp", .ctl_name = NET_DCCP, },
- { .procname = "default", .ctl_name = NET_DCCP_DEFAULT, },
{ }
};
@@ -111,7 +103,7 @@ static struct ctl_table_header *dccp_table_header;
int __init dccp_sysctl_init(void)
{
- dccp_table_header = register_sysctl_paths(dccp_path,
+ dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
dccp_default_table);
return dccp_table_header != NULL ? 0 : -ENOMEM;
@@ -120,7 +112,7 @@ int __init dccp_sysctl_init(void)
void dccp_sysctl_exit(void)
{
if (dccp_table_header != NULL) {
- unregister_sysctl_table(dccp_table_header);
+ unregister_net_sysctl_table(dccp_table_header);
dccp_table_header = NULL;
}
}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 8703a792b56..1cd46a345cb 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -12,6 +12,7 @@
#include <linux/dccp.h>
#include <linux/skbuff.h>
+#include <linux/export.h>
#include "dccp.h"
@@ -38,7 +39,7 @@ static int dccp_write_timeout(struct sock *sk)
if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
if (icsk->icsk_retransmits != 0)
- dst_negative_advice(&sk->sk_dst_cache);
+ dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ?
: sysctl_dccp_request_retries;
} else {
@@ -63,7 +64,7 @@ static int dccp_write_timeout(struct sock *sk)
Golden words :-).
*/
- dst_negative_advice(&sk->sk_dst_cache);
+ dst_negative_advice(sk);
}
retry_until = sysctl_dccp_retries2;
@@ -87,33 +88,12 @@ static void dccp_retransmit_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- /* retransmit timer is used for feature negotiation throughout
- * connection. In this case, no packet is re-transmitted, but rather an
- * ack is generated and pending changes are placed into its options.
- */
- if (sk->sk_send_head == NULL) {
- dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
- if (sk->sk_state == DCCP_OPEN)
- dccp_send_ack(sk);
- goto backoff;
- }
-
- /*
- * sk->sk_send_head has to have one skb with
- * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
- * packet types. The only packets eligible for retransmission are:
- * -- Requests in client-REQUEST state (sec. 8.1.1)
- * -- Acks in client-PARTOPEN state (sec. 8.1.5)
- * -- CloseReq in server-CLOSEREQ state (sec. 8.3)
- * -- Close in node-CLOSING state (sec. 8.3) */
- BUG_TRAP(sk->sk_send_head != NULL);
-
/*
* More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
* sent, no need to retransmit, this sock is dead.
*/
if (dccp_write_timeout(sk))
- goto out;
+ return;
/*
* We want to know the number of packets retransmitted, not the
@@ -122,30 +102,27 @@ static void dccp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0)
DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
- if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
+ if (dccp_retransmit_skb(sk) != 0) {
/*
* Retransmission failed because of local congestion,
* do not backoff.
*/
- if (icsk->icsk_retransmits == 0)
+ if (--icsk->icsk_retransmits == 0)
icsk->icsk_retransmits = 1;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
min(icsk->icsk_rto,
TCP_RESOURCE_PROBE_INTERVAL),
DCCP_RTO_MAX);
- goto out;
+ return;
}
-backoff:
icsk->icsk_backoff++;
- icsk->icsk_retransmits++;
icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
DCCP_RTO_MAX);
if (icsk->icsk_retransmits > sysctl_dccp_retries1)
__sk_dst_reset(sk);
-out:;
}
static void dccp_write_timer(unsigned long data)
@@ -224,7 +201,7 @@ static void dccp_delack_timer(unsigned long data)
if (sock_owned_by_user(sk)) {
/* Try again later. */
icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
sk_reset_timer(sk, &icsk->icsk_delack_timer,
jiffies + TCP_DELACK_MIN);
goto out;
@@ -254,39 +231,42 @@ static void dccp_delack_timer(unsigned long data)
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
dccp_send_ack(sk);
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
out:
bh_unlock_sock(sk);
sock_put(sk);
}
-/* Transmit-delay timer: used by the CCIDs to delay actual send time */
-static void dccp_write_xmit_timer(unsigned long data)
+/**
+ * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
+ * See the comments above %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
{
struct sock *sk = (struct sock *)data;
- struct dccp_sock *dp = dccp_sk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
+ sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
else
- dccp_write_xmit(sk, 0);
+ dccp_write_xmit(sk);
bh_unlock_sock(sk);
- sock_put(sk);
}
-static void dccp_init_write_xmit_timer(struct sock *sk)
+static void dccp_write_xmit_timer(unsigned long data)
{
- struct dccp_sock *dp = dccp_sk(sk);
-
- setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
- (unsigned long)sk);
+ dccp_write_xmitlet(data);
+ sock_put((struct sock *)data);
}
void dccp_init_xmit_timers(struct sock *sk)
{
- dccp_init_write_xmit_timer(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+ setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+ (unsigned long)sk);
inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
&dccp_keepalive_timer);
}
@@ -300,7 +280,7 @@ static ktime_t dccp_timestamp_seed;
*/
u32 dccp_timestamp(void)
{
- s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
+ u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
do_div(delta, 10);
return delta;